mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-03 09:28:25 +02:00
Bugfix/mediawiki (#2800)
* fix formatting
* fix poorly structured doc id, fix empty page id, fix family_class_dispatch invalid name (no spaces), fix setting id with int pageid
* fix mediawiki test
This commit is contained in:
parent
f8a7749b46
commit
dee197570d
@ -45,8 +45,7 @@ class FamilyFileGeneratorInMemory(generate_family_file.FamilyFileGenerator):
|
||||
|
||||
if any(x not in generate_family_file.NAME_CHARACTERS for x in name):
|
||||
raise ValueError(
|
||||
'ERROR: Name of family "{}" must be ASCII letters and digits [a-zA-Z0-9]',
|
||||
name,
|
||||
f'ERROR: Name of family "{name}" must be ASCII letters and digits [a-zA-Z0-9]',
|
||||
)
|
||||
|
||||
if isinstance(dointerwiki, bool):
|
||||
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import datetime
|
||||
import itertools
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Iterator
|
||||
from typing import Any
|
||||
from typing import ClassVar
|
||||
|
||||
@ -19,6 +20,9 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.mediawiki.family import family_class_dispatch
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def pywikibot_timestamp_to_utc_datetime(
|
||||
@ -74,7 +78,7 @@ def get_doc_from_page(
|
||||
sections=sections,
|
||||
semantic_identifier=page.title(),
|
||||
metadata={"categories": [category.title() for category in page.categories()]},
|
||||
id=page.pageid,
|
||||
id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}",
|
||||
)
|
||||
|
||||
|
||||
@ -117,13 +121,18 @@ class MediaWikiConnector(LoadConnector, PollConnector):
|
||||
|
||||
# short names can only have ascii letters and digits
|
||||
|
||||
self.family = family_class_dispatch(hostname, "Wikipedia Connector")()
|
||||
self.family = family_class_dispatch(hostname, "WikipediaConnector")()
|
||||
self.site = pywikibot.Site(fam=self.family, code=language_code)
|
||||
self.categories = [
|
||||
pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}")
|
||||
for category in categories
|
||||
]
|
||||
self.pages = [pywikibot.Page(self.site, page) for page in pages]
|
||||
|
||||
self.pages = []
|
||||
for page in pages:
|
||||
if not page:
|
||||
continue
|
||||
self.pages.append(pywikibot.Page(self.site, page))
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
"""Load credentials for a MediaWiki site.
|
||||
@ -169,8 +178,13 @@ class MediaWikiConnector(LoadConnector, PollConnector):
|
||||
]
|
||||
|
||||
# Since we can specify both individual pages and categories, we need to iterate over all of them.
|
||||
all_pages = itertools.chain(self.pages, *category_pages)
|
||||
all_pages: Iterator[pywikibot.Page] = itertools.chain(
|
||||
self.pages, *category_pages
|
||||
)
|
||||
for page in all_pages:
|
||||
logger.info(
|
||||
f"MediaWikiConnector: title='{page.title()}' url={page.full_url()}"
|
||||
)
|
||||
doc_batch.append(
|
||||
get_doc_from_page(page, self.site, self.document_source_type)
|
||||
)
|
||||
|
@ -100,7 +100,7 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None:
|
||||
assert doc.metadata == {
|
||||
"categories": [category.title() for category in test_page.categories()]
|
||||
}
|
||||
assert doc.id == test_page.pageid
|
||||
assert doc.id == f"MEDIAWIKI_{test_page.pageid}_{test_page.full_url()}"
|
||||
|
||||
|
||||
def test_mediawiki_connector_recurse_depth() -> None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user