Bugfix/mediawiki (#2800)

* fix formatting

* fix poorly structured doc id; fix empty page id handling; fix invalid family_class_dispatch name (spaces are not allowed); fix setting the document id from an int pageid

* fix mediawiki test
Authored by rkuo-danswer on 2024-10-14 15:48:06 -07:00; committed via GitHub.
parent f8a7749b46
commit dee197570d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 20 additions and 7 deletions

View File

@ -45,8 +45,7 @@ class FamilyFileGeneratorInMemory(generate_family_file.FamilyFileGenerator):
if any(x not in generate_family_file.NAME_CHARACTERS for x in name):
raise ValueError(
'ERROR: Name of family "{}" must be ASCII letters and digits [a-zA-Z0-9]',
name,
f'ERROR: Name of family "{name}" must be ASCII letters and digits [a-zA-Z0-9]',
)
if isinstance(dointerwiki, bool):

View File

@ -3,6 +3,7 @@ from __future__ import annotations
import datetime
import itertools
from collections.abc import Generator
from collections.abc import Iterator
from typing import Any
from typing import ClassVar
@ -19,6 +20,9 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.mediawiki.family import family_class_dispatch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
def pywikibot_timestamp_to_utc_datetime(
@ -74,7 +78,7 @@ def get_doc_from_page(
sections=sections,
semantic_identifier=page.title(),
metadata={"categories": [category.title() for category in page.categories()]},
id=page.pageid,
id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}",
)
@ -117,13 +121,18 @@ class MediaWikiConnector(LoadConnector, PollConnector):
# short names can only have ascii letters and digits
self.family = family_class_dispatch(hostname, "Wikipedia Connector")()
self.family = family_class_dispatch(hostname, "WikipediaConnector")()
self.site = pywikibot.Site(fam=self.family, code=language_code)
self.categories = [
pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}")
for category in categories
]
self.pages = [pywikibot.Page(self.site, page) for page in pages]
self.pages = []
for page in pages:
if not page:
continue
self.pages.append(pywikibot.Page(self.site, page))
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
"""Load credentials for a MediaWiki site.
@ -169,8 +178,13 @@ class MediaWikiConnector(LoadConnector, PollConnector):
]
# Since we can specify both individual pages and categories, we need to iterate over all of them.
all_pages = itertools.chain(self.pages, *category_pages)
all_pages: Iterator[pywikibot.Page] = itertools.chain(
self.pages, *category_pages
)
for page in all_pages:
logger.info(
f"MediaWikiConnector: title='{page.title()}' url={page.full_url()}"
)
doc_batch.append(
get_doc_from_page(page, self.site, self.document_source_type)
)

View File

@ -100,7 +100,7 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None:
assert doc.metadata == {
"categories": [category.title() for category in test_page.categories()]
}
assert doc.id == test_page.pageid
assert doc.id == f"MEDIAWIKI_{test_page.pageid}_{test_page.full_url()}"
def test_mediawiki_connector_recurse_depth() -> None: