diff --git a/backend/danswer/connectors/mediawiki/family.py b/backend/danswer/connectors/mediawiki/family.py index 0d9530667..163bca2ef 100644 --- a/backend/danswer/connectors/mediawiki/family.py +++ b/backend/danswer/connectors/mediawiki/family.py @@ -45,8 +45,7 @@ class FamilyFileGeneratorInMemory(generate_family_file.FamilyFileGenerator): if any(x not in generate_family_file.NAME_CHARACTERS for x in name): raise ValueError( - 'ERROR: Name of family "{}" must be ASCII letters and digits [a-zA-Z0-9]', - name, + f'ERROR: Name of family "{name}" must be ASCII letters and digits [a-zA-Z0-9]', ) if isinstance(dointerwiki, bool): diff --git a/backend/danswer/connectors/mediawiki/wiki.py b/backend/danswer/connectors/mediawiki/wiki.py index f4ec1e023..a3de96398 100644 --- a/backend/danswer/connectors/mediawiki/wiki.py +++ b/backend/danswer/connectors/mediawiki/wiki.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime import itertools from collections.abc import Generator +from collections.abc import Iterator from typing import Any from typing import ClassVar @@ -19,6 +20,9 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.mediawiki.family import family_class_dispatch from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.utils.logger import setup_logger + +logger = setup_logger() def pywikibot_timestamp_to_utc_datetime( @@ -74,7 +78,7 @@ def get_doc_from_page( sections=sections, semantic_identifier=page.title(), metadata={"categories": [category.title() for category in page.categories()]}, - id=page.pageid, + id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}", ) @@ -117,13 +121,18 @@ class MediaWikiConnector(LoadConnector, PollConnector): # short names can only have ascii letters and digits - self.family = family_class_dispatch(hostname, "Wikipedia Connector")() + self.family = family_class_dispatch(hostname, "WikipediaConnector")() self.site = pywikibot.Site(fam=self.family, code=language_code) self.categories = [ pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}") for category in categories ] - self.pages = [pywikibot.Page(self.site, page) for page in pages] + + self.pages = [] + for page in pages: + if not page: + continue + self.pages.append(pywikibot.Page(self.site, page)) def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: """Load credentials for a MediaWiki site. @@ -169,8 +178,13 @@ class MediaWikiConnector(LoadConnector, PollConnector): ] # Since we can specify both individual pages and categories, we need to iterate over all of them. - all_pages = itertools.chain(self.pages, *category_pages) + all_pages: Iterator[pywikibot.Page] = itertools.chain( + self.pages, *category_pages + ) for page in all_pages: + logger.info( + f"MediaWikiConnector: title='{page.title()}' url={page.full_url()}" + ) doc_batch.append( get_doc_from_page(page, self.site, self.document_source_type) ) diff --git a/backend/tests/unit/danswer/connectors/mediawiki/test_wiki.py b/backend/tests/unit/danswer/connectors/mediawiki/test_wiki.py index 2a2c841a4..9aaacfc1e 100644 --- a/backend/tests/unit/danswer/connectors/mediawiki/test_wiki.py +++ b/backend/tests/unit/danswer/connectors/mediawiki/test_wiki.py @@ -100,7 +100,7 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None: assert doc.metadata == { "categories": [category.title() for category in test_page.categories()] } - assert doc.id == test_page.pageid + assert doc.id == f"MEDIAWIKI_{test_page.pageid}_{test_page.full_url()}" def test_mediawiki_connector_recurse_depth() -> None: