mirror of https://github.com/danswer-ai/danswer.git (synced 2025-08-02 21:22:51 +02:00)
Add MediaWiki and Wikipedia Connectors (#1250)
* Add MediaWikiConnector first draft
* Add MediaWikiConnector sections for each document
* Add MediaWikiConnector to constants and factory
* Integrate MediaWikiConnector with connectors page
* Unit tests + bug fixes
* Allow adding multiple MediaWikiConnectors
* Add Wikipedia connector
* Add Wikipedia connector to factory
* Improve docstrings of MediaWiki connector backend
* Move Wikipedia and MediaWiki icon locations in admin page
* Undo accidental commit of modified docker compose yaml
@@ -96,6 +96,8 @@ class DocumentSource(str, Enum):
     SHAREPOINT = "sharepoint"
     DISCOURSE = "discourse"
     AXERO = "axero"
+    MEDIAWIKI = "mediawiki"
+    WIKIPEDIA = "wikipedia"


 class DocumentIndexType(str, Enum):
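Because DocumentSource is a str-backed Enum, the two new members round-trip directly from the plain strings used elsewhere in this diff (e.g. source="mediawiki" in the web forms below). A minimal self-contained sketch of that standard Python behavior:

import enum

class DocumentSource(str, enum.Enum):  # trimmed to the members added in this diff
    MEDIAWIKI = "mediawiki"
    WIKIPEDIA = "wikipedia"

# The string stored in configs/DB maps back to the enum member and vice versa.
assert DocumentSource("mediawiki") is DocumentSource.MEDIAWIKI
assert DocumentSource.WIKIPEDIA.value == "wikipedia"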
@@ -23,6 +23,7 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.linear.connector import LinearConnector
 from danswer.connectors.loopio.connector import LoopioConnector
+from danswer.connectors.mediawiki.wiki import MediaWikiConnector
 from danswer.connectors.models import InputType
 from danswer.connectors.notion.connector import NotionConnector
 from danswer.connectors.productboard.connector import ProductboardConnector
@@ -32,6 +33,7 @@ from danswer.connectors.slab.connector import SlabConnector
 from danswer.connectors.slack.connector import SlackPollConnector
 from danswer.connectors.slack.load_connector import SlackLoadConnector
 from danswer.connectors.web.connector import WebConnector
+from danswer.connectors.wikipedia.connector import WikipediaConnector
 from danswer.connectors.zendesk.connector import ZendeskConnector
 from danswer.connectors.zulip.connector import ZulipConnector

@@ -74,6 +76,8 @@ def identify_connector_class(
         DocumentSource.SHAREPOINT: SharepointConnector,
         DocumentSource.DISCOURSE: DiscourseConnector,
         DocumentSource.AXERO: AxeroConnector,
+        DocumentSource.MEDIAWIKI: MediaWikiConnector,
+        DocumentSource.WIKIPEDIA: WikipediaConnector,
     }
     connector_by_source = connector_map.get(source, {})

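For context, a rough sketch of how this registration is consumed. The exact identify_connector_class signature is an assumption based on the imports shown in this diff, not verified against the full factory module:

from danswer.configs.constants import DocumentSource
from danswer.connectors.factory import identify_connector_class

# Look up the connector class registered for the new source type.
connector_cls = identify_connector_class(DocumentSource.MEDIAWIKI)

# Instantiate it with the constructor arguments defined in wiki.py below;
# the hostname is an arbitrary illustrative MediaWiki site.
connector = connector_cls(
    hostname="wiki.factorio.com",
    categories=[],
    pages=["Main Page"],
    recurse_depth=0,
    connector_name="factoriowiki",
)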
backend/danswer/connectors/mediawiki/__init__.py (new file, 0 lines)
backend/danswer/connectors/mediawiki/family.py (new file, 166 lines)
@@ -0,0 +1,166 @@
from __future__ import annotations

import builtins
import functools
import itertools
from typing import Any
from unittest import mock
from urllib.parse import urlparse
from urllib.parse import urlunparse

from pywikibot import family  # type: ignore[import-untyped]
from pywikibot import pagegenerators
from pywikibot.scripts import generate_family_file  # type: ignore[import-untyped]
from pywikibot.scripts.generate_user_files import pywikibot  # type: ignore[import-untyped]

from danswer.utils.logger import setup_logger


logger = setup_logger()


@mock.patch.object(
    builtins, "print", lambda *args: logger.info("\t".join(map(str, args)))
)
class FamilyFileGeneratorInMemory(generate_family_file.FamilyFileGenerator):
    """A subclass of FamilyFileGenerator that writes the family file to memory instead of to disk."""

    def __init__(
        self,
        url: str,
        name: str,
        dointerwiki: str | bool = True,
        verify: str | bool = True,
    ):
        """Initialize the FamilyFileGeneratorInMemory."""

        url_parse = urlparse(url, "https")
        if not url_parse.netloc and url_parse.path:
            url = urlunparse(
                (url_parse.scheme, url_parse.path, url_parse.netloc, *url_parse[3:])
            )
        else:
            url = urlunparse(url_parse)
        assert isinstance(url, str)

        if any(x not in generate_family_file.NAME_CHARACTERS for x in name):
            raise ValueError(
                'ERROR: Name of family "{}" must be ASCII letters and digits [a-zA-Z0-9]',
                name,
            )

        if isinstance(dointerwiki, bool):
            dointerwiki = "Y" if dointerwiki else "N"
        assert isinstance(dointerwiki, str)

        if isinstance(verify, bool):
            verify = "Y" if verify else "N"
        assert isinstance(verify, str)

        super().__init__(url, name, dointerwiki, verify)
        self.family_definition: type[family.Family] | None = None

    def get_params(self) -> bool:
        """Get the parameters for the family class definition.

        This override prevents the method from prompting the user for input (which would be impossible in this context).
        We do all the input validation in the constructor.
        """
        return True

    def writefile(self, verify: Any) -> None:
        """Write the family file.

        This overrides the method in the parent class to write the family definition to memory instead of to disk.

        Args:
            verify: unused argument necessary to match the signature of the method in the parent class.
        """
        code_hostname_pairs = {
            f"{k}": f"{urlparse(w.server).netloc}" for k, w in self.wikis.items()
        }

        code_path_pairs = {f"{k}": f"{w.scriptpath}" for k, w in self.wikis.items()}

        code_protocol_pairs = {
            f"{k}": f"{urlparse(w.server).scheme}" for k, w in self.wikis.items()
        }

        class Family(family.Family):  # noqa: D101
            """The family definition for the wiki."""

            name = "%(name)s"
            langs = code_hostname_pairs

            def scriptpath(self, code: str) -> str:
                return code_path_pairs[code]

            def protocol(self, code: str) -> str:
                return code_protocol_pairs[code]

        self.family_definition = Family


@functools.lru_cache(maxsize=None)
def generate_family_class(url: str, name: str) -> type[family.Family]:
    """Generate a family file for a given URL and name.

    Args:
        url: The URL of the wiki.
        name: The short name of the wiki (customizable by the user).

    Returns:
        The family definition.

    Raises:
        ValueError: If the family definition was not generated.
    """
    generator = FamilyFileGeneratorInMemory(url, name, "Y", "Y")
    generator.run()
    if generator.family_definition is None:
        raise ValueError("Family definition was not generated.")
    return generator.family_definition


def family_class_dispatch(url: str, name: str) -> type[family.Family]:
    """Find or generate a family class for a given URL and name.

    Args:
        url: The URL of the wiki.
        name: The short name of the wiki (customizable by the user).

    """
    if "wikipedia" in url:
        import pywikibot.families.wikipedia_family

        return pywikibot.families.wikipedia_family.Family
    # TODO: Support additional families pre-defined in `pywikibot.families.*_family.py` files
    return generate_family_class(url, name)


if __name__ == "__main__":
    url = "fallout.fandom.com/wiki/Fallout_Wiki"
    name = "falloutfandom"

    categories: list[str] = []
    pages = ["Fallout: New Vegas"]
    recursion_depth = 1
    family_type = generate_family_class(url, name)

    site = pywikibot.Site(fam=family_type(), code="en")
    categories = [
        pywikibot.Category(site, f"Category:{category.replace(' ', '_')}")
        for category in categories
    ]
    pages = [pywikibot.Page(site, page) for page in pages]
    all_pages = itertools.chain(
        pages,
        *[
            pagegenerators.CategorizedPageGenerator(category, recurse=recursion_depth)
            for category in categories
        ],
    )
    for page in all_pages:
        print(page.title())
        print(page.text[:1000])
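Two properties of this module are worth calling out: the Wikipedia short-circuit in family_class_dispatch avoids generating anything, and functools.lru_cache memoizes generated families per (url, name) pair. A small sketch (assumes pywikibot is installed; the non-Wikipedia call does hit the network):

from danswer.connectors.mediawiki.family import (
    family_class_dispatch,
    generate_family_class,
)

# Built-in family: resolved to pywikibot's bundled Wikipedia family,
# no network traffic required.
wikipedia_family = family_class_dispatch("de.wikipedia.org", "wikipedia")

# Generated family: built once by running the generator, then served
# from the lru_cache on subsequent calls.
fam_a = generate_family_class("https://wiki.factorio.com/", "factoriowiki")
fam_b = generate_family_class("https://wiki.factorio.com/", "factoriowiki")
assert fam_a is fam_b  # identical cached class object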
backend/danswer/connectors/mediawiki/wiki.py (new file, 225 lines)
@@ -0,0 +1,225 @@
from __future__ import annotations

import datetime
import itertools
from collections.abc import Generator
from typing import Any
from typing import ClassVar

import pywikibot.time
from pywikibot import pagegenerators
from pywikibot import textlib

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.mediawiki.family import family_class_dispatch
from danswer.connectors.models import Document
from danswer.connectors.models import Section


def pywikibot_timestamp_to_utc_datetime(
    timestamp: pywikibot.time.Timestamp,
) -> datetime.datetime:
    """Convert a pywikibot timestamp to a datetime object in UTC.

    Args:
        timestamp: The pywikibot timestamp to convert.

    Returns:
        A datetime object in UTC.
    """
    return datetime.datetime.astimezone(timestamp, tz=datetime.timezone.utc)


def get_doc_from_page(
    page: pywikibot.Page, site: pywikibot.Site | None, source_type: DocumentSource
) -> Document:
    """Generate Danswer Document from a MediaWiki page object.

    Args:
        page: Page from a MediaWiki site.
        site: MediaWiki site (used to parse the sections of the page using the site template, if available).
        source_type: Source of the document.

    Returns:
        Generated document.
    """
    page_text = page.text
    sections_extracted: textlib.Content = textlib.extract_sections(page_text, site)

    sections = [
        Section(
            link=f"{page.full_url()}#" + section.heading.replace(" ", "_"),
            text=section.title + section.content,
        )
        for section in sections_extracted.sections
    ]
    sections.append(
        Section(
            link=page.full_url(),
            text=sections_extracted.header,
        )
    )

    return Document(
        source=source_type,
        title=page.title(),
        doc_updated_at=pywikibot_timestamp_to_utc_datetime(
            page.latest_revision.timestamp
        ),
        sections=sections,
        semantic_identifier=page.title(),
        metadata={"categories": [category.title() for category in page.categories()]},
        id=page.pageid,
    )


class MediaWikiConnector(LoadConnector, PollConnector):
    """A connector for MediaWiki wikis.

    Args:
        hostname: The hostname of the wiki.
        categories: The categories to include in the index.
        pages: The pages to include in the index.
        recurse_depth: The depth to recurse into categories. -1 means unbounded recursion.
        connector_name: The name of the connector.
        language_code: The language code of the wiki.
        batch_size: The batch size for loading documents.

    Raises:
        ValueError: If `recurse_depth` is not an integer greater than or equal to -1.
    """

    document_source_type: ClassVar[DocumentSource] = DocumentSource.MEDIAWIKI
    """DocumentSource type for all documents generated by instances of this class. Can be overridden for connectors
    tailored for specific sites."""

    def __init__(
        self,
        hostname: str,
        categories: list[str],
        pages: list[str],
        recurse_depth: int,
        connector_name: str,
        language_code: str = "en",
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        if recurse_depth < -1:
            raise ValueError(
                f"recurse_depth must be an integer greater than or equal to -1. Got {recurse_depth} instead."
            )
        # -1 means infinite recursion, which `pywikibot` will only do with `True`
        self.recurse_depth: bool | int = True if recurse_depth == -1 else recurse_depth

        self.batch_size = batch_size

        # short names can only have ascii letters and digits
        self.connector_name = connector_name
        connector_name = "".join(ch for ch in connector_name if ch.isalnum())

        self.family = family_class_dispatch(hostname, connector_name)()
        self.site = pywikibot.Site(fam=self.family, code=language_code)
        self.categories = [
            pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}")
            for category in categories
        ]
        self.pages = [pywikibot.Page(self.site, page) for page in pages]

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Load credentials for a MediaWiki site.

        Note:
            For most read-only operations, MediaWiki API credentials are not necessary.
            This method can be overridden in the event that a particular MediaWiki site
            requires credentials.
        """
        return None

    def _get_doc_batch(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
    ) -> Generator[list[Document], None, None]:
        """Request batches of pages from a MediaWiki site.

        Args:
            start: The beginning of the time period of pages to request.
            end: The end of the time period of pages to request.

        Yields:
            Lists of Documents containing each parsed page in a batch.
        """
        doc_batch: list[Document] = []

        # Pywikibot can handle batching for us, including only loading page contents when we finally request them.
        category_pages = [
            pagegenerators.PreloadingGenerator(
                pagegenerators.EdittimeFilterPageGenerator(
                    pagegenerators.CategorizedPageGenerator(
                        category, recurse=self.recurse_depth
                    ),
                    last_edit_start=datetime.datetime.fromtimestamp(start)
                    if start
                    else None,
                    last_edit_end=datetime.datetime.fromtimestamp(end) if end else None,
                ),
                groupsize=self.batch_size,
            )
            for category in self.categories
        ]

        # Since we can specify both individual pages and categories, we need to iterate over all of them.
        all_pages = itertools.chain(self.pages, *category_pages)
        for page in all_pages:
            doc_batch.append(
                get_doc_from_page(page, self.site, self.document_source_type)
            )
            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []
        if doc_batch:
            yield doc_batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Load all documents from the source.

        Returns:
            A generator of documents.
        """
        return self.poll_source(None, None)

    def poll_source(
        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Poll the source for new documents.

        Args:
            start: The start of the time range to poll.
            end: The end of the time range to poll.

        Returns:
            A generator of documents.
        """
        return self._get_doc_batch(start, end)


if __name__ == "__main__":
    HOSTNAME = "fallout.fandom.com"
    test_connector = MediaWikiConnector(
        connector_name="Fallout",
        hostname=HOSTNAME,
        categories=["Fallout:_New_Vegas_factions"],
        pages=["Fallout: New Vegas"],
        recurse_depth=1,
    )

    all_docs = list(test_connector.load_from_state())
    print("All docs", all_docs)
    current = datetime.datetime.now().timestamp()
    one_day_ago = current - 30 * 24 * 60 * 60  # 30 days
    latest_docs = list(test_connector.poll_source(one_day_ago, current))
    print("Latest docs", latest_docs)
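Complementing the __main__ demo above, a sketch of the poll path: poll_source takes plain Unix epoch seconds, which _get_doc_batch forwards to pywikibot's EdittimeFilterPageGenerator. Network access to the (illustrative) target wiki is assumed:

import datetime

from danswer.connectors.mediawiki.wiki import MediaWikiConnector

connector = MediaWikiConnector(
    hostname="fallout.fandom.com",
    categories=["Fallout:_New_Vegas_factions"],
    pages=[],
    recurse_depth=0,
    connector_name="Fallout",
)

end = datetime.datetime.now().timestamp()
start = end - 7 * 24 * 60 * 60  # only pages edited in the last week

# Each yielded batch holds at most `batch_size` parsed Documents.
for batch in connector.poll_source(start, end):
    for doc in batch:
        print(doc.semantic_identifier, doc.doc_updated_at)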
backend/danswer/connectors/wikipedia/__init__.py (new file, 0 lines)
backend/danswer/connectors/wikipedia/connector.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from typing import ClassVar

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.mediawiki import wiki


class WikipediaConnector(wiki.MediaWikiConnector):
    """Connector for Wikipedia."""

    document_source_type: ClassVar[DocumentSource] = DocumentSource.WIKIPEDIA

    def __init__(
        self,
        categories: list[str],
        pages: list[str],
        recurse_depth: int,
        connector_name: str,
        language_code: str = "en",
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        super().__init__(
            hostname="wikipedia.org",
            categories=categories,
            pages=pages,
            recurse_depth=recurse_depth,
            connector_name=connector_name,
            language_code=language_code,
            batch_size=batch_size,
        )
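A short usage sketch: WikipediaConnector is MediaWikiConnector pinned to wikipedia.org, so family_class_dispatch resolves to pywikibot's bundled Wikipedia family and no family file is generated. Network access is assumed; the page title is illustrative:

from danswer.connectors.wikipedia.connector import WikipediaConnector

connector = WikipediaConnector(
    categories=[],
    pages=["Python (programming language)"],
    recurse_depth=0,
    connector_name="wikipedia",
    language_code="en",
)

# load_from_state simply delegates to poll_source(None, None).
for batch in connector.load_from_state():
    print(f"loaded {len(batch)} documents")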
@@ -52,6 +52,7 @@ pytest-playwright==0.3.2
 python-docx==1.1.0
 python-dotenv==1.0.0
 python-multipart==0.0.7
+pywikibot==9.0.0
 requests==2.31.0
 requests-oauthlib==1.3.1
 retry==0.9.2 # This pulls in py which is in CVE-2022-42969, must remove py from image
@@ -0,0 +1,75 @@
from typing import Final
from unittest import mock

import pytest
from pywikibot.families.wikipedia_family import Family as WikipediaFamily
from pywikibot.family import Family

from danswer.connectors.mediawiki import family

NON_BUILTIN_WIKIS: Final[list[tuple[str, str]]] = [
    ("https://fallout.fandom.com", "falloutwiki"),
    ("https://harrypotter.fandom.com/wiki/", "harrypotterwiki"),
    ("https://artofproblemsolving.com/wiki", "artofproblemsolving"),
    ("https://www.bogleheads.org/wiki/Main_Page", "bogleheadswiki"),
    ("https://bogleheads.org/wiki/Main_Page", "bogleheadswiki"),
    ("https://www.dandwiki.com/wiki/", "dungeonsanddragons"),
    ("https://wiki.factorio.com/", "factoriowiki"),
]


# TODO: Add support for more builtin family types from `pywikibot.families`.
@pytest.mark.parametrize(
    "url, name, expected",
    [
        (
            "https://en.wikipedia.org",
            "wikipedia",
            WikipediaFamily,
        ),  # Support urls with protocol
        (
            "wikipedia.org",
            "wikipedia",
            WikipediaFamily,
        ),  # Support urls without subdomain
        (
            "en.wikipedia.org",
            "wikipedia",
            WikipediaFamily,
        ),  # Support urls with subdomain
        ("m.wikipedia.org", "wikipedia", WikipediaFamily),
        ("de.wikipedia.org", "wikipedia", WikipediaFamily),
    ],
)
def test_family_class_dispatch_builtins(
    url: str, name: str, expected: type[Family]
) -> None:
    """Test that the family class dispatch function returns the correct family class in several scenarios."""
    assert family.family_class_dispatch(url, name) == expected


@pytest.mark.parametrize("url, name", NON_BUILTIN_WIKIS)
def test_family_class_dispatch_on_non_builtins_generates_new_class_fast(
    url: str, name: str
) -> None:
    """Test that using the family class dispatch function on an unknown url generates a new family class."""
    with mock.patch.object(
        family, "generate_family_class"
    ) as mock_generate_family_class:
        family.family_class_dispatch(url, name)
    mock_generate_family_class.assert_called_once_with(url, name)


@pytest.mark.slow
@pytest.mark.parametrize("url, name", NON_BUILTIN_WIKIS)
def test_family_class_dispatch_on_non_builtins_generates_new_class_slow(
    url: str, name: str
) -> None:
    """Test that using the family class dispatch function on an unknown url generates a new family class.

    This test is slow because it actually performs the network calls to generate the family classes.
    """
    generated_family_class = family.generate_family_class(url, name)
    assert issubclass(generated_family_class, Family)
    dispatch_family_class = family.family_class_dispatch(url, name)
    assert dispatch_family_class == generated_family_class
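The slow variants above perform real network calls. Assuming the slow marker is registered in the project's pytest configuration, they can be excluded like so (the test path is illustrative):

import pytest

# Equivalent to running on the command line:
#   pytest backend/tests/unit/danswer/connectors/mediawiki -m "not slow"
pytest.main(["backend/tests/unit/danswer/connectors/mediawiki", "-m", "not slow"])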
backend/tests/unit/danswer/connectors/mediawiki/test_wiki.py (new file, 146 lines)
@@ -0,0 +1,146 @@
from __future__ import annotations

import datetime
from collections.abc import Iterable

import pytest
import pywikibot
from pytest_mock import MockFixture

from danswer.connectors.mediawiki import wiki


@pytest.fixture
def site() -> pywikibot.Site:
    return pywikibot.Site("en", "wikipedia")


def test_pywikibot_timestamp_to_utc_datetime() -> None:
    timestamp_without_tzinfo = pywikibot.Timestamp(2023, 12, 27, 15, 38, 49)
    timestamp_min_timezone = timestamp_without_tzinfo.astimezone(datetime.timezone.min)
    timestamp_max_timezone = timestamp_without_tzinfo.astimezone(datetime.timezone.max)
    assert timestamp_min_timezone.tzinfo == datetime.timezone.min
    assert timestamp_max_timezone.tzinfo == datetime.timezone.max
    for timestamp in [
        timestamp_without_tzinfo,
        timestamp_min_timezone,
        timestamp_max_timezone,
    ]:
        dt = wiki.pywikibot_timestamp_to_utc_datetime(timestamp)
        assert dt.tzinfo == datetime.timezone.utc


class MockPage(pywikibot.Page):
    def __init__(
        self, site: pywikibot.Site, title: str, _has_categories: bool = False
    ) -> None:
        super().__init__(site, title)
        self._has_categories = _has_categories
        self.header = "This is a header"
        self._sections = ["This is a section", "This is another section"]

    @property
    def _sections_helper(self) -> list[str]:
        return [
            f"== Section {i} ==\n{section}\n"
            for i, section in enumerate(self._sections)
        ]

    @property
    def text(self) -> str:
        text = self.header + "\n"
        for section in self._sections_helper:
            text += section
        return text

    @property
    def pageid(self) -> str:
        return "1"

    def full_url(self) -> str:
        return "Test URL"

    def categories(
        self,
        with_sort_key: bool = False,
        total: int | None = None,
        content: bool = False,
    ) -> Iterable[pywikibot.Page]:
        if not self._has_categories:
            return []
        return [
            MockPage(self.site, "Test Category1"),
            MockPage(self.site, "Test Category2"),
        ]

    @property
    def latest_revision(self) -> pywikibot.page.Revision:
        return pywikibot.page.Revision(
            timestamp=pywikibot.Timestamp(2023, 12, 27, 15, 38, 49)
        )


def test_get_doc_from_page(site: pywikibot.Site) -> None:
    test_page = MockPage(site, "Test Page", _has_categories=True)
    doc = wiki.get_doc_from_page(test_page, site, wiki.DocumentSource.MEDIAWIKI)
    assert doc.source == wiki.DocumentSource.MEDIAWIKI
    assert doc.title == test_page.title()
    assert doc.doc_updated_at == wiki.pywikibot_timestamp_to_utc_datetime(
        test_page.latest_revision.timestamp
    )
    assert len(doc.sections) == 3
    for section, expected_section in zip(
        doc.sections, test_page._sections_helper + [test_page.header]
    ):
        assert (
            section.text.strip() == expected_section.strip()
        )  # Extra whitespace before/after is okay
        assert section.link.startswith(test_page.full_url())
    assert doc.semantic_identifier == test_page.title()
    assert doc.metadata == {
        "categories": [category.title() for category in test_page.categories()]
    }
    assert doc.id == test_page.pageid


def test_mediawiki_connector_recurse_depth() -> None:
    """Test that the recurse_depth parameter is parsed correctly.

    -1 should be parsed as `True` (for unbounded recursion)
    0 or greater should be parsed as an integer
    Negative values less than -1 should raise a ValueError

    This is the specification dictated by the `pywikibot` library. We do not need to test behavior beyond this.
    """
    hostname = "wikipedia.org"
    categories: list[str] = []
    pages = ["Test Page"]
    connector_name = "Test Connector"

    # Recurse depth less than -1 raises ValueError
    with pytest.raises(ValueError):
        recurse_depth = -2
        wiki.MediaWikiConnector(
            hostname, categories, pages, recurse_depth, connector_name
        )

    # Recurse depth of -1 gets parsed as `True`
    recurse_depth = -1
    connector = wiki.MediaWikiConnector(
        hostname, categories, pages, recurse_depth, connector_name
    )
    assert connector.recurse_depth is True

    # Recurse depth of 0 or greater gets parsed as an integer
    recurse_depth = 0
    connector = wiki.MediaWikiConnector(
        hostname, categories, pages, recurse_depth, connector_name
    )
    assert connector.recurse_depth == recurse_depth


def test_load_from_state_calls_poll_source_with_nones(mocker: MockFixture) -> None:
    connector = wiki.MediaWikiConnector("wikipedia.org", [], [], 0, "test")
    poll_source = mocker.patch.object(connector, "poll_source")
    connector.load_from_state()
    poll_source.assert_called_once_with(None, None)
web/public/MediaWiki.svg (new file, 43 lines, 6.0 KiB)
@@ -0,0 +1,43 @@
<svg xmlns="http://www.w3.org/2000/svg" width="100" height="100" viewBox="0 0 100 100">
  <defs>
    <linearGradient id="gradient" gradientUnits="userSpaceOnUse" x1="0" x2="100" y1="100" y2="0">
      <stop offset="0%" stop-color="#0a00b2"/>
      <stop offset="50%" stop-color="#ff0000"/>
      <stop offset="100%" stop-color="#fffc00"/>
    </linearGradient>
    <style>
      .petal {
        opacity: 0.65;
      }
      .petals {
        fill: url(#gradient);
      }
    </style>
  </defs>
  <g id="petals" class="petals">
    <path class="petal" d="M33.6,14.8a16.938,16.938,0,0,0,3.116,11.142,11.457,11.457,0,0,0,6.858,4.3,3.033,3.033,0,0,0,2.385-.713,11.924,11.924,0,0,0,3.634-10.837c-1.257-7.54-6.19-12.43-12.4-17A32.468,32.468,0,0,0,33.6,14.8Z"/>
    <path class="petal" d="M25.117,20.232a16.938,16.938,0,0,0,5.893,9.956,11.457,11.457,0,0,0,7.738,2.381,3.033,3.033,0,0,0,2.119-1.306,11.924,11.924,0,0,0,.705-11.409C38.406,12.9,32.376,9.449,25.2,6.642A32.468,32.468,0,0,0,25.117,20.232Z"/>
    <path class="petal" d="M18.329,27.677A16.937,16.937,0,0,0,26.6,35.769a11.457,11.457,0,0,0,8.09.3,3.033,3.033,0,0,0,1.709-1.81,11.924,11.924,0,0,0-2.271-11.2c-4.859-5.9-11.576-7.67-19.237-8.523A32.466,32.466,0,0,0,18.329,27.677Z"/>
    <path class="petal" d="M13.7,36.626A16.938,16.938,0,0,0,23.781,42.3a11.457,11.457,0,0,0,7.891-1.807A3.033,3.033,0,0,0,32.854,38.3a11.924,11.924,0,0,0-5.093-10.233c-6.221-4.443-13.167-4.412-20.787-3.254A32.468,32.468,0,0,0,13.7,36.626Z"/>
    <path class="petal" d="M11.543,46.468a16.938,16.938,0,0,0,11.208,2.873,11.457,11.457,0,0,0,7.155-3.788,3.034,3.034,0,0,0,.575-2.422,11.924,11.924,0,0,0-7.568-8.566C15.753,31.884,9.052,33.711,1.99,36.8A32.468,32.468,0,0,0,11.543,46.468Z"/>
    <path class="petal" d="M12.008,56.532a16.938,16.938,0,0,0,11.569-.126,11.457,11.457,0,0,0,5.931-5.51,3.033,3.033,0,0,0-.072-2.488,11.924,11.924,0,0,0-9.527-6.315C12.3,41.356,6.3,44.855.279,49.669A32.467,32.467,0,0,0,12.008,56.532Z"/>
    <path class="petal" d="M15.062,66.134A16.938,16.938,0,0,0,26.2,63.018a11.457,11.457,0,0,0,4.3-6.858,3.033,3.033,0,0,0-.713-2.385,11.924,11.924,0,0,0-10.837-3.634c-7.54,1.257-12.43,6.19-17,12.4A32.468,32.468,0,0,0,15.062,66.134Z"/>
    <path class="petal" d="M20.5,74.618a16.938,16.938,0,0,0,9.956-5.893,11.457,11.457,0,0,0,2.381-7.738,3.033,3.033,0,0,0-1.306-2.119,11.925,11.925,0,0,0-11.409-.705c-6.958,3.166-10.4,9.2-13.212,16.376A32.466,32.466,0,0,0,20.5,74.618Z"/>
    <path class="petal" d="M27.943,81.406a16.938,16.938,0,0,0,8.092-8.269,11.457,11.457,0,0,0,.3-8.09,3.033,3.033,0,0,0-1.81-1.709,11.924,11.924,0,0,0-11.2,2.271c-5.9,4.859-7.67,11.576-8.523,19.237A32.467,32.467,0,0,0,27.943,81.406Z"/>
    <path class="petal" d="M36.891,86.035a16.938,16.938,0,0,0,5.676-10.082,11.457,11.457,0,0,0-1.807-7.891,3.033,3.033,0,0,0-2.191-1.182,11.924,11.924,0,0,0-10.233,5.094c-4.443,6.221-4.412,13.167-3.254,20.787A32.467,32.467,0,0,0,36.891,86.035Z"/>
    <path class="petal" d="M46.733,88.191a16.937,16.937,0,0,0,2.873-11.207,11.457,11.457,0,0,0-3.788-7.155,3.033,3.033,0,0,0-2.422-.575,11.924,11.924,0,0,0-8.566,7.568c-2.681,7.159-.854,13.86,2.237,20.921A32.465,32.465,0,0,0,46.733,88.191Z"/>
    <path class="petal" d="M56.8,87.726a16.937,16.937,0,0,0-.125-11.569,11.457,11.457,0,0,0-5.511-5.931,3.033,3.033,0,0,0-2.488.072,11.924,11.924,0,0,0-6.315,9.528c-0.737,7.609,2.762,13.609,7.576,19.629A32.466,32.466,0,0,0,56.8,87.726Z"/>
    <path class="petal" d="M66.4,84.672A16.938,16.938,0,0,0,63.284,73.53a11.457,11.457,0,0,0-6.858-4.3,3.033,3.033,0,0,0-2.385.713,11.924,11.924,0,0,0-3.634,10.837c1.257,7.54,6.19,12.43,12.4,17A32.468,32.468,0,0,0,66.4,84.672Z"/>
    <path class="petal" d="M74.883,79.237a16.937,16.937,0,0,0-5.893-9.956A11.456,11.456,0,0,0,61.252,66.9a3.033,3.033,0,0,0-2.119,1.306,11.924,11.924,0,0,0-.705,11.409c3.166,6.958,9.2,10.4,16.375,13.212A32.468,32.468,0,0,0,74.883,79.237Z"/>
    <path class="petal" d="M81.671,71.792A16.938,16.938,0,0,0,73.4,63.7a11.457,11.457,0,0,0-8.09-.3,3.033,3.033,0,0,0-1.708,1.81,11.924,11.924,0,0,0,2.271,11.2c4.859,5.9,11.576,7.67,19.237,8.523A32.466,32.466,0,0,0,81.671,71.792Z"/>
    <path class="petal" d="M86.3,62.843a16.938,16.938,0,0,0-10.082-5.676,11.457,11.457,0,0,0-7.891,1.807,3.033,3.033,0,0,0-1.182,2.191A11.924,11.924,0,0,0,72.239,71.4c6.221,4.443,13.167,4.412,20.787,3.254A32.467,32.467,0,0,0,86.3,62.843Z"/>
    <path class="petal" d="M88.457,53a16.938,16.938,0,0,0-11.207-2.873,11.457,11.457,0,0,0-7.155,3.788,3.033,3.033,0,0,0-.574,2.422A11.925,11.925,0,0,0,77.088,64.9c7.158,2.681,13.86.854,20.921-2.237A32.467,32.467,0,0,0,88.457,53Z"/>
    <path class="petal" d="M87.992,42.936a16.938,16.938,0,0,0-11.569.126,11.457,11.457,0,0,0-5.931,5.511,3.033,3.033,0,0,0,.072,2.488,11.924,11.924,0,0,0,9.527,6.315C87.7,58.113,93.7,54.614,99.721,49.8A32.468,32.468,0,0,0,87.992,42.936Z"/>
    <path class="petal" d="M84.938,33.335A16.938,16.938,0,0,0,73.8,36.451a11.457,11.457,0,0,0-4.3,6.858,3.033,3.033,0,0,0,.714,2.385,11.924,11.924,0,0,0,10.837,3.634c7.54-1.257,12.43-6.19,17-12.4A32.466,32.466,0,0,0,84.938,33.335Z"/>
    <path class="petal" d="M79.5,24.851a16.938,16.938,0,0,0-9.956,5.893,11.457,11.457,0,0,0-2.381,7.738A3.033,3.033,0,0,0,68.472,40.6a11.924,11.924,0,0,0,11.409.705c6.958-3.166,10.4-9.2,13.212-16.375A32.468,32.468,0,0,0,79.5,24.851Z"/>
    <path class="petal" d="M72.057,18.063a16.938,16.938,0,0,0-8.092,8.269,11.457,11.457,0,0,0-.3,8.09,3.033,3.033,0,0,0,1.81,1.709,11.924,11.924,0,0,0,11.2-2.271c5.9-4.859,7.67-11.576,8.523-19.237A32.467,32.467,0,0,0,72.057,18.063Z"/>
    <path class="petal" d="M63.109,13.434a16.937,16.937,0,0,0-5.676,10.082,11.457,11.457,0,0,0,1.807,7.891,3.033,3.033,0,0,0,2.191,1.182A11.924,11.924,0,0,0,71.664,27.5c4.443-6.221,4.412-13.167,3.254-20.787A32.466,32.466,0,0,0,63.109,13.434Z"/>
    <path class="petal" d="M53.267,11.278a16.937,16.937,0,0,0-2.873,11.207,11.456,11.456,0,0,0,3.788,7.155,3.033,3.033,0,0,0,2.422.575,11.924,11.924,0,0,0,8.566-7.568c2.681-7.159.854-13.86-2.237-20.921A32.466,32.466,0,0,0,53.267,11.278Z"/>
    <path class="petal" d="M43.2,11.743a16.938,16.938,0,0,0,.126,11.569,11.457,11.457,0,0,0,5.511,5.931,3.033,3.033,0,0,0,2.488-.072,11.924,11.924,0,0,0,6.315-9.528C58.379,12.034,54.88,6.034,50.066.014A32.468,32.468,0,0,0,43.2,11.743Z"/>
  </g>
</svg>
web/public/Wikipedia.svg (new file, 1 line)
File diff suppressed because one or more lines are too long
web/src/app/admin/connectors/mediawiki/page.tsx (new file, 208 lines)
@@ -0,0 +1,208 @@
"use client";

import * as Yup from "yup";
import { MediaWikiIcon, TrashIcon } from "@/components/icons/icons";
import {
  TextArrayField,
  TextArrayFieldBuilder,
  TextFormField,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
  MediaWikiCredentialJson,
  MediaWikiConfig,
  ConnectorIndexingStatus,
  Credential,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { adminDeleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
import { AdminPageTitle } from "@/components/admin/Title";
import { Card, Text, Title } from "@tremor/react";

const Main = () => {
  const { popup, setPopup } = usePopup();

  const { mutate } = useSWRConfig();
  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );
  const {
    data: credentialsData,
    isLoading: isCredentialsLoading,
    error: isCredentialsError,
    refreshCredentials,
  } = usePublicCredentials();

  if (
    (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
    (!credentialsData && isCredentialsLoading)
  ) {
    return <LoadingAnimation text="Loading" />;
  }

  if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
    return <div>Failed to load connectors</div>;
  }

  if (isCredentialsError || !credentialsData) {
    return <div>Failed to load credentials</div>;
  }

  const mediawikiConnectorIndexingStatuses: ConnectorIndexingStatus<
    MediaWikiConfig,
    MediaWikiCredentialJson
  >[] = connectorIndexingStatuses.filter(
    (connectorIndexingStatus) =>
      connectorIndexingStatus.connector.source === "mediawiki"
  );
  const mediawikiCredential: Credential<MediaWikiCredentialJson> | undefined =
    credentialsData.find((credential) => true);

  return (
    <>
      {popup}
      {mediawikiConnectorIndexingStatuses.length > 0 && (
        <>
          <Title className="mb-2 mt-6 ml-auto mr-auto">
            MediaWiki indexing status
          </Title>
          <Text className="mb-2">
            The latest page changes are fetched every 10 minutes.
          </Text>
          <div className="mb-2">
            <ConnectorsTable<MediaWikiConfig, MediaWikiCredentialJson>
              connectorIndexingStatuses={mediawikiConnectorIndexingStatuses}
              liveCredential={mediawikiCredential}
              getCredential={(credential) => {
                return <div></div>;
              }}
              onCredentialLink={async (connectorId) => {
                if (mediawikiCredential) {
                  await linkCredential(connectorId, mediawikiCredential.id);
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
              onUpdate={() =>
                mutate("/api/manage/admin/connector/indexing-status")
              }
            />
          </div>
        </>
      )}

      {mediawikiCredential && (
        <>
          <Card className="mt-4">
            <h2 className="font-bold mb-3">Create Connection</h2>
            <Text className="mb-4">
              Press connect below to start the connection to your MediaWiki
              instance.
            </Text>
            <ConnectorForm<MediaWikiConfig>
              nameBuilder={(values) =>
                `MediaWikiConnector-${values.connector_name}`
              }
              ccPairNameBuilder={(values) =>
                `MediaWikiConnector-${values.connector_name}`
              }
              source="mediawiki"
              inputType="poll"
              formBodyBuilder={(values) => (
                <div>
                  <TextFormField
                    name="connector_name"
                    label="Connector Name:"
                  />
                  <TextFormField name="hostname" label="MediaWiki Site URL:" />
                  <TextFormField
                    name="language_code"
                    label="MediaWiki Site Language Code (e.g. 'en', 'es', etc.):"
                  />
                  {TextArrayFieldBuilder({
                    name: "pages",
                    label: "Pages to index:",
                    subtext:
                      "Specify 0 or more names of pages to index. Only specify the name of the page, not its url.",
                  })(values)}
                  {TextArrayFieldBuilder({
                    name: "categories",
                    label: "Categories to index:",
                    subtext:
                      "Specify 0 or more names of categories to index. For most MediaWiki sites, these are pages" +
                      " with a name of the form 'Category: XYZ', that are lists of other pages/categories. Only" +
                      " specify the name of the category, not its url.",
                  })(values)}
                  <TextFormField
                    name="recurse_depth"
                    label="Recursion Depth:"
                    type="number"
                    subtext="When indexing categories that have sub-categories, this will determine how many levels to index. Specify 0 to only index the category itself (i.e. no recursion). Specify -1 for unlimited recursion depth. Note that in some rare instances, a category might contain itself in its dependencies, which will cause an infinite loop. Only use -1 if you are confident that this will not happen."
                  />
                </div>
              )}
              validationSchema={Yup.object().shape({
                connector_name: Yup.string().required(
                  "Please enter a name for your MediaWiki connector."
                ),
                hostname: Yup.string().required(
                  "Please enter the base URL for your MediaWiki site"
                ),
                language_code: Yup.string().default("en"),
                categories: Yup.array().of(
                  Yup.string().required(
                    "Please enter categories to index from your MediaWiki site"
                  )
                ),
                pages: Yup.array().of(
                  Yup.string().required(
                    "Please enter pages to index from your MediaWiki site"
                  )
                ),
                recurse_depth: Yup.number().required(
                  "Please enter the recursion depth for your MediaWiki site."
                ),
              })}
              initialValues={{
                connector_name: "",
                hostname: "",
                language_code: "en",
                categories: [],
                pages: [],
                recurse_depth: 0,
              }}
              refreshFreq={10 * 60} // 10 minutes
              credentialId={mediawikiCredential.id}
            />
          </Card>
        </>
      )}
    </>
  );
};

export default function Page() {
  return (
    <div className="mx-auto container">
      <div className="mb-4">
        <HealthCheckBanner />
      </div>

      <AdminPageTitle icon={<MediaWikiIcon size={32} />} title="MediaWiki" />

      <Main />
    </div>
  );
}
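The Recursion Depth subtext above mirrors the parsing rule implemented in MediaWikiConnector.__init__. Restated in isolation as a runnable sketch:

def parse_recurse_depth(recurse_depth: int) -> bool | int:
    """Condensed restatement of the rule from MediaWikiConnector.__init__."""
    if recurse_depth < -1:
        raise ValueError("recurse_depth must be >= -1")
    # pywikibot only recurses without bound when given `True`
    return True if recurse_depth == -1 else recurse_depth

assert parse_recurse_depth(-1) is True  # unlimited recursion
assert parse_recurse_depth(0) == 0      # index only the category itself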
web/src/app/admin/connectors/wikipedia/page.tsx (new file, 203 lines)
@@ -0,0 +1,203 @@
"use client";

import * as Yup from "yup";
import { WikipediaIcon, TrashIcon } from "@/components/icons/icons";
import {
  TextArrayField,
  TextArrayFieldBuilder,
  TextFormField,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
  WikipediaCredentialJson,
  WikipediaConfig,
  ConnectorIndexingStatus,
  Credential,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { adminDeleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
import { AdminPageTitle } from "@/components/admin/Title";
import { Card, Text, Title } from "@tremor/react";

const Main = () => {
  const { popup, setPopup } = usePopup();

  const { mutate } = useSWRConfig();
  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );
  const {
    data: credentialsData,
    isLoading: isCredentialsLoading,
    error: isCredentialsError,
    refreshCredentials,
  } = usePublicCredentials();

  if (
    (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
    (!credentialsData && isCredentialsLoading)
  ) {
    return <LoadingAnimation text="Loading" />;
  }

  if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
    return <div>Failed to load connectors</div>;
  }

  if (isCredentialsError || !credentialsData) {
    return <div>Failed to load credentials</div>;
  }

  const wikipediaConnectorIndexingStatuses: ConnectorIndexingStatus<
    WikipediaConfig,
    WikipediaCredentialJson
  >[] = connectorIndexingStatuses.filter(
    (connectorIndexingStatus) =>
      connectorIndexingStatus.connector.source === "wikipedia"
  );
  const wikipediaCredential: Credential<WikipediaCredentialJson> | undefined =
    credentialsData.find((credential) => true);

  return (
    <>
      {popup}
      {wikipediaConnectorIndexingStatuses.length > 0 && (
        <>
          <Title className="mb-2 mt-6 ml-auto mr-auto">
            Wikipedia indexing status
          </Title>
          <Text className="mb-2">
            The latest page changes are fetched every 10 minutes.
          </Text>
          <div className="mb-2">
            <ConnectorsTable<WikipediaConfig, WikipediaCredentialJson>
              connectorIndexingStatuses={wikipediaConnectorIndexingStatuses}
              liveCredential={wikipediaCredential}
              getCredential={(credential) => {
                return <div></div>;
              }}
              onCredentialLink={async (connectorId) => {
                if (wikipediaCredential) {
                  await linkCredential(connectorId, wikipediaCredential.id);
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
              onUpdate={() =>
                mutate("/api/manage/admin/connector/indexing-status")
              }
            />
          </div>
        </>
      )}

      {wikipediaCredential && (
        <>
          <Card className="mt-4">
            <h2 className="font-bold mb-3">Create Connection</h2>
            <Text className="mb-4">
              Press connect below to start the connection to your Wikipedia
              instance.
            </Text>
            <ConnectorForm<WikipediaConfig>
              nameBuilder={(values) =>
                `WikipediaConnector-${values.connector_name}`
              }
              ccPairNameBuilder={(values) =>
                `WikipediaConnector-${values.connector_name}`
              }
              source="wikipedia"
              inputType="poll"
              formBodyBuilder={(values) => (
                <div>
                  <TextFormField
                    name="connector_name"
                    label="Connector Name:"
                  />
                  <TextFormField
                    name="language_code"
                    label="Wikipedia Site Language Code (e.g. 'en', 'es', etc.):"
                  />
                  {TextArrayFieldBuilder({
                    name: "pages",
                    label: "Pages to index:",
                    subtext:
                      "Specify 0 or more names of pages to index. Only specify the name of the page, not its url.",
                  })(values)}
                  {TextArrayFieldBuilder({
                    name: "categories",
                    label: "Categories to index:",
                    subtext:
                      "Specify 0 or more names of categories to index. These are pages" +
                      " with a name of the form 'Category: XYZ', that are lists of other pages/categories. Only" +
                      " specify the name of the category, not its url.",
                  })(values)}
                  <TextFormField
                    name="recurse_depth"
                    label="Recursion Depth:"
                    type="number"
                    subtext="When indexing categories that have sub-categories, this will determine how many levels to index. Specify 0 to only index the category itself (i.e. no recursion). Specify -1 for unlimited recursion depth. Note that in some rare instances, a category might contain itself in its dependencies, which will cause an infinite loop. Only use -1 if you are confident that this will not happen."
                  />
                </div>
              )}
              validationSchema={Yup.object().shape({
                connector_name: Yup.string().required(
                  "Please enter a name for your Wikipedia connector."
                ),
                language_code: Yup.string().default("en"),
                categories: Yup.array().of(
                  Yup.string().required(
                    "Please enter categories to index from your Wikipedia site"
                  )
                ),
                pages: Yup.array().of(
                  Yup.string().required(
                    "Please enter pages to index from your Wikipedia site"
                  )
                ),
                recurse_depth: Yup.number().required(
                  "Please enter the recursion depth for your Wikipedia site."
                ),
              })}
              initialValues={{
                connector_name: "",
                language_code: "en",
                categories: [],
                pages: [],
                recurse_depth: 0,
              }}
              refreshFreq={10 * 60} // 10 minutes
              credentialId={wikipediaCredential.id}
            />
          </Card>
        </>
      )}
    </>
  );
};

export default function Page() {
  return (
    <div className="mx-auto container">
      <div className="mb-4">
        <HealthCheckBanner />
      </div>

      <AdminPageTitle icon={<WikipediaIcon size={32} />} title="Wikipedia" />

      <Main />
    </div>
  );
}
@@ -52,6 +52,8 @@ import document360Icon from "../../../public/Document360.png";
 import googleSitesIcon from "../../../public/GoogleSites.png";
 import zendeskIcon from "../../../public/Zendesk.svg";
 import sharepointIcon from "../../../public/Sharepoint.png";
+import mediawikiIcon from "../../../public/MediaWiki.svg";
+import wikipediaIcon from "../../../public/Wikipedia.svg";
 import discourseIcon from "../../../public/Discourse.png";
 import { FaRobot } from "react-icons/fa";

@@ -625,3 +627,27 @@ export const AxeroIcon = ({
     <Image src="/Axero.jpeg" alt="Logo" width="96" height="96" />
   </div>
 );
+
+export const MediaWikiIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => (
+  <div
+    style={{ width: `${size}px`, height: `${size}px` }}
+    className={`w-[${size}px] h-[${size}px] ` + className}
+  >
+    <Image src={mediawikiIcon} alt="Logo" width="96" height="96" />
+  </div>
+);
+
+export const WikipediaIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => (
+  <div
+    style={{ width: `${size}px`, height: `${size}px` }}
+    className={`w-[${size}px] h-[${size}px] ` + className}
+  >
+    <Image src={wikipediaIcon} alt="Logo" width="96" height="96" />
+  </div>
+);
@@ -25,6 +25,8 @@ import {
   SlackIcon,
   ZendeskIcon,
   ZulipIcon,
+  MediaWikiIcon,
+  WikipediaIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import { SourceCategory, SourceMetadata } from "./search/interfaces";

@@ -166,6 +168,16 @@ const SOURCE_METADATA_MAP: SourceMap = {
     displayName: "Axero",
     category: SourceCategory.AppConnection,
   },
+  wikipedia: {
+    icon: WikipediaIcon,
+    displayName: "Wikipedia",
+    category: SourceCategory.AppConnection,
+  },
+  mediawiki: {
+    icon: MediaWikiIcon,
+    displayName: "MediaWiki",
+    category: SourceCategory.AppConnection,
+  },
   requesttracker: {
     icon: RequestTrackerIcon,
     displayName: "Request Tracker",
@@ -40,7 +40,9 @@ export type ValidSources =
   | "sharepoint"
   | "zendesk"
   | "discourse"
-  | "axero";
+  | "axero"
+  | "wikipedia"
+  | "mediawiki";

 export type ValidInputTypes = "load_state" | "poll" | "event";
 export type ValidStatuses =

@@ -179,6 +181,19 @@ export interface GoogleSitesConfig {

 export interface ZendeskConfig {}

+export interface MediaWikiBaseConfig {
+  connector_name: string;
+  language_code: string;
+  categories?: string[];
+  pages?: string[];
+  recurse_depth?: number;
+}
+export interface MediaWikiConfig extends MediaWikiBaseConfig {
+  hostname: string;
+}
+
+export interface WikipediaConfig extends MediaWikiBaseConfig {}
+
 export interface IndexAttemptSnapshot {
   id: number;
   status: ValidStatuses | null;

@@ -353,6 +368,9 @@ export interface AxeroCredentialJson {
   axero_api_token: string;
 }

+export interface MediaWikiCredentialJson {}
+export interface WikipediaCredentialJson extends MediaWikiCredentialJson {}
+
 // DELETION

 export interface DeletionAttemptSnapshot {
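For reference, a hypothetical connector_specific_config payload matching the MediaWikiConfig interface above. The field names come directly from the interface; the values and the surrounding request shape are assumptions for illustration:

# Hypothetical payload a MediaWiki connector created through the admin form
# above would carry; values are illustrative.
mediawiki_config = {
    "connector_name": "factoriowiki",
    "hostname": "https://wiki.factorio.com",
    "language_code": "en",
    "categories": ["Technology"],
    "pages": ["Main Page"],
    "recurse_depth": 0,
}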