Add MediaWiki and Wikipedia Connectors (#1250)

* Add MediaWikiConnector first draft

* Add MediaWikiConnector first draft

* Add MediaWikiConnector first draft

* Add MediaWikiConnector sections for each document

* Add MediaWikiConnector to constants and factory

* Integrate MediaWikiConnector with connectors page

* Unit tests + bug fixes

* Allow adding multiple mediawikiconnectors

* add wikipedia connector

* add wikipedia connector to factory

* improve docstrings of mediawiki connector backend

* improve docstrings of mediawiki connector backend

* move wikipedia and mediawiki icon locations in admin page

* undo accidental commit of modified docker compose yaml
Author: Andrew Sansom
Date: 2024-05-24 10:51:20 -05:00
Committed by: GitHub
Parent: 6e5d9f33d2
Commit: 94018e83b0
18 changed files with 1161 additions and 1 deletion

@@ -96,6 +96,8 @@ class DocumentSource(str, Enum):
SHAREPOINT = "sharepoint"
DISCOURSE = "discourse"
AXERO = "axero"
MEDIAWIKI = "mediawiki"
WIKIPEDIA = "wikipedia"
class DocumentIndexType(str, Enum):

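Since DocumentSource mixes in str, the two new members compare and serialize as their raw string values. A quick sketch of the round-trip used whenever a connector's source is persisted (import path as in the hunks below):

from danswer.configs.constants import DocumentSource

# str-mixin Enum members equal their raw values and round-trip from strings.
assert DocumentSource.MEDIAWIKI == "mediawiki"
assert DocumentSource("wikipedia") is DocumentSource.WIKIPEDIA
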
@@ -23,6 +23,7 @@ from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.linear.connector import LinearConnector
from danswer.connectors.loopio.connector import LoopioConnector
from danswer.connectors.mediawiki.wiki import MediaWikiConnector
from danswer.connectors.models import InputType
from danswer.connectors.notion.connector import NotionConnector
from danswer.connectors.productboard.connector import ProductboardConnector
@@ -32,6 +33,7 @@ from danswer.connectors.slab.connector import SlabConnector
from danswer.connectors.slack.connector import SlackPollConnector
from danswer.connectors.slack.load_connector import SlackLoadConnector
from danswer.connectors.web.connector import WebConnector
from danswer.connectors.wikipedia.connector import WikipediaConnector
from danswer.connectors.zendesk.connector import ZendeskConnector
from danswer.connectors.zulip.connector import ZulipConnector
@@ -74,6 +76,8 @@ def identify_connector_class(
DocumentSource.SHAREPOINT: SharepointConnector,
DocumentSource.DISCOURSE: DiscourseConnector,
DocumentSource.AXERO: AxeroConnector,
DocumentSource.MEDIAWIKI: MediaWikiConnector,
DocumentSource.WIKIPEDIA: WikipediaConnector,
}
connector_by_source = connector_map.get(source, {})

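With both entries registered, the factory resolves the new sources like any other. A minimal sketch, assuming the hunk's host module is danswer.connectors.factory and that identify_connector_class takes a (source, input_type) pair as the hunk header suggests:

from danswer.configs.constants import DocumentSource
from danswer.connectors.factory import identify_connector_class
from danswer.connectors.mediawiki.wiki import MediaWikiConnector
from danswer.connectors.models import InputType

# DocumentSource.MEDIAWIKI maps straight to a class, so any supported
# input type resolves to MediaWikiConnector.
connector_cls = identify_connector_class(DocumentSource.MEDIAWIKI, InputType.POLL)
assert connector_cls is MediaWikiConnector
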
@@ -0,0 +1,166 @@
from __future__ import annotations
import builtins
import functools
import itertools
from typing import Any
from unittest import mock
from urllib.parse import urlparse
from urllib.parse import urlunparse
from pywikibot import family # type: ignore[import-untyped]
from pywikibot import pagegenerators
from pywikibot.scripts import generate_family_file # type: ignore[import-untyped]
from pywikibot.scripts.generate_user_files import pywikibot # type: ignore[import-untyped]
from danswer.utils.logger import setup_logger
logger = setup_logger()
@mock.patch.object(
builtins, "print", lambda *args: logger.info("\t".join(map(str, args)))
)
class FamilyFileGeneratorInMemory(generate_family_file.FamilyFileGenerator):
"""A subclass of FamilyFileGenerator that writes the family file to memory instead of to disk."""
def __init__(
self,
url: str,
name: str,
dointerwiki: str | bool = True,
verify: str | bool = True,
):
"""Initialize the FamilyFileGeneratorInMemory."""
url_parse = urlparse(url, "https")
if not url_parse.netloc and url_parse.path:
url = urlunparse(
(url_parse.scheme, url_parse.path, url_parse.netloc, *url_parse[3:])
)
else:
url = urlunparse(url_parse)
assert isinstance(url, str)
if any(x not in generate_family_file.NAME_CHARACTERS for x in name):
raise ValueError(
f'ERROR: Name of family "{name}" must be ASCII letters and digits [a-zA-Z0-9]'
)
if isinstance(dointerwiki, bool):
dointerwiki = "Y" if dointerwiki else "N"
assert isinstance(dointerwiki, str)
if isinstance(verify, bool):
verify = "Y" if verify else "N"
assert isinstance(verify, str)
super().__init__(url, name, dointerwiki, verify)
self.family_definition: type[family.Family] | None = None
def get_params(self) -> bool:
"""Get the parameters for the family class definition.
This override prevents the method from prompting the user for input (which would be impossible in this context).
We do all the input validation in the constructor.
"""
return True
def writefile(self, verify: Any) -> None:
"""Write the family file.
This overrides the method in the parent class to write the family definition to memory instead of to disk.
Args:
verify: unused argument necessary to match the signature of the method in the parent class.
"""
code_hostname_pairs = {
f"{k}": f"{urlparse(w.server).netloc}" for k, w in self.wikis.items()
}
code_path_pairs = {f"{k}": f"{w.scriptpath}" for k, w in self.wikis.items()}
code_protocol_pairs = {
f"{k}": f"{urlparse(w.server).scheme}" for k, w in self.wikis.items()
}
# Note: pywikibot's on-disk template substitutes "%(name)s" at write time;
# since this class is built in memory, capture the family name explicitly.
family_name = self.name
class Family(family.Family): # noqa: D101
"""The family definition for the wiki."""
name = family_name
langs = code_hostname_pairs
def scriptpath(self, code: str) -> str:
return code_path_pairs[code]
def protocol(self, code: str) -> str:
return code_protocol_pairs[code]
self.family_definition = Family
@functools.lru_cache(maxsize=None)
def generate_family_class(url: str, name: str) -> type[family.Family]:
"""Generate a family file for a given URL and name.
Args:
url: The URL of the wiki.
name: The short name of the wiki (customizable by the user).
Returns:
The family definition.
Raises:
ValueError: If the family definition was not generated.
"""
generator = FamilyFileGeneratorInMemory(url, name, "Y", "Y")
generator.run()
if generator.family_definition is None:
raise ValueError("Family definition was not generated.")
return generator.family_definition
def family_class_dispatch(url: str, name: str) -> type[family.Family]:
"""Find or generate a family class for a given URL and name.
Args:
url: The URL of the wiki.
name: The short name of the wiki (customizable by the user).
"""
if "wikipedia" in url:
import pywikibot.families.wikipedia_family
return pywikibot.families.wikipedia_family.Family
# TODO: Support additional families pre-defined in `pywikibot.families.*_family.py` files
return generate_family_class(url, name)
if __name__ == "__main__":
url = "fallout.fandom.com/wiki/Fallout_Wiki"
name = "falloutfandom"
categories: list[str] = []
pages = ["Fallout: New Vegas"]
recursion_depth = 1
family_type = generate_family_class(url, name)
site = pywikibot.Site(fam=family_type(), code="en")
categories = [
pywikibot.Category(site, f"Category:{category.replace(' ', '_')}")
for category in categories
]
pages = [pywikibot.Page(site, page) for page in pages]
all_pages = itertools.chain(
pages,
*[
pagegenerators.CategorizedPageGenerator(category, recurse=recursion_depth)
for category in categories
],
)
for page in all_pages:
print(page.title())
print(page.text[:1000])

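One subtlety in the constructor above: for scheme-less input, urlparse leaves netloc empty and puts the host into path, which is why the components are swapped before reassembly. A minimal sketch of that behavior using only the standard library:

from urllib.parse import urlparse, urlunparse

# Scheme-less input: the host lands in `path` while `netloc` stays empty.
bare = urlparse("fallout.fandom.com/wiki/Fallout_Wiki", "https")
assert bare.netloc == "" and bare.path.startswith("fallout.fandom.com")

# Swapping path into the netloc slot (as the constructor does) yields a usable URL.
fixed = urlunparse((bare.scheme, bare.path, bare.netloc, *bare[3:]))
assert fixed == "https://fallout.fandom.com/wiki/Fallout_Wiki"
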
@@ -0,0 +1,225 @@
from __future__ import annotations
import datetime
import itertools
from collections.abc import Generator
from typing import Any
from typing import ClassVar
import pywikibot.time
from pywikibot import pagegenerators
from pywikibot import textlib
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.mediawiki.family import family_class_dispatch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
def pywikibot_timestamp_to_utc_datetime(
timestamp: pywikibot.time.Timestamp,
) -> datetime.datetime:
"""Convert a pywikibot timestamp to a datetime object in UTC.
Args:
timestamp: The pywikibot timestamp to convert.
Returns:
A datetime object in UTC.
"""
return timestamp.astimezone(tz=datetime.timezone.utc)
def get_doc_from_page(
page: pywikibot.Page, site: pywikibot.Site | None, source_type: DocumentSource
) -> Document:
"""Generate Danswer Document from a MediaWiki page object.
Args:
page: Page from a MediaWiki site.
site: MediaWiki site (used to parse the sections of the page using the site template, if available).
source_type: Source of the document.
Returns:
Generated document.
"""
page_text = page.text
sections_extracted: textlib.Content = textlib.extract_sections(page_text, site)
sections = [
Section(
link=f"{page.full_url()}#" + section.heading.replace(" ", "_"),
text=section.title + section.content,
)
for section in sections_extracted.sections
]
sections.append(
Section(
link=page.full_url(),
text=sections_extracted.header,
)
)
return Document(
source=source_type,
title=page.title(),
doc_updated_at=pywikibot_timestamp_to_utc_datetime(
page.latest_revision.timestamp
),
sections=sections,
semantic_identifier=page.title(),
metadata={"categories": [category.title() for category in page.categories()]},
id=page.pageid,
)
class MediaWikiConnector(LoadConnector, PollConnector):
"""A connector for MediaWiki wikis.
Args:
hostname: The hostname of the wiki.
categories: The categories to include in the index.
pages: The pages to include in the index.
recurse_depth: The depth to recurse into categories. -1 means unbounded recursion.
connector_name: The name of the connector.
language_code: The language code of the wiki.
batch_size: The batch size for loading documents.
Raises:
ValueError: If `recurse_depth` is not an integer greater than or equal to -1.
"""
document_source_type: ClassVar[DocumentSource] = DocumentSource.MEDIAWIKI
"""DocumentSource type for all documents generated by instances of this class. Can be overridden for connectors
tailored for specific sites."""
def __init__(
self,
hostname: str,
categories: list[str],
pages: list[str],
recurse_depth: int,
connector_name: str,
language_code: str = "en",
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
if recurse_depth < -1:
raise ValueError(
f"recurse_depth must be an integer greater than or equal to -1. Got {recurse_depth} instead."
)
# -1 means infinite recursion, which `pywikibot` will only do with `True`
self.recurse_depth: bool | int = True if recurse_depth == -1 else recurse_depth
self.batch_size = batch_size
# short names can only have ascii letters and digits
self.connector_name = connector_name
connector_name = "".join(ch for ch in connector_name if ch.isalnum())
self.family = family_class_dispatch(hostname, connector_name)()
self.site = pywikibot.Site(fam=self.family, code=language_code)
self.categories = [
pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}")
for category in categories
]
self.pages = [pywikibot.Page(self.site, page) for page in pages]
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
"""Load credentials for a MediaWiki site.
Note:
For most read-only operations, MediaWiki API credentials are not necessary.
This method can be overridden in the event that a particular MediaWiki site
requires credentials.
"""
return None
def _get_doc_batch(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Generator[list[Document], None, None]:
"""Request batches of pages from a MediaWiki site.
Args:
start: The beginning of the time period of pages to request.
end: The end of the time period of pages to request.
Yields:
Lists of Documents containing each parsed page in a batch.
"""
doc_batch: list[Document] = []
# Pywikibot can handle batching for us, including only loading page contents when we finally request them.
category_pages = [
pagegenerators.PreloadingGenerator(
pagegenerators.EdittimeFilterPageGenerator(
pagegenerators.CategorizedPageGenerator(
category, recurse=self.recurse_depth
),
last_edit_start=datetime.datetime.fromtimestamp(start)
if start
else None,
last_edit_end=datetime.datetime.fromtimestamp(end) if end else None,
),
groupsize=self.batch_size,
)
for category in self.categories
]
# Since we can specify both individual pages and categories, we need to iterate over all of them.
all_pages = itertools.chain(self.pages, *category_pages)
for page in all_pages:
doc_batch.append(
get_doc_from_page(page, self.site, self.document_source_type)
)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
if doc_batch:
yield doc_batch
def load_from_state(self) -> GenerateDocumentsOutput:
"""Load all documents from the source.
Returns:
A generator of documents.
"""
return self.poll_source(None, None)
def poll_source(
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
"""Poll the source for new documents.
Args:
start: The start of the time range to poll.
end: The end of the time range to poll.
Returns:
A generator of documents.
"""
return self._get_doc_batch(start, end)
if __name__ == "__main__":
HOSTNAME = "fallout.fandom.com"
test_connector = MediaWikiConnector(
connector_name="Fallout",
hostname=HOSTNAME,
categories=["Fallout:_New_Vegas_factions"],
pages=["Fallout: New Vegas"],
recurse_depth=1,
)
all_docs = list(test_connector.load_from_state())
print("All docs", all_docs)
current = datetime.datetime.now().timestamp()
one_month_ago = current - 30 * 24 * 60 * 60  # 30 days
latest_docs = list(test_connector.poll_source(one_month_ago, current))
print("Latest docs", latest_docs)

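Beyond the smoke test above, two contracts are worth spelling out: poll bounds are seconds since the Unix epoch, and each yielded list holds at most batch_size documents. A sketch under those assumptions (host, page, and batch size are illustrative):

import datetime

from danswer.connectors.mediawiki.wiki import MediaWikiConnector

connector = MediaWikiConnector(
    hostname="fallout.fandom.com",  # illustrative host
    categories=[],
    pages=["Fallout: New Vegas"],
    recurse_depth=0,
    connector_name="falloutdemo",
    batch_size=8,
)

# Poll a one-week window; bounds are seconds since the Unix epoch.
end = datetime.datetime.now().timestamp()
start = end - 7 * 24 * 60 * 60
for batch in connector.poll_source(start, end):
    assert len(batch) <= 8  # batching contract of _get_doc_batch
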
@@ -0,0 +1,30 @@
from typing import ClassVar
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.mediawiki import wiki
class WikipediaConnector(wiki.MediaWikiConnector):
"""Connector for Wikipedia."""
document_source_type: ClassVar[DocumentSource] = DocumentSource.WIKIPEDIA
def __init__(
self,
categories: list[str],
pages: list[str],
recurse_depth: int,
connector_name: str,
language_code: str = "en",
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
super().__init__(
hostname="wikipedia.org",
categories=categories,
pages=pages,
recurse_depth=recurse_depth,
connector_name=connector_name,
language_code=language_code,
batch_size=batch_size,
)

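Usage mirrors the parent class, minus the hostname. A small sketch (the page title is illustrative):

from danswer.connectors.wikipedia.connector import WikipediaConnector

connector = WikipediaConnector(
    categories=[],
    pages=["Python (programming language)"],  # illustrative page title
    recurse_depth=0,
    connector_name="wikipediademo",
)
# load_from_state returns a generator of document batches.
first_batch = next(connector.load_from_state())
print(first_batch[0].semantic_identifier)
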
@@ -52,6 +52,7 @@ pytest-playwright==0.3.2
python-docx==1.1.0
python-dotenv==1.0.0
python-multipart==0.0.7
pywikibot==9.0.0
requests==2.31.0
requests-oauthlib==1.3.1
retry==0.9.2 # This pulls in py which is in CVE-2022-42969, must remove py from image

@@ -0,0 +1,75 @@
from typing import Final
from unittest import mock
import pytest
from pywikibot.families.wikipedia_family import Family as WikipediaFamily
from pywikibot.family import Family
from danswer.connectors.mediawiki import family
NON_BUILTIN_WIKIS: Final[list[tuple[str, str]]] = [
("https://fallout.fandom.com", "falloutwiki"),
("https://harrypotter.fandom.com/wiki/", "harrypotterwiki"),
("https://artofproblemsolving.com/wiki", "artofproblemsolving"),
("https://www.bogleheads.org/wiki/Main_Page", "bogleheadswiki"),
("https://bogleheads.org/wiki/Main_Page", "bogleheadswiki"),
("https://www.dandwiki.com/wiki/", "dungeonsanddragons"),
("https://wiki.factorio.com/", "factoriowiki"),
]
# TODO: Add support for more builtin family types from `pywikibot.families`.
@pytest.mark.parametrize(
"url, name, expected",
[
(
"https://en.wikipedia.org",
"wikipedia",
WikipediaFamily,
), # Support urls with protocol
(
"wikipedia.org",
"wikipedia",
WikipediaFamily,
), # Support urls without subdomain
(
"en.wikipedia.org",
"wikipedia",
WikipediaFamily,
), # Support urls with subdomain
("m.wikipedia.org", "wikipedia", WikipediaFamily),
("de.wikipedia.org", "wikipedia", WikipediaFamily),
],
)
def test_family_class_dispatch_builtins(
url: str, name: str, expected: type[Family]
) -> None:
"""Test that the family class dispatch function returns the correct family class in several scenarios."""
assert family.family_class_dispatch(url, name) == expected
@pytest.mark.parametrize("url, name", NON_BUILTIN_WIKIS)
def test_family_class_dispatch_on_non_builtins_generates_new_class_fast(
url: str, name: str
) -> None:
"""Test that using the family class dispatch function on an unknown url generates a new family class."""
with mock.patch.object(
family, "generate_family_class"
) as mock_generate_family_class:
family.family_class_dispatch(url, name)
mock_generate_family_class.assert_called_once_with(url, name)
@pytest.mark.slow
@pytest.mark.parametrize("url, name", NON_BUILTIN_WIKIS)
def test_family_class_dispatch_on_non_builtins_generates_new_class_slow(
url: str, name: str
) -> None:
"""Test that using the family class dispatch function on an unknown url generates a new family class.
This test is slow because it actually performs the network calls to generate the family classes.
"""
generated_family_class = family.generate_family_class(url, name)
assert issubclass(generated_family_class, Family)
dispatch_family_class = family.family_class_dispatch(url, name)
assert dispatch_family_class == generated_family_class

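The second parametrization performs real network calls, hence the slow marker. Assuming that marker is registered in the project's pytest configuration, the fast suite can be selected programmatically (the test path is illustrative):

import pytest

# Equivalent to `pytest -m "not slow"` on the command line.
pytest.main(["-m", "not slow", "tests/unit/connectors/mediawiki"])
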
@@ -0,0 +1,146 @@
from __future__ import annotations
import datetime
from collections.abc import Iterable
import pytest
import pywikibot
from pytest_mock import MockFixture
from danswer.connectors.mediawiki import wiki
@pytest.fixture
def site() -> pywikibot.Site:
return pywikibot.Site("en", "wikipedia")
def test_pywikibot_timestamp_to_utc_datetime() -> None:
timestamp_without_tzinfo = pywikibot.Timestamp(2023, 12, 27, 15, 38, 49)
timestamp_min_timezone = timestamp_without_tzinfo.astimezone(datetime.timezone.min)
timestamp_max_timezone = timestamp_without_tzinfo.astimezone(datetime.timezone.max)
assert timestamp_min_timezone.tzinfo == datetime.timezone.min
assert timestamp_max_timezone.tzinfo == datetime.timezone.max
for timestamp in [
timestamp_without_tzinfo,
timestamp_min_timezone,
timestamp_max_timezone,
]:
dt = wiki.pywikibot_timestamp_to_utc_datetime(timestamp)
assert dt.tzinfo == datetime.timezone.utc
class MockPage(pywikibot.Page):
def __init__(
self, site: pywikibot.Site, title: str, _has_categories: bool = False
) -> None:
super().__init__(site, title)
self._has_categories = _has_categories
self.header = "This is a header"
self._sections = ["This is a section", "This is another section"]
@property
def _sections_helper(self) -> list[str]:
return [
f"== Section {i} ==\n{section}\n"
for i, section in enumerate(self._sections)
]
@property
def text(self) -> str:
text = self.header + "\n"
for section in self._sections_helper:
text += section
return text
@property
def pageid(self) -> str:
return "1"
def full_url(self) -> str:
return "Test URL"
def categories(
self,
with_sort_key: bool = False,
total: int | None = None,
content: bool = False,
) -> Iterable[pywikibot.Page]:
if not self._has_categories:
return []
return [
MockPage(self.site, "Test Category1"),
MockPage(self.site, "Test Category2"),
]
@property
def latest_revision(self) -> pywikibot.page.Revision:
return pywikibot.page.Revision(
timestamp=pywikibot.Timestamp(2023, 12, 27, 15, 38, 49)
)
def test_get_doc_from_page(site: pywikibot.Site) -> None:
test_page = MockPage(site, "Test Page", _has_categories=True)
doc = wiki.get_doc_from_page(test_page, site, wiki.DocumentSource.MEDIAWIKI)
assert doc.source == wiki.DocumentSource.MEDIAWIKI
assert doc.title == test_page.title()
assert doc.doc_updated_at == wiki.pywikibot_timestamp_to_utc_datetime(
test_page.latest_revision.timestamp
)
assert len(doc.sections) == 3
for section, expected_section in zip(
doc.sections, test_page._sections_helper + [test_page.header]
):
assert (
section.text.strip() == expected_section.strip()
) # Extra whitespace before/after is okay
assert section.link.startswith(test_page.full_url())
assert doc.semantic_identifier == test_page.title()
assert doc.metadata == {
"categories": [category.title() for category in test_page.categories()]
}
assert doc.id == test_page.pageid
def test_mediawiki_connector_recurse_depth() -> None:
"""Test that the recurse_depth parameter is parsed correctly.
-1 should be parsed as `True` (for unbounded recursion)
0 or greater should be parsed as an integer
Negative values less than -1 should raise a ValueError
This is the specification dictated by the `pywikibot` library. We do not need to test behavior beyond this.
"""
hostname = "wikipedia.org"
categories: list[str] = []
pages = ["Test Page"]
connector_name = "Test Connector"
# Recurse depth less than -1 raises ValueError
with pytest.raises(ValueError):
recurse_depth = -2
wiki.MediaWikiConnector(
hostname, categories, pages, recurse_depth, connector_name
)
# Recurse depth of -1 gets parsed as `True`
recurse_depth = -1
connector = wiki.MediaWikiConnector(
hostname, categories, pages, recurse_depth, connector_name
)
assert connector.recurse_depth is True
# Recurse depth of 0 or greater gets parsed as an integer
recurse_depth = 0
connector = wiki.MediaWikiConnector(
hostname, categories, pages, recurse_depth, connector_name
)
assert connector.recurse_depth == recurse_depth
def test_load_from_state_calls_poll_source_with_nones(mocker: MockFixture) -> None:
connector = wiki.MediaWikiConnector("wikipedia.org", [], [], 0, "test")
poll_source = mocker.patch.object(connector, "poll_source")
connector.load_from_state()
poll_source.assert_called_once_with(None, None)

web/public/MediaWiki.svg (new file)

@@ -0,0 +1,43 @@
<svg xmlns="http://www.w3.org/2000/svg" width="100" height="100" viewBox="0 0 100 100">
<defs>
<linearGradient id="gradient" gradientUnits="userSpaceOnUse" x1="0" x2="100" y1="100" y2="0">
<stop offset="0%" stop-color="#0a00b2"/>
<stop offset="50%" stop-color="#ff0000"/>
<stop offset="100%" stop-color="#fffc00"/>
</linearGradient>
<style>
.petal {
opacity: 0.65;
}
.petals {
fill: url(#gradient);
}
</style>
</defs>
<g id="petals" class="petals">
<path class="petal" d="M33.6,14.8a16.938,16.938,0,0,0,3.116,11.142,11.457,11.457,0,0,0,6.858,4.3,3.033,3.033,0,0,0,2.385-.713,11.924,11.924,0,0,0,3.634-10.837c-1.257-7.54-6.19-12.43-12.4-17A32.468,32.468,0,0,0,33.6,14.8Z"/>
<path class="petal" d="M25.117,20.232a16.938,16.938,0,0,0,5.893,9.956,11.457,11.457,0,0,0,7.738,2.381,3.033,3.033,0,0,0,2.119-1.306,11.924,11.924,0,0,0,.705-11.409C38.406,12.9,32.376,9.449,25.2,6.642A32.468,32.468,0,0,0,25.117,20.232Z"/>
<path class="petal" d="M18.329,27.677A16.937,16.937,0,0,0,26.6,35.769a11.457,11.457,0,0,0,8.09.3,3.033,3.033,0,0,0,1.709-1.81,11.924,11.924,0,0,0-2.271-11.2c-4.859-5.9-11.576-7.67-19.237-8.523A32.466,32.466,0,0,0,18.329,27.677Z"/>
<path class="petal" d="M13.7,36.626A16.938,16.938,0,0,0,23.781,42.3a11.457,11.457,0,0,0,7.891-1.807A3.033,3.033,0,0,0,32.854,38.3a11.924,11.924,0,0,0-5.093-10.233c-6.221-4.443-13.167-4.412-20.787-3.254A32.468,32.468,0,0,0,13.7,36.626Z"/>
<path class="petal" d="M11.543,46.468a16.938,16.938,0,0,0,11.208,2.873,11.457,11.457,0,0,0,7.155-3.788,3.034,3.034,0,0,0,.575-2.422,11.924,11.924,0,0,0-7.568-8.566C15.753,31.884,9.052,33.711,1.99,36.8A32.468,32.468,0,0,0,11.543,46.468Z"/>
<path class="petal" d="M12.008,56.532a16.938,16.938,0,0,0,11.569-.126,11.457,11.457,0,0,0,5.931-5.51,3.033,3.033,0,0,0-.072-2.488,11.924,11.924,0,0,0-9.527-6.315C12.3,41.356,6.3,44.855.279,49.669A32.467,32.467,0,0,0,12.008,56.532Z"/>
<path class="petal" d="M15.062,66.134A16.938,16.938,0,0,0,26.2,63.018a11.457,11.457,0,0,0,4.3-6.858,3.033,3.033,0,0,0-.713-2.385,11.924,11.924,0,0,0-10.837-3.634c-7.54,1.257-12.43,6.19-17,12.4A32.468,32.468,0,0,0,15.062,66.134Z"/>
<path class="petal" d="M20.5,74.618a16.938,16.938,0,0,0,9.956-5.893,11.457,11.457,0,0,0,2.381-7.738,3.033,3.033,0,0,0-1.306-2.119,11.925,11.925,0,0,0-11.409-.705c-6.958,3.166-10.4,9.2-13.212,16.376A32.466,32.466,0,0,0,20.5,74.618Z"/>
<path class="petal" d="M27.943,81.406a16.938,16.938,0,0,0,8.092-8.269,11.457,11.457,0,0,0,.3-8.09,3.033,3.033,0,0,0-1.81-1.709,11.924,11.924,0,0,0-11.2,2.271c-5.9,4.859-7.67,11.576-8.523,19.237A32.467,32.467,0,0,0,27.943,81.406Z"/>
<path class="petal" d="M36.891,86.035a16.938,16.938,0,0,0,5.676-10.082,11.457,11.457,0,0,0-1.807-7.891,3.033,3.033,0,0,0-2.191-1.182,11.924,11.924,0,0,0-10.233,5.094c-4.443,6.221-4.412,13.167-3.254,20.787A32.467,32.467,0,0,0,36.891,86.035Z"/>
<path class="petal" d="M46.733,88.191a16.937,16.937,0,0,0,2.873-11.207,11.457,11.457,0,0,0-3.788-7.155,3.033,3.033,0,0,0-2.422-.575,11.924,11.924,0,0,0-8.566,7.568c-2.681,7.159-.854,13.86,2.237,20.921A32.465,32.465,0,0,0,46.733,88.191Z"/>
<path class="petal" d="M56.8,87.726a16.937,16.937,0,0,0-.125-11.569,11.457,11.457,0,0,0-5.511-5.931,3.033,3.033,0,0,0-2.488.072,11.924,11.924,0,0,0-6.315,9.528c-0.737,7.609,2.762,13.609,7.576,19.629A32.466,32.466,0,0,0,56.8,87.726Z"/>
<path class="petal" d="M66.4,84.672A16.938,16.938,0,0,0,63.284,73.53a11.457,11.457,0,0,0-6.858-4.3,3.033,3.033,0,0,0-2.385.713,11.924,11.924,0,0,0-3.634,10.837c1.257,7.54,6.19,12.43,12.4,17A32.468,32.468,0,0,0,66.4,84.672Z"/>
<path class="petal" d="M74.883,79.237a16.937,16.937,0,0,0-5.893-9.956A11.456,11.456,0,0,0,61.252,66.9a3.033,3.033,0,0,0-2.119,1.306,11.924,11.924,0,0,0-.705,11.409c3.166,6.958,9.2,10.4,16.375,13.212A32.468,32.468,0,0,0,74.883,79.237Z"/>
<path class="petal" d="M81.671,71.792A16.938,16.938,0,0,0,73.4,63.7a11.457,11.457,0,0,0-8.09-.3,3.033,3.033,0,0,0-1.708,1.81,11.924,11.924,0,0,0,2.271,11.2c4.859,5.9,11.576,7.67,19.237,8.523A32.466,32.466,0,0,0,81.671,71.792Z"/>
<path class="petal" d="M86.3,62.843a16.938,16.938,0,0,0-10.082-5.676,11.457,11.457,0,0,0-7.891,1.807,3.033,3.033,0,0,0-1.182,2.191A11.924,11.924,0,0,0,72.239,71.4c6.221,4.443,13.167,4.412,20.787,3.254A32.467,32.467,0,0,0,86.3,62.843Z"/>
<path class="petal" d="M88.457,53a16.938,16.938,0,0,0-11.207-2.873,11.457,11.457,0,0,0-7.155,3.788,3.033,3.033,0,0,0-.574,2.422A11.925,11.925,0,0,0,77.088,64.9c7.158,2.681,13.86.854,20.921-2.237A32.467,32.467,0,0,0,88.457,53Z"/>
<path class="petal" d="M87.992,42.936a16.938,16.938,0,0,0-11.569.126,11.457,11.457,0,0,0-5.931,5.511,3.033,3.033,0,0,0,.072,2.488,11.924,11.924,0,0,0,9.527,6.315C87.7,58.113,93.7,54.614,99.721,49.8A32.468,32.468,0,0,0,87.992,42.936Z"/>
<path class="petal" d="M84.938,33.335A16.938,16.938,0,0,0,73.8,36.451a11.457,11.457,0,0,0-4.3,6.858,3.033,3.033,0,0,0,.714,2.385,11.924,11.924,0,0,0,10.837,3.634c7.54-1.257,12.43-6.19,17-12.4A32.466,32.466,0,0,0,84.938,33.335Z"/>
<path class="petal" d="M79.5,24.851a16.938,16.938,0,0,0-9.956,5.893,11.457,11.457,0,0,0-2.381,7.738A3.033,3.033,0,0,0,68.472,40.6a11.924,11.924,0,0,0,11.409.705c6.958-3.166,10.4-9.2,13.212-16.375A32.468,32.468,0,0,0,79.5,24.851Z"/>
<path class="petal" d="M72.057,18.063a16.938,16.938,0,0,0-8.092,8.269,11.457,11.457,0,0,0-.3,8.09,3.033,3.033,0,0,0,1.81,1.709,11.924,11.924,0,0,0,11.2-2.271c5.9-4.859,7.67-11.576,8.523-19.237A32.467,32.467,0,0,0,72.057,18.063Z"/>
<path class="petal" d="M63.109,13.434a16.937,16.937,0,0,0-5.676,10.082,11.457,11.457,0,0,0,1.807,7.891,3.033,3.033,0,0,0,2.191,1.182A11.924,11.924,0,0,0,71.664,27.5c4.443-6.221,4.412-13.167,3.254-20.787A32.466,32.466,0,0,0,63.109,13.434Z"/>
<path class="petal" d="M53.267,11.278a16.937,16.937,0,0,0-2.873,11.207,11.456,11.456,0,0,0,3.788,7.155,3.033,3.033,0,0,0,2.422.575,11.924,11.924,0,0,0,8.566-7.568c2.681-7.159.854-13.86-2.237-20.921A32.466,32.466,0,0,0,53.267,11.278Z"/>
<path class="petal" d="M43.2,11.743a16.938,16.938,0,0,0,.126,11.569,11.457,11.457,0,0,0,5.511,5.931,3.033,3.033,0,0,0,2.488-.072,11.924,11.924,0,0,0,6.315-9.528C58.379,12.034,54.88,6.034,50.066.014A32.468,32.468,0,0,0,43.2,11.743Z"/>
</g>
</svg>


web/public/Wikipedia.svg (new file)

File diff suppressed because one or more lines are too long

@@ -0,0 +1,208 @@
"use client";
import * as Yup from "yup";
import { MediaWikiIcon, TrashIcon } from "@/components/icons/icons";
import {
TextArrayField,
TextArrayFieldBuilder,
TextFormField,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
MediaWikiCredentialJson,
MediaWikiConfig,
ConnectorIndexingStatus,
Credential,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { adminDeleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
import { AdminPageTitle } from "@/components/admin/Title";
import { Card, Text, Title } from "@tremor/react";
const Main = () => {
const { popup, setPopup } = usePopup();
const { mutate } = useSWRConfig();
const {
data: connectorIndexingStatuses,
isLoading: isConnectorIndexingStatusesLoading,
error: isConnectorIndexingStatusesError,
} = useSWR<ConnectorIndexingStatus<any, any>[]>(
"/api/manage/admin/connector/indexing-status",
fetcher
);
const {
data: credentialsData,
isLoading: isCredentialsLoading,
error: isCredentialsError,
refreshCredentials,
} = usePublicCredentials();
if (
(!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
(!credentialsData && isCredentialsLoading)
) {
return <LoadingAnimation text="Loading" />;
}
if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
return <div>Failed to load connectors</div>;
}
if (isCredentialsError || !credentialsData) {
return <div>Failed to load credentials</div>;
}
const mediawikiConnectorIndexingStatuses: ConnectorIndexingStatus<
MediaWikiConfig,
MediaWikiCredentialJson
>[] = connectorIndexingStatuses.filter(
(connectorIndexingStatus) =>
connectorIndexingStatus.connector.source === "mediawiki"
);
// MediaWiki sites do not require credentials for read-only access,
// so any existing credential satisfies the link requirement.
const mediawikiCredential: Credential<MediaWikiCredentialJson> | undefined =
credentialsData.find((credential) => true);
return (
<>
{popup}
{mediawikiConnectorIndexingStatuses.length > 0 && (
<>
<Title className="mb-2 mt-6 ml-auto mr-auto">
MediaWiki indexing status
</Title>
<Text className="mb-2">
The latest page and category changes are fetched every 10 minutes.
</Text>
<div className="mb-2">
<ConnectorsTable<MediaWikiConfig, MediaWikiCredentialJson>
connectorIndexingStatuses={mediawikiConnectorIndexingStatuses}
liveCredential={mediawikiCredential}
getCredential={(credential) => {
return <div></div>;
}}
onCredentialLink={async (connectorId) => {
if (mediawikiCredential) {
await linkCredential(connectorId, mediawikiCredential.id);
mutate("/api/manage/admin/connector/indexing-status");
}
}}
onUpdate={() =>
mutate("/api/manage/admin/connector/indexing-status")
}
/>
</div>
</>
)}
{mediawikiCredential && (
<>
<Card className="mt-4">
<h2 className="font-bold mb-3">Create Connection</h2>
<Text className="mb-4">
Press connect below to start the connection to your MediaWiki
instance.
</Text>
<ConnectorForm<MediaWikiConfig>
nameBuilder={(values) =>
`MediaWikiConnector-${values.connector_name}`
}
ccPairNameBuilder={(values) =>
`MediaWikiConnector-${values.connector_name}`
}
source="mediawiki"
inputType="poll"
formBodyBuilder={(values) => (
<div>
<TextFormField
name="connector_name"
label="Connector Name:"
/>
<TextFormField name="hostname" label="MediaWiki Site URL:" />
<TextFormField
name="language_code"
label="MediaWiki Site Language Code (e.g. 'en', 'sp', etc...):"
/>
{TextArrayFieldBuilder({
name: "pages",
label: "Pages to index:",
subtext:
"Specify 0 or more names of pages to index. Only specify the name of the page, not its url.",
})(values)}
{TextArrayFieldBuilder({
name: "categories",
label: "Categories to index:",
subtext:
"Specify 0 or more names of categories to index. For most MediaWiki sites, these are pages" +
" with a name of the form 'Category: XYZ', that are lists of other pages/categories. Only" +
" specify the name of the category, not its url.",
})(values)}
<TextFormField
name="recurse_depth"
label="Recursion Depth:"
type="number"
subtext="When indexing categories that have sub-categories, this will determine how may levels to index. Specify 0 to only index the category itself (i.e. no recursion). Specify -1 for unlimited recursion depth. Note, that in some rare instances, a category might contain itself in its dependencies, which will cause an infinite loop. Only use -1 if you confident that this will not happen."
/>
</div>
)}
validationSchema={Yup.object().shape({
connector_name: Yup.string().required(
"Please enter a name for your MediaWiki connector."
),
hostname: Yup.string().required(
"Please enter the base URL for your MediaWiki site"
),
language_code: Yup.string().default("en"),
categories: Yup.array().of(
Yup.string().required(
"Please enter categories to index from your MediaWiki site"
)
),
pages: Yup.array().of(
Yup.string().required(
"Please enter pages to index from your MediaWiki site"
)
),
recurse_depth: Yup.number().required(
"Please enter the recursion depth for your MediaWiki site."
),
})}
initialValues={{
connector_name: "",
hostname: "",
language_code: "en",
categories: [],
pages: [],
recurse_depth: 0,
}}
refreshFreq={10 * 60} // 10 minutes
credentialId={mediawikiCredential.id}
/>
</Card>
</>
)}
</>
);
};
export default function Page() {
return (
<div className="mx-auto container">
<div className="mb-4">
<HealthCheckBanner />
</div>
<AdminPageTitle icon={<MediaWikiIcon size={32} />} title="MediaWiki" />
<Main />
</div>
);
}

@@ -0,0 +1,203 @@
"use client";
import * as Yup from "yup";
import { WikipediaIcon, TrashIcon } from "@/components/icons/icons";
import {
TextArrayField,
TextArrayFieldBuilder,
TextFormField,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
WikipediaCredentialJson,
WikipediaConfig,
ConnectorIndexingStatus,
Credential,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { adminDeleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
import { AdminPageTitle } from "@/components/admin/Title";
import { Card, Text, Title } from "@tremor/react";
const Main = () => {
const { popup, setPopup } = usePopup();
const { mutate } = useSWRConfig();
const {
data: connectorIndexingStatuses,
isLoading: isConnectorIndexingStatusesLoading,
error: isConnectorIndexingStatusesError,
} = useSWR<ConnectorIndexingStatus<any, any>[]>(
"/api/manage/admin/connector/indexing-status",
fetcher
);
const {
data: credentialsData,
isLoading: isCredentialsLoading,
error: isCredentialsError,
refreshCredentials,
} = usePublicCredentials();
if (
(!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
(!credentialsData && isCredentialsLoading)
) {
return <LoadingAnimation text="Loading" />;
}
if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
return <div>Failed to load connectors</div>;
}
if (isCredentialsError || !credentialsData) {
return <div>Failed to load credentials</div>;
}
const wikipediaConnectorIndexingStatuses: ConnectorIndexingStatus<
WikipediaConfig,
WikipediaCredentialJson
>[] = connectorIndexingStatuses.filter(
(connectorIndexingStatus) =>
connectorIndexingStatus.connector.source === "wikipedia"
);
// Wikipedia does not require credentials for read-only access,
// so any existing credential satisfies the link requirement.
const wikipediaCredential: Credential<WikipediaCredentialJson> | undefined =
credentialsData.find((credential) => true);
return (
<>
{popup}
{wikipediaConnectorIndexingStatuses.length > 0 && (
<>
<Title className="mb-2 mt-6 ml-auto mr-auto">
Wikipedia indexing status
</Title>
<Text className="mb-2">
The latest page, chapter, book and shelf changes are fetched every
10 minutes.
</Text>
<div className="mb-2">
<ConnectorsTable<WikipediaConfig, WikipediaCredentialJson>
connectorIndexingStatuses={wikipediaConnectorIndexingStatuses}
liveCredential={wikipediaCredential}
getCredential={(credential) => {
return <div></div>;
}}
onCredentialLink={async (connectorId) => {
if (wikipediaCredential) {
await linkCredential(connectorId, wikipediaCredential.id);
mutate("/api/manage/admin/connector/indexing-status");
}
}}
onUpdate={() =>
mutate("/api/manage/admin/connector/indexing-status")
}
/>
</div>
</>
)}
{wikipediaCredential && (
<>
<Card className="mt-4">
<h2 className="font-bold mb-3">Create Connection</h2>
<Text className="mb-4">
Press connect below to start the connection to your Wikipedia
instance.
</Text>
<ConnectorForm<WikipediaConfig>
nameBuilder={(values) =>
`WikipediaConnector-${values.connector_name}`
}
ccPairNameBuilder={(values) =>
`WikipediaConnector-${values.connector_name}`
}
source="wikipedia"
inputType="poll"
formBodyBuilder={(values) => (
<div>
<TextFormField
name="connector_name"
label="Connector Name:"
/>
<TextFormField
name="language_code"
label="Wikipedia Site Language Code (e.g. 'en', 'sp', etc...):"
/>
{TextArrayFieldBuilder({
name: "pages",
label: "Pages to index:",
subtext:
"Specify 0 or more names of pages to index. Only specify the name of the page, not its url.",
})(values)}
{TextArrayFieldBuilder({
name: "categories",
label: "Categories to index:",
subtext:
"Specify 0 or more names of categories to index. These are pages" +
" with a name of the form 'Category: XYZ', that are lists of other pages/categories. Only" +
" specify the name of the category, not its url.",
})(values)}
<TextFormField
name="recurse_depth"
label="Recursion Depth:"
type="number"
subtext="When indexing categories that have sub-categories, this will determine how may levels to index. Specify 0 to only index the category itself (i.e. no recursion). Specify -1 for unlimited recursion depth. Note, that in some rare instances, a category might contain itself in its dependencies, which will cause an infinite loop. Only use -1 if you confident that this will not happen."
/>
</div>
)}
validationSchema={Yup.object().shape({
connector_name: Yup.string().required(
"Please enter a name for your Wikipedia connector."
),
language_code: Yup.string().default("en"),
categories: Yup.array().of(
Yup.string().required(
"Please enter categories to index from your Wikipedia site"
)
),
pages: Yup.array().of(
Yup.string().required(
"Please enter pages to index from your Wikipedia site"
)
),
recurse_depth: Yup.number().required(
"Please enter the recursion depth for your Wikipedia site."
),
})}
initialValues={{
connector_name: "",
language_code: "en",
categories: [],
pages: [],
recurse_depth: 0,
}}
refreshFreq={10 * 60} // 10 minutes
credentialId={wikipediaCredential.id}
/>
</Card>
</>
)}
</>
);
};
export default function Page() {
return (
<div className="mx-auto container">
<div className="mb-4">
<HealthCheckBanner />
</div>
<AdminPageTitle icon={<WikipediaIcon size={32} />} title="Wikipedia" />
<Main />
</div>
);
}

@@ -52,6 +52,8 @@ import document360Icon from "../../../public/Document360.png";
import googleSitesIcon from "../../../public/GoogleSites.png";
import zendeskIcon from "../../../public/Zendesk.svg";
import sharepointIcon from "../../../public/Sharepoint.png";
import mediawikiIcon from "../../../public/MediaWiki.svg";
import wikipediaIcon from "../../../public/Wikipedia.svg";
import discourseIcon from "../../../public/Discourse.png";
import { FaRobot } from "react-icons/fa";
@@ -625,3 +627,27 @@ export const AxeroIcon = ({
<Image src="/Axero.jpeg" alt="Logo" width="96" height="96" />
</div>
);
export const MediaWikiIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<div
style={{ width: `${size}px`, height: `${size}px` }}
className={`w-[${size}px] h-[${size}px] ` + className}
>
<Image src={mediawikiIcon} alt="Logo" width="96" height="96" />
</div>
);
export const WikipediaIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<div
style={{ width: `${size}px`, height: `${size}px` }}
className={`w-[${size}px] h-[${size}px] ` + className}
>
<Image src={wikipediaIcon} alt="Logo" width="96" height="96" />
</div>
);

@@ -25,6 +25,8 @@ import {
SlackIcon,
ZendeskIcon,
ZulipIcon,
MediaWikiIcon,
WikipediaIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import { SourceCategory, SourceMetadata } from "./search/interfaces";
@@ -166,6 +168,16 @@ const SOURCE_METADATA_MAP: SourceMap = {
displayName: "Axero",
category: SourceCategory.AppConnection,
},
wikipedia: {
icon: WikipediaIcon,
displayName: "Wikipedia",
category: SourceCategory.AppConnection,
},
mediawiki: {
icon: MediaWikiIcon,
displayName: "MediaWiki",
category: SourceCategory.AppConnection,
},
requesttracker: {
icon: RequestTrackerIcon,
displayName: "Request Tracker",

@@ -40,7 +40,9 @@ export type ValidSources =
| "sharepoint"
| "zendesk"
| "discourse"
| "axero";
| "axero"
| "wikipedia"
| "mediawiki";
export type ValidInputTypes = "load_state" | "poll" | "event";
export type ValidStatuses =
@@ -179,6 +181,19 @@ export interface GoogleSitesConfig {
export interface ZendeskConfig {}
export interface MediaWikiBaseConfig {
connector_name: string;
language_code: string;
categories?: string[];
pages?: string[];
recurse_depth?: number;
}
export interface MediaWikiConfig extends MediaWikiBaseConfig {
hostname: string;
}
export interface WikipediaConfig extends MediaWikiBaseConfig {}
export interface IndexAttemptSnapshot {
id: number;
status: ValidStatuses | null;
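
For reference, a connector_specific_config payload matching the new interfaces might look like the following (values illustrative; shown as a Python dict for consistency with the backend examples):

# Illustrative MediaWikiConfig payload; WikipediaConfig has the same shape minus "hostname".
mediawiki_config = {
    "connector_name": "fallout",
    "hostname": "https://fallout.fandom.com",
    "language_code": "en",
    "categories": ["Fallout:_New_Vegas_factions"],  # optional
    "pages": ["Fallout: New Vegas"],  # optional
    "recurse_depth": 1,  # optional; -1 means unbounded recursion
}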
@@ -353,6 +368,9 @@ export interface AxeroCredentialJson {
axero_api_token: string;
}
export interface MediaWikiCredentialJson {}
export interface WikipediaCredentialJson extends MediaWikiCredentialJson {}
// DELETION
export interface DeletionAttemptSnapshot {