mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-02 16:00:34 +02:00
Add slim connector description (#3303)
* added docs example and test * updated docs * needed to make the tests run * updated docs
This commit is contained in:
parent
813445ab59
commit
5385bae100
@ -24,6 +24,8 @@ env:
|
||||
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
|
||||
GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
|
||||
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
|
||||
# Slab
|
||||
SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
|
||||
|
||||
jobs:
|
||||
connectors-check:
|
||||
|
@ -11,11 +11,16 @@ Connectors come in 3 different flows:
|
||||
- Load Connector:
|
||||
- Bulk indexes documents to reflect a point in time. This type of connector generally works by either pulling all
|
||||
documents via a connector's API or loads the documents from some sort of a dump file.
|
||||
- Poll connector:
|
||||
- Poll Connector:
|
||||
- Incrementally updates documents based on a provided time range. It is used by the background job to pull the latest
|
||||
changes and additions since the last round of polling. This connector helps keep the document index up to date
|
||||
without needing to fetch/embed/index every document which would be too slow to do frequently on large sets of
|
||||
documents.
|
||||
- Slim Connector:
|
||||
- This connector should be a lighter weight method of checking all documents in the source to see if they still exist.
|
||||
- This connector should be identical to the Poll or Load Connector except that it only fetches the IDs of the documents, not the documents themselves.
|
||||
- This is used by our pruning job which removes old documents from the index.
|
||||
- The optional start and end datetimes can be ignored.
|
||||
- Event Based connectors:
|
||||
- Connectors that listen to events and update documents accordingly.
|
||||
- Currently not used by the background job, this exists for future design purposes.
|
||||
@ -26,8 +31,14 @@ Refer to [interfaces.py](https://github.com/danswer-ai/danswer/blob/main/backend
|
||||
and this first contributor created Pull Request for a new connector (Shoutout to Dan Brown):
|
||||
[Reference Pull Request](https://github.com/danswer-ai/danswer/pull/139)
|
||||
|
||||
For implementing a Slim Connector, refer to the comments in this PR:
|
||||
[Slim Connector PR](https://github.com/danswer-ai/danswer/pull/3303/files)
|
||||
|
||||
All new connectors should have tests added to the `backend/tests/daily/connectors` directory. Refer to the above PR for an example of adding tests for a new connector.
|
||||
|
||||
|
||||
#### Implementing the new Connector
|
||||
The connector must subclass one or more of LoadConnector, PollConnector, or EventConnector.
|
||||
The connector must subclass one or more of LoadConnector, PollConnector, SlimConnector, or EventConnector.
|
||||
|
||||
The `__init__` should take arguments for configuring what documents the connector will and where it finds those
|
||||
documents. For example, if you have a wiki site, it may include the configuration for the team, topic, folder, etc. of
|
||||
|
@ -12,12 +12,15 @@ from dateutil import parser
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.interfaces import SlimConnector
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.connectors.models import SlimDocument
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
|
||||
@ -28,6 +31,8 @@ logger = setup_logger()
|
||||
SLAB_GRAPHQL_MAX_TRIES = 10
|
||||
SLAB_API_URL = "https://api.slab.com/v1/graphql"
|
||||
|
||||
_SLIM_BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def run_graphql_request(
|
||||
graphql_query: dict, bot_token: str, max_tries: int = SLAB_GRAPHQL_MAX_TRIES
|
||||
@ -158,21 +163,26 @@ def get_slab_url_from_title_id(base_url: str, title: str, page_id: str) -> str:
|
||||
return urljoin(urljoin(base_url, "posts/"), url_id)
|
||||
|
||||
|
||||
class SlabConnector(LoadConnector, PollConnector):
|
||||
class SlabConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
def __init__(
|
||||
self,
|
||||
base_url: str,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
slab_bot_token: str | None = None,
|
||||
) -> None:
|
||||
self.base_url = base_url
|
||||
self.batch_size = batch_size
|
||||
self.slab_bot_token = slab_bot_token
|
||||
self._slab_bot_token: str | None = None
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
self.slab_bot_token = credentials["slab_bot_token"]
|
||||
self._slab_bot_token = credentials["slab_bot_token"]
|
||||
return None
|
||||
|
||||
@property
|
||||
def slab_bot_token(self) -> str:
|
||||
if self._slab_bot_token is None:
|
||||
raise ConnectorMissingCredentialError("Slab")
|
||||
return self._slab_bot_token
|
||||
|
||||
def _iterate_posts(
|
||||
self, time_filter: Callable[[datetime], bool] | None = None
|
||||
) -> GenerateDocumentsOutput:
|
||||
@ -227,3 +237,21 @@ class SlabConnector(LoadConnector, PollConnector):
|
||||
yield from self._iterate_posts(
|
||||
time_filter=lambda t: start_time <= t <= end_time
|
||||
)
|
||||
|
||||
def retrieve_all_slim_documents(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
slim_doc_batch: list[SlimDocument] = []
|
||||
for post_id in get_all_post_ids(self.slab_bot_token):
|
||||
slim_doc_batch.append(
|
||||
SlimDocument(
|
||||
id=post_id,
|
||||
)
|
||||
)
|
||||
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
|
||||
yield slim_doc_batch
|
||||
slim_doc_batch = []
|
||||
if slim_doc_batch:
|
||||
yield slim_doc_batch
|
||||
|
88
backend/tests/daily/connectors/slab/test_slab_connector.py
Normal file
88
backend/tests/daily/connectors/slab/test_slab_connector.py
Normal file
@ -0,0 +1,88 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.slab.connector import SlabConnector
|
||||
|
||||
|
||||
def load_test_data(file_name: str = "test_slab_data.json") -> dict[str, str]:
|
||||
current_dir = Path(__file__).parent
|
||||
with open(current_dir / file_name, "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def slab_connector() -> SlabConnector:
|
||||
connector = SlabConnector(
|
||||
base_url="https://onyx-test.slab.com/",
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"slab_bot_token": os.environ["SLAB_BOT_TOKEN"],
|
||||
}
|
||||
)
|
||||
return connector
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Need a test account with a slab subscription to run this test."
|
||||
"Trial only lasts 14 days."
|
||||
)
|
||||
)
|
||||
def test_slab_connector_basic(slab_connector: SlabConnector) -> None:
|
||||
all_docs: list[Document] = []
|
||||
target_test_doc_id = "jcp6cohu"
|
||||
target_test_doc: Document | None = None
|
||||
for doc_batch in slab_connector.poll_source(0, time.time()):
|
||||
for doc in doc_batch:
|
||||
all_docs.append(doc)
|
||||
if doc.id == target_test_doc_id:
|
||||
target_test_doc = doc
|
||||
|
||||
assert len(all_docs) == 6
|
||||
assert target_test_doc is not None
|
||||
|
||||
desired_test_data = load_test_data()
|
||||
assert (
|
||||
target_test_doc.semantic_identifier == desired_test_data["semantic_identifier"]
|
||||
)
|
||||
assert target_test_doc.source == DocumentSource.SLAB
|
||||
assert target_test_doc.metadata == {}
|
||||
assert target_test_doc.primary_owners is None
|
||||
assert target_test_doc.secondary_owners is None
|
||||
assert target_test_doc.title is None
|
||||
assert target_test_doc.from_ingestion_api is False
|
||||
assert target_test_doc.additional_info is None
|
||||
|
||||
assert len(target_test_doc.sections) == 1
|
||||
section = target_test_doc.sections[0]
|
||||
# Need to replace the weird apostrophe with a normal one
|
||||
assert section.text.replace("\u2019", "'") == desired_test_data["section_text"]
|
||||
assert section.link == desired_test_data["link"]
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Need a test account with a slab subscription to run this test."
|
||||
"Trial only lasts 14 days."
|
||||
)
|
||||
)
|
||||
def test_slab_connector_slim(slab_connector: SlabConnector) -> None:
|
||||
# Get all doc IDs from the full connector
|
||||
all_full_doc_ids = set()
|
||||
for doc_batch in slab_connector.load_from_state():
|
||||
all_full_doc_ids.update([doc.id for doc in doc_batch])
|
||||
|
||||
# Get all doc IDs from the slim connector
|
||||
all_slim_doc_ids = set()
|
||||
for slim_doc_batch in slab_connector.retrieve_all_slim_documents():
|
||||
all_slim_doc_ids.update([doc.id for doc in slim_doc_batch])
|
||||
|
||||
# The set of full doc IDs should be always be a subset of the slim doc IDs
|
||||
assert all_full_doc_ids.issubset(all_slim_doc_ids)
|
5
backend/tests/daily/connectors/slab/test_slab_data.json
Normal file
5
backend/tests/daily/connectors/slab/test_slab_data.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"section_text": "Learn about Posts\nWelcome\nThis is a post, where you can edit, share, and collaborate in real time with your team. We'd love to show you how it works!\nReading and editing\nClick the mode button to toggle between read and edit modes. You can only make changes to a post when editing.\nOrganize your posts\nWhen in edit mode, you can add topics to a post, which will keep it organized for the right 👀 to see.\nSmart mentions\nMentions are references to users, posts, topics and third party tools that show details on hover. Paste in a link for automatic conversion.\nLook back in time\nYou are ready to begin writing. You can always bring back this tour in the help menu.\nGreat job!\nYou are ready to begin writing. You can always bring back this tour in the help menu.\n\n",
|
||||
"link": "https://onyx-test.slab.com/posts/learn-about-posts-jcp6cohu",
|
||||
"semantic_identifier": "Learn about Posts"
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user