Confluence: add pages labels indexation (#1635)

* Confluence: add pages labels indexation

* changed the default and fixed the dict building

* Update app_configs.py

* Update connector.py

---------

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
Co-authored-by: hagen-danswer <hagen@danswer.ai>
Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
This commit is contained in:
mattboret 2024-06-25 19:41:29 +02:00 committed by GitHub
parent 061dab7f37
commit 65d5808ea7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 16 additions and 4 deletions

View File

@ -184,6 +184,12 @@ CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES = (
os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", "").lower() == "true"
)
# Save pages labels as Danswer metadata tags
# The reason to skip this would be to reduce the number of calls to Confluence due to rate limit concerns
CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = (
os.environ.get("CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING", "").lower() == "true"
)
JIRA_CONNECTOR_LABELS_TO_SKIP = [
ignored_tag
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")

View File

@ -15,6 +15,7 @@ from requests import HTTPError
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
@ -404,10 +405,11 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if time_filter is None or time_filter(last_modified):
page_id = page["id"]
if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
page_labels = self._fetch_labels(self.confluence_client, page_id)
# check disallowed labels
if self.labels_to_skip:
page_labels = self._fetch_labels(self.confluence_client, page_id)
label_intersection = self.labels_to_skip.intersection(page_labels)
if label_intersection:
logger.info(
@ -435,6 +437,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text
doc_metadata: dict[str, str | list[str]] = {
"Wiki Space Name": self.space
}
if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels:
doc_metadata["labels"] = page_labels
doc_batch.append(
Document(
id=page_url,
@ -445,9 +453,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
primary_owners=[BasicExpertInfo(email=author)]
if author
else None,
metadata={
"Wiki Space Name": self.space,
},
metadata=doc_metadata,
)
)
return doc_batch, len(batch)