From 65d5808ea7f864ada441ce0b774d74994d89b009 Mon Sep 17 00:00:00 2001 From: mattboret Date: Tue, 25 Jun 2024 19:41:29 +0200 Subject: [PATCH] Confluence: add pages labels indexation (#1635) * Confluence: add pages labels indexation * changed the default and fixed the dict building * Update app_configs.py * Update connector.py --------- Co-authored-by: Matthieu Boret Co-authored-by: hagen-danswer Co-authored-by: Yuhong Sun --- backend/danswer/configs/app_configs.py | 6 ++++++ backend/danswer/connectors/confluence/connector.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index d12c0d03c..222fddcb1 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -184,6 +184,12 @@ CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES = ( os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", "").lower() == "true" ) +# Save pages labels as Danswer metadata tags +# The reason to skip this would be to reduce the number of calls to Confluence due to rate limit concerns +CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = ( + os.environ.get("CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING", "").lower() == "true" +) + JIRA_CONNECTOR_LABELS_TO_SKIP = [ ignored_tag for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",") diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index 29fe5fbc3..d6814ab97 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -15,6 +15,7 @@ from requests import HTTPError from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP +from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -404,10 +405,11 @@ class ConfluenceConnector(LoadConnector, PollConnector): if time_filter is None or time_filter(last_modified): page_id = page["id"] + if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING: + page_labels = self._fetch_labels(self.confluence_client, page_id) # check disallowed labels if self.labels_to_skip: - page_labels = self._fetch_labels(self.confluence_client, page_id) label_intersection = self.labels_to_skip.intersection(page_labels) if label_intersection: logger.info( @@ -435,6 +437,12 @@ class ConfluenceConnector(LoadConnector, PollConnector): comments_text = self._fetch_comments(self.confluence_client, page_id) page_text += comments_text + doc_metadata: dict[str, str | list[str]] = { + "Wiki Space Name": self.space + } + if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels: + doc_metadata["labels"] = page_labels + doc_batch.append( Document( id=page_url, @@ -445,9 +453,7 @@ class ConfluenceConnector(LoadConnector, PollConnector): primary_owners=[BasicExpertInfo(email=author)] if author else None, - metadata={ - "Wiki Space Name": self.space, - }, + metadata=doc_metadata, ) ) return doc_batch, len(batch)