mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 05:05:34 +02:00
Confluence: add pages labels indexation (#1635)
* Confluence: add pages labels indexation
* changed the default and fixed the dict building
* Update app_configs.py
* Update connector.py

---------

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
Co-authored-by: hagen-danswer <hagen@danswer.ai>
Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
This commit is contained in:
@@ -184,6 +184,12 @@ CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES = (
|
|||||||
os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", "").lower() == "true"
|
os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", "").lower() == "true"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Save page labels as Danswer metadata tags
|
||||||
|
# The reason to skip this would be to reduce the number of calls to Confluence due to rate limit concerns
|
||||||
|
CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = (
|
||||||
|
os.environ.get("CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING", "").lower() == "true"
|
||||||
|
)
|
||||||
|
|
||||||
JIRA_CONNECTOR_LABELS_TO_SKIP = [
|
JIRA_CONNECTOR_LABELS_TO_SKIP = [
|
||||||
ignored_tag
|
ignored_tag
|
||||||
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
|
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
|
||||||
|
@@ -15,6 +15,7 @@ from requests import HTTPError
|
|||||||
|
|
||||||
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
|
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
|
||||||
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
||||||
|
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING
|
||||||
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
@@ -404,10 +405,11 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
if time_filter is None or time_filter(last_modified):
|
if time_filter is None or time_filter(last_modified):
|
||||||
page_id = page["id"]
|
page_id = page["id"]
|
||||||
|
if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
|
||||||
|
page_labels = self._fetch_labels(self.confluence_client, page_id)
|
||||||
|
|
||||||
# check disallowed labels
|
# check disallowed labels
|
||||||
if self.labels_to_skip:
|
if self.labels_to_skip:
|
||||||
page_labels = self._fetch_labels(self.confluence_client, page_id)
|
|
||||||
label_intersection = self.labels_to_skip.intersection(page_labels)
|
label_intersection = self.labels_to_skip.intersection(page_labels)
|
||||||
if label_intersection:
|
if label_intersection:
|
||||||
logger.info(
|
logger.info(
|
||||||
@@ -435,6 +437,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
comments_text = self._fetch_comments(self.confluence_client, page_id)
|
comments_text = self._fetch_comments(self.confluence_client, page_id)
|
||||||
page_text += comments_text
|
page_text += comments_text
|
||||||
|
|
||||||
|
doc_metadata: dict[str, str | list[str]] = {
|
||||||
|
"Wiki Space Name": self.space
|
||||||
|
}
|
||||||
|
if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels:
|
||||||
|
doc_metadata["labels"] = page_labels
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
id=page_url,
|
id=page_url,
|
||||||
@@ -445,9 +453,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
primary_owners=[BasicExpertInfo(email=author)]
|
primary_owners=[BasicExpertInfo(email=author)]
|
||||||
if author
|
if author
|
||||||
else None,
|
else None,
|
||||||
metadata={
|
metadata=doc_metadata,
|
||||||
"Wiki Space Name": self.space,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return doc_batch, len(batch)
|
return doc_batch, len(batch)
|
||||||
|
Reference in New Issue
Block a user