Confluence: Add config to index only active pages (#1348)

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
This commit is contained in:
mattboret 2024-05-03 18:04:09 +02:00 committed by GitHub
parent 143b50c519
commit 2ff207218e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 13 additions and 1 deletions

View File

@ -3,7 +3,6 @@ import os
from danswer.configs.constants import AuthType
from danswer.configs.constants import DocumentIndexType
#####
# App Configs
#####
@ -167,6 +166,12 @@ CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
)
if ignored_tag
]
# Opt-in switch read from the environment variable of the same name. It is
# enabled only when that variable equals "true" (compared case-insensitively);
# leaving it unset, or setting any other value, keeps it disabled.
# Intended to keep archived Confluence pages out of the index.
CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES = (
    os.getenv("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", default="").lower()
    == "true"
)
JIRA_CONNECTOR_LABELS_TO_SKIP = [
ignored_tag
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")

View File

@ -11,6 +11,7 @@ import bs4
from atlassian import Confluence # type:ignore
from requests import HTTPError
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
@ -219,6 +220,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
self.space,
start=start_ind,
limit=batch_size,
status="current"
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None,
expand="body.storage.value,version",
)
except Exception:
@ -237,6 +241,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
self.space,
start=start_ind + i,
limit=1,
status="current"
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None,
expand="body.storage.value,version",
)
)