Continue on some connector failures (#314)
(mirror of https://github.com/danswer-ai/danswer.git, synced 2025-07-12 22:23:01 +02:00)

This commit introduces a CONTINUE_ON_CONNECTOR_FAILURE setting and threads it through the Confluence and Google Drive connectors and the docker compose config, so an indexing run can log and skip certain failures instead of aborting.
App configs:

@@ -157,6 +157,12 @@ DYNAMIC_CONFIG_STORE = os.environ.get(
 DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage")
 # notset, debug, info, warning, error, or critical
 LOG_LEVEL = os.environ.get("LOG_LEVEL", "info")
+# NOTE: Currently only supported in the Confluence and Google Drive connectors +
+# only handles some failures (Confluence = handles API call failures, Google
+# Drive = handles failures pulling files / parsing them)
+CONTINUE_ON_CONNECTOR_FAILURE = os.environ.get(
+    "CONTINUE_ON_CONNECTOR_FAILURE", ""
+).lower() not in ["false", ""]


 #####
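The flag defaults to off: the raw environment string is lowercased and compared against "false" and "". A quick standard-library sketch of that parsing; note that any other non-empty value, even "0", enables it:

def parse_continue_on_failure(raw: str | None) -> bool:
    # mirrors the expression above: unset, "", or any casing of "false" -> False
    return (raw or "").lower() not in ["false", ""]


print(parse_continue_on_failure(None))     # False: variable unset
print(parse_continue_on_failure("FALSE"))  # False: comparison is case-insensitive
print(parse_continue_on_failure("true"))   # True
print(parse_continue_on_failure("0"))      # True: any other non-empty value enables it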
Confluence connector:

@@ -1,5 +1,5 @@
 from collections.abc import Callable
-from collections.abc import Generator
+from collections.abc import Collection
 from datetime import datetime
 from datetime import timezone
 from typing import Any
@@ -7,6 +7,7 @@ from urllib.parse import urlparse

 from atlassian import Confluence  # type:ignore

+from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -16,8 +17,11 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import parse_html_page_basic

+logger = setup_logger()
+
 # Potential Improvements
 # 1. If wiki page instead of space, do a search of all the children of the page instead of index all in the space
 # 2. Include attachments, etc
@@ -48,7 +52,7 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:

 def _comment_dfs(
     comments_str: str,
-    comment_pages: Generator[dict[str, Any], None, None],
+    comment_pages: Collection[dict[str, Any]],
     confluence_client: Confluence,
 ) -> str:
     for comment_page in comment_pages:
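The Generator -> Collection loosening matters because the comment pages can now come from the plain list that the failure path returns (see _fetch_comments below), and a Collection, unlike a generator, supports len() and repeated iteration. A minimal illustration (a sketch, not danswer code):

from collections.abc import Collection
from typing import Any


def consume(pages: Collection[dict[str, Any]]) -> int:
    # iterating twice is safe for a Collection; a generator would be
    # exhausted after the first pass
    for _ in pages:
        pass
    return len(list(pages))


print(consume([]))               # 0 -- the failure-path return value fits
print(consume([{"id": "123"}]))  # 1 -- and so does a normal list of page dicts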
@@ -72,8 +76,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         self,
         wiki_page_url: str,
         batch_size: int = INDEX_BATCH_SIZE,
+        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
     ) -> None:
         self.batch_size = batch_size
+        self.continue_on_failure = continue_on_failure
         self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
         self.confluence_client: Confluence | None = None

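A hypothetical usage sketch (the wiki URL is a made-up placeholder): the flag can now be set per connector instance, falling back to the environment-derived CONTINUE_ON_CONNECTOR_FAILURE default when omitted.

connector = ConfluenceConnector(
    wiki_page_url="https://example.atlassian.net/wiki/spaces/ENG/overview",
    continue_on_failure=True,  # tolerate Confluence API failures instead of aborting
)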
@@ -88,6 +94,57 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         )
         return None

+    def _fetch_pages(
+        self,
+        confluence_client: Confluence,
+        start_ind: int,
+    ) -> Collection[dict[str, Any]]:
+        def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
+            return confluence_client.get_all_pages_from_space(
+                self.space,
+                start=start_ind,
+                limit=batch_size,
+                expand="body.storage.value,version",
+            )
+
+        try:
+            return _fetch(start_ind, self.batch_size)
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise e
+
+        # error checking phase, only reachable if `self.continue_on_failure=True`
+        pages: list[dict[str, Any]] = []
+        for i in range(self.batch_size):
+            try:
+                pages.extend(_fetch(start_ind + i, 1))
+            except:
+                logger.exception(
+                    "Ran into exception when fetching pages from Confluence"
+                )
+
+        return pages
+
+    def _fetch_comments(
+        self, confluence_client: Confluence, page_id: str
+    ) -> Collection[dict[str, Any]]:
+        try:
+            return confluence_client.get_page_child_by_type(
+                page_id,
+                type="comment",
+                start=None,
+                limit=None,
+                expand="body.storage.value",
+            )
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise e
+
+            logger.exception(
+                "Ran into exception when fetching comments from Confluence"
+            )
+            return []
+
     def _get_doc_batch(
         self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
     ) -> tuple[list[Document], int]:
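The shape of _fetch_pages generalizes: try the whole batch once, and only when that fails with failures tolerated, re-fetch items one at a time so a single bad page cannot sink the batch. A self-contained sketch of that pattern, with fetch_batch as a hypothetical stand-in for the Confluence API call:

from collections.abc import Callable, Collection
from typing import Any


def fetch_with_fallback(
    fetch_batch: Callable[[int, int], Collection[dict[str, Any]]],
    start: int,
    batch_size: int,
    continue_on_failure: bool,
) -> Collection[dict[str, Any]]:
    try:
        return fetch_batch(start, batch_size)
    except Exception:
        if not continue_on_failure:
            raise

    # only reachable when failures are tolerated: retry one item at a time,
    # skipping the items that still fail (the connector logs them instead)
    items: list[dict[str, Any]] = []
    for offset in range(batch_size):
        try:
            items.extend(fetch_batch(start + offset, 1))
        except Exception:
            continue
    return items

The cost is up to batch_size extra API calls for a failing batch, but that price is only paid on the error path.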
@@ -96,13 +153,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         if self.confluence_client is None:
             raise ConnectorMissingCredentialError("Confluence")

-        batch = self.confluence_client.get_all_pages_from_space(
-            self.space,
-            start=start_ind,
-            limit=self.batch_size,
-            expand="body.storage.value,version",
-        )
+        batch = self._fetch_pages(self.confluence_client, start_ind)

         for page in batch:
             last_modified_str = page["version"]["when"]
             last_modified = datetime.fromisoformat(last_modified_str)
@@ -112,13 +163,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             page_text = (
                 page.get("title", "") + "\n" + parse_html_page_basic(page_html)
             )
-            comment_pages = self.confluence_client.get_page_child_by_type(
-                page["id"],
-                type="comment",
-                start=None,
-                limit=None,
-                expand="body.storage.value",
-            )
+            comment_pages = self._fetch_comments(self.confluence_client, page["id"])
             comments_text = _comment_dfs("", comment_pages, self.confluence_client)
             page_text += comments_text

Google Drive connector:

@@ -11,6 +11,7 @@ from google.oauth2.credentials import Credentials  # type: ignore
 from googleapiclient import discovery  # type: ignore
 from PyPDF2 import PdfReader

+from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
 from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
@@ -293,11 +294,13 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
         batch_size: int = INDEX_BATCH_SIZE,
         include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
         follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
+        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
     ) -> None:
         self.folder_paths = folder_paths or []
         self.batch_size = batch_size
         self.include_shared = include_shared
         self.follow_shortcuts = follow_shortcuts
+        self.continue_on_failure = continue_on_failure
         self.creds: Credentials | None = None

     @staticmethod
@@ -376,18 +379,28 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
         for files_batch in file_batches:
             doc_batch = []
             for file in files_batch:
+                try:
                     text_contents = extract_text(file, service)
                     full_context = file["name"] + " - " + text_contents

                     doc_batch.append(
                         Document(
                             id=file["webViewLink"],
-                            sections=[Section(link=file["webViewLink"], text=full_context)],
+                            sections=[
+                                Section(link=file["webViewLink"], text=full_context)
+                            ],
                             source=DocumentSource.GOOGLE_DRIVE,
                             semantic_identifier=file["name"],
                             metadata={},
                         )
                     )
+                except Exception as e:
+                    if not self.continue_on_failure:
+                        raise e
+
+                    logger.exception(
+                        "Ran into exception when pulling a file from Google Drive"
+                    )

             yield doc_batch

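Stripped to its essentials, the per-file guard above looks like the sketch below; parse_file is a hypothetical stand-in for extract_text plus Document construction. One unparseable file is logged and skipped rather than aborting the run, unless continue_on_failure is False.

def parse_file(file: dict) -> str:
    # hypothetical stand-in for extract_text + Document construction
    if "name" not in file:
        raise ValueError("unparseable file")
    return file["name"]


def build_batch(files: list[dict], continue_on_failure: bool) -> list[str]:
    batch: list[str] = []
    for file in files:
        try:
            batch.append(parse_file(file))
        except Exception:
            if not continue_on_failure:
                raise
            # log and move on; the failed file is simply absent from the batch
    return batch


print(build_batch([{"name": "a.pdf"}, {}, {"name": "b.pdf"}], True))  # ['a.pdf', 'b.pdf']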
Docker compose:

@@ -68,6 +68,7 @@ services:
       - API_TYPE_OPENAI=${API_TYPE_OPENAI:-}
       - API_VERSION_OPENAI=${API_VERSION_OPENAI:-}
       - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-}
+      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
       - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-}
       - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}
       - LOG_LEVEL=${LOG_LEVEL:-info}
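To turn the behavior on in a docker compose deployment, the variable can be exported in the shell or placed in the .env file that compose reads (a sketch; per the parsing above, any non-empty value other than "false" enables it):

CONTINUE_ON_CONNECTOR_FAILURE=true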