diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 3c682e3a75..29fe5fbc3a 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -366,7 +366,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):

                 if response.status_code == 200:
                     extract = extract_file_text(
-                        attachment["title"], io.BytesIO(response.content)
+                        attachment["title"],
+                        io.BytesIO(response.content),
+                        break_on_unprocessable=False,
                     )
                     files_attachment_content.append(extract)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 14cca7f6f9..4a56eeec40 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -3,6 +3,7 @@ import json
 import os
 import re
 import zipfile
+from collections.abc import Callable
 from collections.abc import Iterator
 from email.parser import Parser as EmailParser
 from pathlib import Path
@@ -65,6 +66,21 @@ def check_file_ext_is_valid(ext: str) -> bool:
     return ext in VALID_FILE_EXTENSIONS


+def is_text_file(file: IO[bytes]) -> bool:
+    """
+    Checks if the first 1024 bytes only contain printable or whitespace characters;
+    if they do, the file is treated as plain text ('txt').
+
+    The stream is rewound to where it was before the check so that the sniffed
+    bytes are still available to whichever reader consumes the file next.
+    """
+    start_position = file.tell()
+    raw_data = file.read(1024)
+    file.seek(start_position)
+    text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
+    return all(c in text_chars for c in raw_data)
+
+
 def detect_encoding(file: IO[bytes]) -> str:
     raw_data = file.read(50000)
     encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
@@ -261,37 +277,46 @@ def extract_file_text(
     file: IO[Any],
     break_on_unprocessable: bool = True,
 ) -> str:
-    if not file_name:
-        return file_io_to_text(file)
+    """
+    Extract the text content of `file`, picking a parser based on the extension
+    of `file_name`.
+
+    Files without a recognized extension are still read as plain text when their
+    first bytes look like text. When the file cannot be processed, a RuntimeError
+    is raised if `break_on_unprocessable` is True; otherwise a warning is logged
+    and an empty string is returned.
+    """
+    extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
+        ".pdf": pdf_to_text,
+        ".docx": docx_to_text,
+        ".pptx": pptx_to_text,
+        ".xlsx": xlsx_to_text,
+        ".eml": eml_to_text,
+        ".epub": epub_to_text,
+        ".html": parse_html_page_basic,
+    }

-    extension = get_file_ext(file_name)
-    if not check_file_ext_is_valid(extension):
+    try:
+        if file_name:
+            extension = get_file_ext(file_name)
+            if check_file_ext_is_valid(extension):
+                return extension_to_function.get(extension, file_io_to_text)(file)
+
+        if is_text_file(file):
+            return file_io_to_text(file)
+
+        failure_string = "No file_name or known text encoding"
         if break_on_unprocessable:
-            raise RuntimeError(f"Unprocessable file type: {file_name}")
-        else:
-            logger.warning(f"Unprocessable file type: {file_name}")
-            return ""
+            raise RuntimeError(failure_string)

-    if extension == ".pdf":
-        return pdf_to_text(file=file)
+        logger.warning(failure_string)
+        return ""

-    elif extension == ".docx":
-        return docx_to_text(file)
-
-    elif extension == ".pptx":
-        return pptx_to_text(file)
-
-    elif extension == ".xlsx":
-        return xlsx_to_text(file)
-
-    elif extension == ".eml":
-        return eml_to_text(file)
-
-    elif extension == ".epub":
-        return epub_to_text(file)
-
-    elif extension == ".html":
-        return parse_html_page_basic(file)
-
-    else:
-        return file_io_to_text(file)
+    except Exception as e:
+        # Built from the caught error rather than a local of the try block, which
+        # is unbound whenever a parser raises before the fallback path is reached.
+        failure_message = f"Failed to process file {file_name or 'Unknown'}: {e}"
+        if break_on_unprocessable:
+            raise RuntimeError(failure_message) from e
+        logger.warning(failure_message)
+        return ""