fixed confluence breaking on unknown filetypes (#1698)

This commit is contained in:
hagen-danswer 2024-06-25 10:19:01 -07:00 committed by GitHub
parent 50f799edf4
commit e65d9e155d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 41 additions and 31 deletions

View File

@ -366,7 +366,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if response.status_code == 200:
extract = extract_file_text(
attachment["title"], io.BytesIO(response.content)
attachment["title"],
io.BytesIO(response.content),
break_on_unprocessable=False,
)
files_attachment_content.append(extract)

View File

@ -3,6 +3,7 @@ import json
import os
import re
import zipfile
from collections.abc import Callable
from collections.abc import Iterator
from email.parser import Parser as EmailParser
from pathlib import Path
@ -65,6 +66,16 @@ def check_file_ext_is_valid(ext: str) -> bool:
return ext in VALID_FILE_EXTENSIONS
def is_text_file(file: IO[bytes]) -> bool:
    """
    Heuristically decide whether a binary stream contains plain text.

    Inspects at most the first 1024 bytes of `file` (starting at its current
    position) and returns True when every inspected byte is a printable or
    whitespace character. An empty read counts as text.

    The stream position is restored afterwards whenever the stream supports
    `tell`/`seek`, so that a caller can hand the same handle to a text reader
    and still receive the complete content.
    """
    # Remember where we started; streams that cannot report a position are
    # simply left advanced, which is the best that can be done for them.
    try:
        start = file.tell()
    except (AttributeError, OSError):
        start = None

    raw_data = file.read(1024)

    if start is not None:
        file.seek(start)

    # Allowed bytes: BEL, BS, TAB, LF, FF, CR, ESC plus 0x20-0xFF except DEL.
    text_chars = bytes(
        {7, 8, 9, 10, 12, 13, 27} | (set(range(0x20, 0x100)) - {0x7F})
    )
    # Deleting every allowed byte leaves only the disallowed ones behind.
    return not raw_data.translate(None, text_chars)
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
@ -261,37 +272,34 @@ def extract_file_text(
file: IO[Any],
break_on_unprocessable: bool = True,
) -> str:
if not file_name:
return file_io_to_text(file)
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": pdf_to_text,
".docx": docx_to_text,
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,
".eml": eml_to_text,
".epub": epub_to_text,
".html": parse_html_page_basic,
}
extension = get_file_ext(file_name)
if not check_file_ext_is_valid(extension):
try:
if file_name:
extension = get_file_ext(file_name)
if check_file_ext_is_valid(extension):
return extension_to_function.get(extension, file_io_to_text)(file)
if is_text_file(file):
return file_io_to_text(file)
failure_string = "No file_name or known text encoding"
if break_on_unprocessable:
raise RuntimeError(f"Unprocessable file type: {file_name}")
else:
logger.warning(f"Unprocessable file type: {file_name}")
return ""
raise RuntimeError(failure_string)
if extension == ".pdf":
return pdf_to_text(file=file)
logger.warning(failure_string)
return ""
elif extension == ".docx":
return docx_to_text(file)
elif extension == ".pptx":
return pptx_to_text(file)
elif extension == ".xlsx":
return xlsx_to_text(file)
elif extension == ".eml":
return eml_to_text(file)
elif extension == ".epub":
return epub_to_text(file)
elif extension == ".html":
return parse_html_page_basic(file)
else:
return file_io_to_text(file)
except Exception as e:
if break_on_unprocessable:
raise RuntimeError(failure_string)
logger.warning(str(e))
return ""