fixed confluence breaking on unknown filetypes (#1698)

This commit is contained in:
hagen-danswer
2024-06-25 10:19:01 -07:00
committed by GitHub
parent 50f799edf4
commit e65d9e155d
2 changed files with 41 additions and 31 deletions

View File

@@ -366,7 +366,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if response.status_code == 200: if response.status_code == 200:
extract = extract_file_text( extract = extract_file_text(
attachment["title"], io.BytesIO(response.content) attachment["title"],
io.BytesIO(response.content),
break_on_unprocessable=False,
) )
files_attachment_content.append(extract) files_attachment_content.append(extract)

View File

@@ -3,6 +3,7 @@ import json
import os import os
import re import re
import zipfile import zipfile
from collections.abc import Callable
from collections.abc import Iterator from collections.abc import Iterator
from email.parser import Parser as EmailParser from email.parser import Parser as EmailParser
from pathlib import Path from pathlib import Path
@@ -65,6 +66,16 @@ def check_file_ext_is_valid(ext: str) -> bool:
return ext in VALID_FILE_EXTENSIONS return ext in VALID_FILE_EXTENSIONS
def is_text_file(file: IO[bytes]) -> bool:
    """Return True if the start of ``file`` looks like plain text.

    Reads up to the next 1024 bytes and checks that each one is either a
    common control/whitespace character or a printable byte (0x20-0xFF,
    excluding DEL, 0x7F). An empty read is treated as text.

    The stream position is restored after sniffing whenever the stream
    supports ``tell``/``seek``, so a caller can go on to read the full
    content from the same handle without losing the sniffed bytes.
    """
    # BEL, BS, TAB, LF, FF, CR, ESC plus every printable byte except DEL.
    text_bytes = frozenset({7, 8, 9, 10, 12, 13, 27}) | (
        frozenset(range(0x20, 0x100)) - {0x7F}
    )

    # Remember where we started; streams that cannot report a position
    # (pipes, minimal file-like wrappers) are sniffed without rewinding.
    try:
        start = file.tell()
    except (AttributeError, OSError):
        start = None

    raw_data = file.read(1024)

    if start is not None:
        file.seek(start)

    return all(byte in text_bytes for byte in raw_data)
def detect_encoding(file: IO[bytes]) -> str: def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000) raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8" encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
@@ -261,37 +272,34 @@ def extract_file_text(
file: IO[Any], file: IO[Any],
break_on_unprocessable: bool = True, break_on_unprocessable: bool = True,
) -> str: ) -> str:
if not file_name: extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
return file_io_to_text(file) ".pdf": pdf_to_text,
".docx": docx_to_text,
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,
".eml": eml_to_text,
".epub": epub_to_text,
".html": parse_html_page_basic,
}
extension = get_file_ext(file_name) try:
if not check_file_ext_is_valid(extension): if file_name:
extension = get_file_ext(file_name)
if check_file_ext_is_valid(extension):
return extension_to_function.get(extension, file_io_to_text)(file)
if is_text_file(file):
return file_io_to_text(file)
failure_string = "No file_name or known text encoding"
if break_on_unprocessable: if break_on_unprocessable:
raise RuntimeError(f"Unprocessable file type: {file_name}") raise RuntimeError(failure_string)
else:
logger.warning(f"Unprocessable file type: {file_name}")
return ""
if extension == ".pdf": logger.warning(failure_string)
return pdf_to_text(file=file) return ""
elif extension == ".docx": except Exception as e:
return docx_to_text(file) if break_on_unprocessable:
raise RuntimeError(failure_string)
elif extension == ".pptx": logger.warning(str(e))
return pptx_to_text(file) return ""
elif extension == ".xlsx":
return xlsx_to_text(file)
elif extension == ".eml":
return eml_to_text(file)
elif extension == ".epub":
return epub_to_text(file)
elif extension == ".html":
return parse_html_page_basic(file)
else:
return file_io_to_text(file)