mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-24 15:00:57 +02:00
fixed confluence breaking on unknown filetypes (#1698)
This commit is contained in:
parent
50f799edf4
commit
e65d9e155d
@ -366,7 +366,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
|
||||
if response.status_code == 200:
|
||||
extract = extract_file_text(
|
||||
attachment["title"], io.BytesIO(response.content)
|
||||
attachment["title"],
|
||||
io.BytesIO(response.content),
|
||||
break_on_unprocessable=False,
|
||||
)
|
||||
files_attachment_content.append(extract)
|
||||
|
||||
|
@ -3,6 +3,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from email.parser import Parser as EmailParser
|
||||
from pathlib import Path
|
||||
@ -65,6 +66,16 @@ def check_file_ext_is_valid(ext: str) -> bool:
|
||||
return ext in VALID_FILE_EXTENSIONS
|
||||
|
||||
|
||||
# Byte values treated as "text": a handful of control characters (BEL, BS,
# TAB, LF, FF, CR, ESC) plus everything from 0x20 to 0xFF except DEL (0x7F).
# Built once at import time; a frozenset gives O(1) membership per byte.
_TEXT_BYTE_VALUES = frozenset(
    {7, 8, 9, 10, 12, 13, 27} | (set(range(0x20, 0x100)) - {0x7F})
)

# Number of leading bytes inspected when sniffing a stream.
_TEXT_SNIFF_SIZE = 1024


def is_text_file(file: IO[bytes]) -> bool:
    """
    Guess whether a binary stream holds plain text.

    Reads up to the first 1024 bytes from the stream's current position and
    reports whether every one of them is a printable or whitespace byte. If
    they all are, the caller may treat the content as a 'txt' file.

    The stream position is restored afterwards when the stream is seekable,
    so the caller can hand the same handle to a reader without losing the
    sniffed prefix. A non-seekable stream cannot be rewound and is left
    advanced by the number of bytes read.

    An empty stream yields True (there is no byte that disqualifies it).
    """
    # Remember where we started so the sniff does not consume caller data.
    start_position = file.tell() if file.seekable() else None

    sample = file.read(_TEXT_SNIFF_SIZE)

    if start_position is not None:
        file.seek(start_position)

    return all(byte in _TEXT_BYTE_VALUES for byte in sample)
|
||||
|
||||
|
||||
def detect_encoding(file: IO[bytes]) -> str:
|
||||
raw_data = file.read(50000)
|
||||
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||
@ -261,37 +272,34 @@ def extract_file_text(
|
||||
file: IO[Any],
|
||||
break_on_unprocessable: bool = True,
|
||||
) -> str:
|
||||
if not file_name:
|
||||
return file_io_to_text(file)
|
||||
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||
".pdf": pdf_to_text,
|
||||
".docx": docx_to_text,
|
||||
".pptx": pptx_to_text,
|
||||
".xlsx": xlsx_to_text,
|
||||
".eml": eml_to_text,
|
||||
".epub": epub_to_text,
|
||||
".html": parse_html_page_basic,
|
||||
}
|
||||
|
||||
extension = get_file_ext(file_name)
|
||||
if not check_file_ext_is_valid(extension):
|
||||
try:
|
||||
if file_name:
|
||||
extension = get_file_ext(file_name)
|
||||
if check_file_ext_is_valid(extension):
|
||||
return extension_to_function.get(extension, file_io_to_text)(file)
|
||||
|
||||
if is_text_file(file):
|
||||
return file_io_to_text(file)
|
||||
|
||||
failure_string = "No file_name or known text encoding"
|
||||
if break_on_unprocessable:
|
||||
raise RuntimeError(f"Unprocessable file type: {file_name}")
|
||||
else:
|
||||
logger.warning(f"Unprocessable file type: {file_name}")
|
||||
return ""
|
||||
raise RuntimeError(failure_string)
|
||||
|
||||
if extension == ".pdf":
|
||||
return pdf_to_text(file=file)
|
||||
logger.warning(failure_string)
|
||||
return ""
|
||||
|
||||
elif extension == ".docx":
|
||||
return docx_to_text(file)
|
||||
|
||||
elif extension == ".pptx":
|
||||
return pptx_to_text(file)
|
||||
|
||||
elif extension == ".xlsx":
|
||||
return xlsx_to_text(file)
|
||||
|
||||
elif extension == ".eml":
|
||||
return eml_to_text(file)
|
||||
|
||||
elif extension == ".epub":
|
||||
return epub_to_text(file)
|
||||
|
||||
elif extension == ".html":
|
||||
return parse_html_page_basic(file)
|
||||
|
||||
else:
|
||||
return file_io_to_text(file)
|
||||
except Exception as e:
|
||||
if break_on_unprocessable:
|
||||
raise RuntimeError(failure_string)
|
||||
logger.warning(str(e))
|
||||
return ""
|
||||
|
Loading…
x
Reference in New Issue
Block a user