mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 13:15:18 +02:00
fixed confluence breaking on unknown filetypes (#1698)
This commit is contained in:
@@ -366,7 +366,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
extract = extract_file_text(
|
extract = extract_file_text(
|
||||||
attachment["title"], io.BytesIO(response.content)
|
attachment["title"],
|
||||||
|
io.BytesIO(response.content),
|
||||||
|
break_on_unprocessable=False,
|
||||||
)
|
)
|
||||||
files_attachment_content.append(extract)
|
files_attachment_content.append(extract)
|
||||||
|
|
||||||
|
@@ -3,6 +3,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from collections.abc import Callable
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
from email.parser import Parser as EmailParser
|
from email.parser import Parser as EmailParser
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -65,6 +66,16 @@ def check_file_ext_is_valid(ext: str) -> bool:
|
|||||||
return ext in VALID_FILE_EXTENSIONS
|
return ext in VALID_FILE_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def is_text_file(file: IO[bytes]) -> bool:
    """
    Heuristically decide whether a binary stream looks like plain text.

    Peeks at up to 1024 bytes starting from the stream's current position and
    returns True when every byte is a printable or common whitespace/control
    character; in that case the content is treated as a 'txt' file. An empty
    read (e.g. an empty file) also returns True.

    The stream position is restored after the peek when the stream is
    seekable, so a caller can still read the complete content afterwards.
    Without this, the bytes inspected here would be missing from whatever
    reads the stream next.
    """
    # Remember where we started; non-seekable streams cannot be rewound.
    start_pos = file.tell() if file.seekable() else None
    try:
        raw_data = file.read(1024)
    finally:
        if start_pos is not None:
            file.seek(start_pos)

    # Allowed bytes: BEL, BS, TAB, LF, FF, CR, ESC, plus 0x20-0xFF except DEL.
    text_chars = frozenset(
        {7, 8, 9, 10, 12, 13, 27} | (set(range(0x20, 0x100)) - {0x7F})
    )
    return all(c in text_chars for c in raw_data)
|
||||||
|
|
||||||
|
|
||||||
def detect_encoding(file: IO[bytes]) -> str:
|
def detect_encoding(file: IO[bytes]) -> str:
|
||||||
raw_data = file.read(50000)
|
raw_data = file.read(50000)
|
||||||
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||||
@@ -261,37 +272,34 @@ def extract_file_text(
|
|||||||
file: IO[Any],
|
file: IO[Any],
|
||||||
break_on_unprocessable: bool = True,
|
break_on_unprocessable: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
if not file_name:
|
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||||
return file_io_to_text(file)
|
".pdf": pdf_to_text,
|
||||||
|
".docx": docx_to_text,
|
||||||
|
".pptx": pptx_to_text,
|
||||||
|
".xlsx": xlsx_to_text,
|
||||||
|
".eml": eml_to_text,
|
||||||
|
".epub": epub_to_text,
|
||||||
|
".html": parse_html_page_basic,
|
||||||
|
}
|
||||||
|
|
||||||
extension = get_file_ext(file_name)
|
try:
|
||||||
if not check_file_ext_is_valid(extension):
|
if file_name:
|
||||||
|
extension = get_file_ext(file_name)
|
||||||
|
if check_file_ext_is_valid(extension):
|
||||||
|
return extension_to_function.get(extension, file_io_to_text)(file)
|
||||||
|
|
||||||
|
if is_text_file(file):
|
||||||
|
return file_io_to_text(file)
|
||||||
|
|
||||||
|
failure_string = "No file_name or known text encoding"
|
||||||
if break_on_unprocessable:
|
if break_on_unprocessable:
|
||||||
raise RuntimeError(f"Unprocessable file type: {file_name}")
|
raise RuntimeError(failure_string)
|
||||||
else:
|
|
||||||
logger.warning(f"Unprocessable file type: {file_name}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
if extension == ".pdf":
|
logger.warning(failure_string)
|
||||||
return pdf_to_text(file=file)
|
return ""
|
||||||
|
|
||||||
elif extension == ".docx":
|
except Exception as e:
|
||||||
return docx_to_text(file)
|
if break_on_unprocessable:
|
||||||
|
raise RuntimeError(failure_string)
|
||||||
elif extension == ".pptx":
|
logger.warning(str(e))
|
||||||
return pptx_to_text(file)
|
return ""
|
||||||
|
|
||||||
elif extension == ".xlsx":
|
|
||||||
return xlsx_to_text(file)
|
|
||||||
|
|
||||||
elif extension == ".eml":
|
|
||||||
return eml_to_text(file)
|
|
||||||
|
|
||||||
elif extension == ".epub":
|
|
||||||
return epub_to_text(file)
|
|
||||||
|
|
||||||
elif extension == ".html":
|
|
||||||
return parse_html_page_basic(file)
|
|
||||||
|
|
||||||
else:
|
|
||||||
return file_io_to_text(file)
|
|
||||||
|
Reference in New Issue
Block a user