Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-08 11:58:34 +02:00)
Consolidate File Processing (#1449)
parent: e89c81de76
commit: 546815dc8c
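For orientation, a minimal sketch of the import move this commit applies across the connectors. The module paths are taken from the hunks below; nothing beyond what the diff shows is implied.

# Old helper locations, removed throughout the hunks below:
# from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
# from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic

# New consolidated package introduced by this commit:
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.html_utils import parse_html_page_basic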
@ -1,6 +1,4 @@
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
from celery import Celery # type: ignore
|
||||
@ -10,9 +8,7 @@ from danswer.background.connector_deletion import delete_connector_credential_pa
|
||||
from danswer.background.task_utils import build_celery_task_wrapper
|
||||
from danswer.background.task_utils import name_cc_cleanup_task
|
||||
from danswer.background.task_utils import name_document_set_sync_task
|
||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||
from danswer.configs.app_configs import JOB_TIMEOUT
|
||||
from danswer.connectors.file.utils import file_age_in_hours
|
||||
from danswer.db.connector_credential_pair import get_connector_credential_pair
|
||||
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
|
||||
from danswer.db.document import prepare_to_modify_documents
|
||||
@ -203,21 +199,6 @@ def check_for_document_sets_sync_task() -> None:
|
||||
)
|
||||
|
||||
|
||||
@celery_app.task(name="clean_old_temp_files_task", soft_time_limit=JOB_TIMEOUT)
|
||||
def clean_old_temp_files_task(
|
||||
age_threshold_in_hours: float | int = 24 * 7, # 1 week,
|
||||
base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
|
||||
) -> None:
|
||||
"""Files added via the File connector need to be deleted after ingestion
|
||||
Currently handled asynchronously from the indexing job"""
|
||||
os.makedirs(base_path, exist_ok=True)
|
||||
for file in os.listdir(base_path):
|
||||
full_file_path = Path(base_path) / file
|
||||
if file_age_in_hours(full_file_path) > age_threshold_in_hours:
|
||||
logger.info(f"Cleaning up uploaded file: {full_file_path}")
|
||||
os.remove(full_file_path)
|
||||
|
||||
|
||||
#####
|
||||
# Celery Beat (Periodic Tasks) Settings
|
||||
#####
|
||||
|
@ -148,10 +148,6 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
|
||||
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
|
||||
GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False
|
||||
|
||||
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
||||
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
||||
)
|
||||
|
||||
# TODO these should be available for frontend configuration, via advanced options expandable
|
||||
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
|
||||
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
|
||||
@ -237,10 +233,9 @@ DISABLE_DOCUMENT_CLEANUP = (
|
||||
#####
|
||||
# Miscellaneous
|
||||
#####
|
||||
DYNAMIC_CONFIG_STORE = (
|
||||
os.environ.get("DYNAMIC_CONFIG_STORE") or "PostgresBackedDynamicConfigStore"
|
||||
)
|
||||
DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage")
|
||||
# File based Key Value store no longer used
|
||||
DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore"
|
||||
|
||||
JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default
|
||||
# used to allow the background indexing jobs to use a different embedding
|
||||
# model server than the API server
|
||||
|
@ -8,7 +8,6 @@ from pydantic import BaseModel
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
process_in_batches,
|
||||
)
|
||||
@ -23,6 +22,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
|
||||
|
@ -7,7 +7,6 @@ from typing import Any
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.bookstack.client import BookStackApiClient
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
@ -16,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
|
||||
|
||||
class BookstackConnector(LoadConnector, PollConnector):
|
||||
|
@ -19,7 +19,6 @@ from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.confluence.rate_limit_handler import (
|
||||
make_confluence_call_handle_rate_limit,
|
||||
)
|
||||
from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
@ -28,6 +27,7 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import format_document_soup
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
@ -1,158 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from collections.abc import Iterator
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
import chardet
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfStreamError
|
||||
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def extract_metadata(line: str) -> dict | None:
|
||||
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
|
||||
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
|
||||
|
||||
html_comment_match = re.search(html_comment_pattern, line)
|
||||
hashtag_match = re.search(hashtag_pattern, line)
|
||||
|
||||
if html_comment_match:
|
||||
json_str = html_comment_match.group(1)
|
||||
elif hashtag_match:
|
||||
json_str = hashtag_match.group(1)
|
||||
else:
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads("{" + json_str + "}")
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
||||
try:
|
||||
pdf_reader = PdfReader(file)
|
||||
|
||||
# If marked as encrypted and a password is provided, try to decrypt
|
||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||
decrypt_success = False
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error(f"Unable to decrypt pdf {file_name}")
|
||||
else:
|
||||
logger.info(f"No Password available to to decrypt pdf {file_name}")
|
||||
|
||||
if not decrypt_success:
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return ""
|
||||
|
||||
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||
except PdfStreamError:
|
||||
logger.exception(f"PDF file {file_name} is not a valid PDF")
|
||||
except Exception:
|
||||
logger.exception(f"Failed to read PDF {file_name}")
|
||||
|
||||
# File is still discoverable by title
|
||||
# but the contents are not included as they cannot be parsed
|
||||
return ""
|
||||
|
||||
|
||||
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
||||
"__MACOSX"
|
||||
)
|
||||
|
||||
|
||||
# To include additional metadata in the search index, add a .danswer_metadata.json file
|
||||
# to the zip file. This file should contain a list of objects with the following format:
|
||||
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
|
||||
def load_files_from_zip(
|
||||
zip_file_io: IO,
|
||||
ignore_macos_resource_fork_files: bool = True,
|
||||
ignore_dirs: bool = True,
|
||||
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
|
||||
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
|
||||
zip_metadata = {}
|
||||
try:
|
||||
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
|
||||
with zip_file.open(metadata_file_info, "r") as metadata_file:
|
||||
try:
|
||||
zip_metadata = json.load(metadata_file)
|
||||
if isinstance(zip_metadata, list):
|
||||
# convert list of dicts to dict of dicts
|
||||
zip_metadata = {d["filename"]: d for d in zip_metadata}
|
||||
except json.JSONDecodeError:
|
||||
logger.warn("Unable to load .danswer_metadata.json")
|
||||
except KeyError:
|
||||
logger.info("No .danswer_metadata.json file")
|
||||
|
||||
for file_info in zip_file.infolist():
|
||||
with zip_file.open(file_info.filename, "r") as file:
|
||||
if ignore_dirs and file_info.is_dir():
|
||||
continue
|
||||
|
||||
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
|
||||
file_info.filename
|
||||
):
|
||||
continue
|
||||
yield file_info, file, zip_metadata.get(file_info.filename, {})
|
||||
|
||||
|
||||
def detect_encoding(file: IO[bytes]) -> str:
|
||||
raw_data = file.read(50000)
|
||||
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||
file.seek(0)
|
||||
return encoding
|
||||
|
||||
|
||||
def read_file(
|
||||
file: IO, encoding: str = "utf-8", errors: str = "replace"
|
||||
) -> tuple[str, dict]:
|
||||
metadata = {}
|
||||
file_content_raw = ""
|
||||
for ind, line in enumerate(file):
|
||||
try:
|
||||
line = line.decode(encoding) if isinstance(line, bytes) else line
|
||||
except UnicodeDecodeError:
|
||||
line = (
|
||||
line.decode(encoding, errors=errors)
|
||||
if isinstance(line, bytes)
|
||||
else line
|
||||
)
|
||||
|
||||
if ind == 0:
|
||||
metadata_or_none = extract_metadata(line)
|
||||
if metadata_or_none is not None:
|
||||
metadata = metadata_or_none
|
||||
else:
|
||||
file_content_raw += line
|
||||
else:
|
||||
file_content_raw += line
|
||||
|
||||
return file_content_raw, metadata
|
||||
|
||||
|
||||
def is_text_file_extension(file_name: str) -> bool:
|
||||
extensions = (
|
||||
".txt",
|
||||
".mdx",
|
||||
".md",
|
||||
".conf",
|
||||
".log",
|
||||
".json",
|
||||
".xml",
|
||||
".yaml",
|
||||
".yml",
|
||||
".json",
|
||||
)
|
||||
return any(file_name.endswith(ext) for ext in extensions)
|
@ -10,7 +10,6 @@ from requests import Response
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
@ -20,6 +19,7 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
@ -8,7 +8,6 @@ import requests
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
|
||||
rate_limit_builder,
|
||||
)
|
||||
@ -22,6 +21,7 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
|
||||
# Limitations and Potential Improvements
|
||||
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in
|
||||
|
@ -1,36 +1,30 @@
|
||||
import csv # type: ignore
|
||||
import io
|
||||
import os
|
||||
import zipfile
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from email.parser import Parser as EmailParser
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
import docx2txt # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
|
||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||
from danswer.connectors.file.utils import get_file_ext
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.file_processing.extract_file_text import check_file_ext_is_valid
|
||||
from danswer.file_processing.extract_file_text import detect_encoding
|
||||
from danswer.file_processing.extract_file_text import extract_file_text
|
||||
from danswer.file_processing.extract_file_text import get_file_ext
|
||||
from danswer.file_processing.extract_file_text import is_text_file_extension
|
||||
from danswer.file_processing.extract_file_text import load_files_from_zip
|
||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
||||
from danswer.file_processing.extract_file_text import read_text_file
|
||||
from danswer.file_store.file_store import get_default_file_store
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
@ -54,18 +48,7 @@ def _read_files_and_metadata(
|
||||
file_content, ignore_dirs=True
|
||||
):
|
||||
yield os.path.join(directory_path, file_info.filename), file, metadata
|
||||
elif extension in [
|
||||
".txt",
|
||||
".md",
|
||||
".mdx",
|
||||
".pdf",
|
||||
".docx",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".csv",
|
||||
".eml",
|
||||
".epub",
|
||||
]:
|
||||
elif check_file_ext_is_valid(extension):
|
||||
yield file_name, file_content, metadata
|
||||
else:
|
||||
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
||||
@ -84,65 +67,20 @@ def _process_file(
|
||||
|
||||
file_metadata: dict[str, Any] = {}
|
||||
|
||||
if extension == ".pdf":
|
||||
file_content_raw = read_pdf_file(
|
||||
file=file, file_name=file_name, pdf_pass=pdf_pass
|
||||
if is_text_file_extension(file_name):
|
||||
encoding = detect_encoding(file)
|
||||
file_content_raw, file_metadata = read_text_file(file, encoding=encoding)
|
||||
|
||||
# Using the PDF reader function directly to pass in password cleanly
|
||||
elif extension == ".pdf":
|
||||
file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
|
||||
|
||||
else:
|
||||
file_content_raw = extract_file_text(
|
||||
file_name=file_name,
|
||||
file=file,
|
||||
)
|
||||
|
||||
elif extension == ".docx":
|
||||
file_content_raw = docx2txt.process(file)
|
||||
|
||||
elif extension == ".pptx":
|
||||
presentation = pptx.Presentation(file)
|
||||
text_content = []
|
||||
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||
extracted_text = f"\nSlide {slide_number}:\n"
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
extracted_text += shape.text + "\n"
|
||||
|
||||
text_content.append(extracted_text)
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".xlsx":
|
||||
workbook = openpyxl.load_workbook(file)
|
||||
text_content = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_string = "\n".join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||
)
|
||||
text_content.append(sheet_string)
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".csv":
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
reader = csv.reader(text_file)
|
||||
file_content_raw = "\n".join([",".join(row) for row in reader])
|
||||
|
||||
elif extension == ".eml":
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
parser = EmailParser()
|
||||
message = parser.parse(text_file)
|
||||
|
||||
text_content = []
|
||||
for part in message.walk():
|
||||
if part.get_content_type().startswith("text/plain"):
|
||||
text_content.append(part.get_payload())
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".epub":
|
||||
with zipfile.ZipFile(file) as epub:
|
||||
text_content = []
|
||||
for item in epub.infolist():
|
||||
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
|
||||
with epub.open(item) as html_file:
|
||||
soup = BeautifulSoup(html_file, "html.parser")
|
||||
text_content.append(soup.get_text())
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
else:
|
||||
encoding = detect_encoding(file)
|
||||
file_content_raw, file_metadata = read_file(file, encoding=encoding)
|
||||
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
|
||||
|
||||
# If this is set, we will show this in the UI as the "name" of the file
|
||||
|
@ -1,66 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||
|
||||
_VALID_FILE_EXTENSIONS = [
|
||||
".txt",
|
||||
".zip",
|
||||
".pdf",
|
||||
".md",
|
||||
".mdx",
|
||||
".docx",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".csv",
|
||||
".eml",
|
||||
".epub",
|
||||
]
|
||||
|
||||
|
||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||
_, extension = os.path.splitext(file_path_or_name)
|
||||
return extension
|
||||
|
||||
|
||||
def check_file_ext_is_valid(ext: str) -> bool:
|
||||
return ext in _VALID_FILE_EXTENSIONS
|
||||
|
||||
|
||||
def write_temp_files(
|
||||
files: list[tuple[str, IO[Any]]],
|
||||
base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
|
||||
) -> list[str]:
|
||||
"""Writes temporary files to disk and returns their paths
|
||||
|
||||
NOTE: need to pass in (file_name, File) tuples since the SpooledTemporaryFile
exposed by FastAPI's `UploadFile` class does not include a name.
|
||||
"""
|
||||
file_location = Path(base_path) / str(uuid.uuid4())
|
||||
os.makedirs(file_location, exist_ok=True)
|
||||
|
||||
file_paths: list[str] = []
|
||||
for file_name, file in files:
|
||||
extension = get_file_ext(file_name)
|
||||
if not check_file_ext_is_valid(extension):
|
||||
raise ValueError(
|
||||
f"Invalid file extension for file: '{file_name}'. Must be one of {_VALID_FILE_EXTENSIONS}"
|
||||
)
|
||||
|
||||
file_path = file_location / file_name
|
||||
with open(file_path, "wb") as buffer:
|
||||
# copy file content from uploaded file to the newly created file
|
||||
shutil.copyfileobj(file, buffer)
|
||||
|
||||
file_paths.append(str(file_path.absolute()))
|
||||
|
||||
return file_paths
|
||||
|
||||
|
||||
def file_age_in_hours(filepath: str | Path) -> float:
|
||||
return (time.time() - os.path.getmtime(filepath)) / (60 * 60)
|
@ -1,5 +1,4 @@
|
||||
import io
|
||||
import tempfile
|
||||
from collections.abc import Iterator
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
@ -9,7 +8,6 @@ from itertools import chain
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import docx2txt # type:ignore
|
||||
from google.auth.credentials import Credentials # type: ignore
|
||||
from googleapiclient import discovery # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
@ -21,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import IGNORE_FOR_QA
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
|
||||
from danswer.connectors.google_drive.connector_auth import (
|
||||
get_google_drive_creds_for_authorized_user,
|
||||
@ -42,6 +39,8 @@ from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.extract_file_text import docx_to_text
|
||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
||||
from danswer.utils.batching import batch_generator
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
@ -321,15 +320,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
)
|
||||
elif mime_type == GDriveMimeType.WORD_DOC.value:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
word_stream = io.BytesIO(response)
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp:
|
||||
temp.write(word_stream.getvalue())
|
||||
temp_path = temp.name
|
||||
return docx2txt.process(temp_path)
|
||||
return docx_to_text(file=io.BytesIO(response))
|
||||
elif mime_type == GDriveMimeType.PDF.value:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
|
||||
return file_contents
|
||||
return pdf_to_text(file=io.BytesIO(response))
|
||||
|
||||
return UNSUPPORTED_FILE_TYPE_CONTENT
|
||||
|
||||
|
@ -9,14 +9,14 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.file_processing.extract_file_text import load_files_from_zip
|
||||
from danswer.file_processing.extract_file_text import read_text_file
|
||||
from danswer.file_processing.html_utils import web_html_cleanup
|
||||
from danswer.file_store.file_store import get_default_file_store
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
@ -86,7 +86,7 @@ class GoogleSitesConnector(LoadConnector):
|
||||
if extension != ".html":
|
||||
continue
|
||||
|
||||
file_content, _ = read_file(file_io)
|
||||
file_content, _ = read_text_file(file_io)
|
||||
soup = BeautifulSoup(file_content, "html.parser")
|
||||
|
||||
# get the link out of the navbar
|
||||
|
@ -7,7 +7,6 @@ import requests
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
@ -17,6 +16,7 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
# Potential Improvements
|
||||
|
@ -9,10 +9,6 @@ from requests_oauthlib import OAuth2Session # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.html_utils import (
|
||||
strip_excessive_newlines_and_spaces,
|
||||
)
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
@ -22,6 +18,8 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
from danswer.file_processing.html_utils import strip_excessive_newlines_and_spaces
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
LOOPIO_API_BASE = "https://api.loopio.com/"
|
||||
|
@ -1,22 +1,16 @@
|
||||
import io
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
|
||||
import docx # type: ignore
|
||||
import msal # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from office365.graph_client import GraphClient # type: ignore
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore
|
||||
from office365.onedrive.sites.site import Site # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import is_text_file_extension
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
@ -25,6 +19,12 @@ from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.extract_file_text import docx_to_text
|
||||
from danswer.file_processing.extract_file_text import file_io_to_text
|
||||
from danswer.file_processing.extract_file_text import is_text_file_extension
|
||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
||||
from danswer.file_processing.extract_file_text import pptx_to_text
|
||||
from danswer.file_processing.extract_file_text import xlsx_to_text
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
UNSUPPORTED_FILE_TYPE_CONTENT = "" # idea copied from the google drive side of things
|
||||
@ -35,62 +35,28 @@ logger = setup_logger()
|
||||
|
||||
def get_text_from_xlsx_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content = driveitem_object.get_content().execute_query().value
|
||||
excel_file = io.BytesIO(file_content)
|
||||
workbook = openpyxl.load_workbook(excel_file, read_only=True)
|
||||
|
||||
full_text = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_string = "\n".join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||
)
|
||||
full_text.append(sheet_string)
|
||||
|
||||
return "\n".join(full_text)
|
||||
return xlsx_to_text(file=io.BytesIO(file_content))
|
||||
|
||||
|
||||
def get_text_from_docx_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content = driveitem_object.get_content().execute_query().value
|
||||
full_text = []
|
||||
|
||||
with tempfile.TemporaryDirectory() as local_path:
|
||||
with open(os.path.join(local_path, driveitem_object.name), "wb") as local_file:
|
||||
local_file.write(file_content)
|
||||
doc = docx.Document(local_file.name)
|
||||
for para in doc.paragraphs:
|
||||
full_text.append(para.text)
|
||||
return "\n".join(full_text)
|
||||
return docx_to_text(file=io.BytesIO(file_content))
|
||||
|
||||
|
||||
def get_text_from_pdf_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content = driveitem_object.get_content().execute_query().value
|
||||
file_text = read_pdf_file(
|
||||
file=io.BytesIO(file_content), file_name=driveitem_object.name
|
||||
)
|
||||
file_text = pdf_to_text(file=io.BytesIO(file_content))
|
||||
return file_text
|
||||
|
||||
|
||||
def get_text_from_txt_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content: bytes = driveitem_object.get_content().execute_query().value
|
||||
text_string = file_content.decode("utf-8")
|
||||
return text_string
|
||||
return file_io_to_text(file=io.BytesIO(file_content))
|
||||
|
||||
|
||||
def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content = driveitem_object.get_content().execute_query().value
|
||||
pptx_stream = io.BytesIO(file_content)
|
||||
with tempfile.NamedTemporaryFile() as temp:
|
||||
temp.write(pptx_stream.getvalue())
|
||||
presentation = pptx.Presentation(temp.name)
|
||||
extracted_text = ""
|
||||
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||
extracted_text += f"\nSlide {slide_number}:\n"
|
||||
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
extracted_text += shape.text + "\n"
|
||||
|
||||
return extracted_text
|
||||
return pptx_to_text(file=io.BytesIO(file_content))
|
||||
|
||||
|
||||
class SharepointConnector(LoadConnector, PollConnector):
|
||||
|
@ -22,12 +22,12 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
||||
from danswer.file_processing.html_utils import web_html_cleanup
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@ -247,9 +247,7 @@ class WebConnector(LoadConnector):
|
||||
if current_url.split(".")[-1] == "pdf":
|
||||
# PDF files are not checked for links
|
||||
response = requests.get(current_url)
|
||||
page_text = read_pdf_file(
|
||||
file=io.BytesIO(response.content), file_name=current_url
|
||||
)
|
||||
page_text = pdf_to_text(file=io.BytesIO(response.content))
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
|
@ -5,8 +5,9 @@ from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
time_str_to_utc,
|
||||
)
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
@ -14,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
|
||||
|
||||
def _article_to_document(article: Article) -> Document:
|
||||
|
@ -1,4 +1,3 @@
|
||||
from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
|
||||
from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE
|
||||
from danswer.dynamic_configs.interface import DynamicConfigStore
|
||||
from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore
|
||||
@ -8,7 +7,7 @@ from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore
|
||||
def get_dynamic_config_store() -> DynamicConfigStore:
|
||||
dynamic_config_store_type = DYNAMIC_CONFIG_STORE
|
||||
if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__:
|
||||
return FileSystemBackedDynamicConfigStore(DYNAMIC_CONFIG_DIR_PATH)
|
||||
raise NotImplementedError("File based config store no longer supported")
|
||||
if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__:
|
||||
return PostgresBackedDynamicConfigStore()
|
||||
|
||||
|
@ -2,7 +2,6 @@ import json
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
|
||||
from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
|
||||
from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION
|
||||
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
|
||||
@ -53,7 +52,7 @@ def insert_into_postgres(store_data: dict) -> None:
|
||||
config_store.store(port_once_key, True)
|
||||
|
||||
|
||||
def port_filesystem_to_postgres(directory_path: str = DYNAMIC_CONFIG_DIR_PATH) -> None:
|
||||
def port_filesystem_to_postgres(directory_path: str) -> None:
|
||||
store_data = read_file_system_store(directory_path)
|
||||
insert_into_postgres(store_data)
|
||||
|
||||
|
backend/danswer/file_processing/__init__.py (new file, 0 lines)
backend/danswer/file_processing/extract_file_text.py (new file, 283 lines)
@ -0,0 +1,283 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from collections.abc import Iterator
|
||||
from email.parser import Parser as EmailParser
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
import chardet
|
||||
import docx # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfStreamError
|
||||
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
TEXT_SECTION_SEPARATOR = "\n\n"
|
||||
|
||||
|
||||
PLAIN_TEXT_FILE_EXTENSIONS = [
|
||||
".txt",
|
||||
".md",
|
||||
".mdx",
|
||||
".conf",
|
||||
".log",
|
||||
".json",
|
||||
".csv",
|
||||
".tsv",
|
||||
".xml",
|
||||
".yml",
|
||||
".yaml",
|
||||
]
|
||||
|
||||
|
||||
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
|
||||
".pdf",
|
||||
".docx",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".eml",
|
||||
".epub",
|
||||
]
|
||||
|
||||
|
||||
def is_text_file_extension(file_name: str) -> bool:
|
||||
return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
|
||||
|
||||
|
||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||
_, extension = os.path.splitext(file_path_or_name)
|
||||
return extension
|
||||
|
||||
|
||||
def check_file_ext_is_valid(ext: str) -> bool:
|
||||
return ext in VALID_FILE_EXTENSIONS
|
||||
|
||||
|
||||
def detect_encoding(file: IO[bytes]) -> str:
|
||||
raw_data = file.read(50000)
|
||||
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||
file.seek(0)
|
||||
return encoding
|
||||
|
||||
|
||||
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
||||
"__MACOSX"
|
||||
)
|
||||
|
||||
|
||||
# To include additional metadata in the search index, add a .danswer_metadata.json file
|
||||
# to the zip file. This file should contain a list of objects with the following format:
|
||||
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
|
||||
def load_files_from_zip(
|
||||
zip_file_io: IO,
|
||||
ignore_macos_resource_fork_files: bool = True,
|
||||
ignore_dirs: bool = True,
|
||||
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
|
||||
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
|
||||
zip_metadata = {}
|
||||
try:
|
||||
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
|
||||
with zip_file.open(metadata_file_info, "r") as metadata_file:
|
||||
try:
|
||||
zip_metadata = json.load(metadata_file)
|
||||
if isinstance(zip_metadata, list):
|
||||
# convert list of dicts to dict of dicts
|
||||
zip_metadata = {d["filename"]: d for d in zip_metadata}
|
||||
except json.JSONDecodeError:
|
||||
logger.warn("Unable to load .danswer_metadata.json")
|
||||
except KeyError:
|
||||
logger.info("No .danswer_metadata.json file")
|
||||
|
||||
for file_info in zip_file.infolist():
|
||||
with zip_file.open(file_info.filename, "r") as file:
|
||||
if ignore_dirs and file_info.is_dir():
|
||||
continue
|
||||
|
||||
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
|
||||
file_info.filename
|
||||
):
|
||||
continue
|
||||
yield file_info, file, zip_metadata.get(file_info.filename, {})
|
||||
|
||||
|
||||
def _extract_danswer_metadata(line: str) -> dict | None:
|
||||
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
|
||||
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
|
||||
|
||||
html_comment_match = re.search(html_comment_pattern, line)
|
||||
hashtag_match = re.search(hashtag_pattern, line)
|
||||
|
||||
if html_comment_match:
|
||||
json_str = html_comment_match.group(1)
|
||||
elif hashtag_match:
|
||||
json_str = hashtag_match.group(1)
|
||||
else:
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads("{" + json_str + "}")
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
def read_text_file(
|
||||
file: IO,
|
||||
encoding: str = "utf-8",
|
||||
errors: str = "replace",
|
||||
ignore_danswer_metadata: bool = True,
|
||||
) -> tuple[str, dict]:
|
||||
metadata = {}
|
||||
file_content_raw = ""
|
||||
for ind, line in enumerate(file):
|
||||
try:
|
||||
line = line.decode(encoding) if isinstance(line, bytes) else line
|
||||
except UnicodeDecodeError:
|
||||
line = (
|
||||
line.decode(encoding, errors=errors)
|
||||
if isinstance(line, bytes)
|
||||
else line
|
||||
)
|
||||
|
||||
if ind == 0:
|
||||
metadata_or_none = (
|
||||
None if ignore_danswer_metadata else _extract_danswer_metadata(line)
|
||||
)
|
||||
if metadata_or_none is not None:
|
||||
metadata = metadata_or_none
|
||||
else:
|
||||
file_content_raw += line
|
||||
else:
|
||||
file_content_raw += line
|
||||
|
||||
return file_content_raw, metadata
|
||||
|
||||
|
||||
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
|
||||
try:
|
||||
pdf_reader = PdfReader(file)
|
||||
|
||||
# If marked as encrypted and a password is provided, try to decrypt
|
||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||
decrypt_success = False
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error("Unable to decrypt pdf")
|
||||
else:
|
||||
logger.info("No Password available to to decrypt pdf")
|
||||
|
||||
if not decrypt_success:
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return ""
|
||||
|
||||
return TEXT_SECTION_SEPARATOR.join(
|
||||
page.extract_text() for page in pdf_reader.pages
|
||||
)
|
||||
except PdfStreamError:
|
||||
logger.exception("PDF file is not a valid PDF")
|
||||
except Exception:
|
||||
logger.exception("Failed to read PDF")
|
||||
|
||||
# File is still discoverable by title
|
||||
# but the contents are not included as they cannot be parsed
|
||||
return ""
|
||||
|
||||
|
||||
def docx_to_text(file: IO[Any]) -> str:
|
||||
doc = docx.Document(file)
|
||||
full_text = [para.text for para in doc.paragraphs]
|
||||
return TEXT_SECTION_SEPARATOR.join(full_text)
|
||||
|
||||
|
||||
def pptx_to_text(file: IO[Any]) -> str:
|
||||
presentation = pptx.Presentation(file)
|
||||
text_content = []
|
||||
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||
extracted_text = f"\nSlide {slide_number}:\n"
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
extracted_text += shape.text + "\n"
|
||||
text_content.append(extracted_text)
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def xlsx_to_text(file: IO[Any]) -> str:
|
||||
workbook = openpyxl.load_workbook(file)
|
||||
text_content = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_string = "\n".join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||
)
|
||||
text_content.append(sheet_string)
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
parser = EmailParser()
|
||||
message = parser.parse(text_file)
|
||||
text_content = []
|
||||
for part in message.walk():
|
||||
if part.get_content_type().startswith("text/plain"):
|
||||
text_content.append(part.get_payload())
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def epub_to_text(file: IO[Any]) -> str:
|
||||
with zipfile.ZipFile(file) as epub:
|
||||
text_content = []
|
||||
for item in epub.infolist():
|
||||
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
|
||||
with epub.open(item) as html_file:
|
||||
text_content.append(parse_html_page_basic(html_file))
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def file_io_to_text(file: IO[Any]) -> str:
|
||||
encoding = detect_encoding(file)
|
||||
file_content_raw, _ = read_text_file(file, encoding=encoding)
|
||||
return file_content_raw
|
||||
|
||||
|
||||
def extract_file_text(
|
||||
file_name: str,
|
||||
file: IO[Any],
|
||||
) -> str:
|
||||
extension = get_file_ext(file_name)
|
||||
if not check_file_ext_is_valid(extension):
|
||||
raise RuntimeError("Unprocessable file type")
|
||||
|
||||
if extension == ".pdf":
|
||||
return pdf_to_text(file=file)
|
||||
|
||||
elif extension == ".docx":
|
||||
return docx_to_text(file)
|
||||
|
||||
elif extension == ".pptx":
|
||||
return pptx_to_text(file)
|
||||
|
||||
elif extension == ".xlsx":
|
||||
return xlsx_to_text(file)
|
||||
|
||||
elif extension == ".eml":
|
||||
return eml_to_text(file)
|
||||
|
||||
elif extension == ".epub":
|
||||
return epub_to_text(file)
|
||||
|
||||
else:
|
||||
return file_io_to_text(file)
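The listing above is the full public surface of the new module. As a usage note, here is a minimal caller sketch (hypothetical code, not part of this commit; the wrapper name file_bytes_to_text is invented) that routes a named byte stream through the new helpers, mirroring the dispatch the file connector now does. Every imported function and signature appears in the listing above.

import io

from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import detect_encoding
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import read_text_file


def file_bytes_to_text(file_name: str, file_bytes: bytes) -> str:
    # Reject anything outside VALID_FILE_EXTENSIONS up front
    extension = get_file_ext(file_name)
    if not check_file_ext_is_valid(extension):
        raise RuntimeError("Unprocessable file type")

    file = io.BytesIO(file_bytes)
    if is_text_file_extension(file_name):
        # Plain-text formats: detect the encoding, then read line by line.
        # With the defaults, the DANSWER_METADATA line is left in the content
        # and the returned metadata dict stays empty.
        text, _metadata = read_text_file(file, encoding=detect_encoding(file))
        return text

    # Everything else (.pdf, .docx, .pptx, .xlsx, .eml, .epub) is dispatched by extension
    return extract_file_text(file_name=file_name, file=file)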
|
@ -1,6 +1,7 @@
|
||||
import re
|
||||
from copy import copy
|
||||
from dataclasses import dataclass
|
||||
from typing import IO
|
||||
|
||||
import bs4
|
||||
|
||||
@ -118,7 +119,7 @@ def format_document_soup(
|
||||
return strip_excessive_newlines_and_spaces(text)
|
||||
|
||||
|
||||
def parse_html_page_basic(text: str) -> str:
|
||||
def parse_html_page_basic(text: str | IO[bytes]) -> str:
|
||||
soup = bs4.BeautifulSoup(text, "html.parser")
|
||||
return format_document_soup(soup)
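The widened parse_html_page_basic signature above is what lets zip and epub members be parsed without decoding them first. A small illustration (hypothetical calls, not from the diff): BeautifulSoup accepts either a string or a binary file-like object, so both forms work.

import io

from danswer.file_processing.html_utils import parse_html_page_basic

text_from_str = parse_html_page_basic("<p>Hello <b>world</b></p>")
text_from_stream = parse_html_page_basic(io.BytesIO(b"<p>Hello <b>world</b></p>"))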
|
||||
|
@ -46,8 +46,6 @@ from danswer.db.index_attempt import cancel_indexing_attempts_past_model
|
||||
from danswer.db.index_attempt import expire_index_attempts
|
||||
from danswer.db.swap_index import check_index_swap
|
||||
from danswer.document_index.factory import get_default_document_index
|
||||
from danswer.dynamic_configs.port_configs import port_api_key_to_postgres
|
||||
from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres
|
||||
from danswer.search.retrieval.search_runner import download_nltk_data
|
||||
from danswer.search.search_nlp_models import warm_up_encoders
|
||||
from danswer.server.auth_check import check_router_auth
|
||||
@ -162,18 +160,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
|
||||
f"Using multilingual flow with languages: {MULTILINGUAL_QUERY_EXPANSION}"
|
||||
)
|
||||
|
||||
try:
|
||||
port_filesystem_to_postgres()
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Skipping port of persistent volumes. Maybe these have already been removed?"
|
||||
)
|
||||
|
||||
try:
|
||||
port_api_key_to_postgres()
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to port API keys. Exception: {e}. Continuing...")
|
||||
|
||||
with Session(engine) as db_session:
|
||||
check_index_swap(db_session=db_session)
|
||||
db_embedding_model = get_current_db_embedding_model(db_session)
|
||||
|
@ -30,7 +30,6 @@ llama-index==0.9.45
|
||||
Mako==1.2.4
|
||||
msal==1.26.0
|
||||
nltk==3.8.1
|
||||
docx2txt==0.8
|
||||
Office365-REST-Python-Client==2.5.4
|
||||
oauthlib==3.2.2
|
||||
openai==1.3.5
|
||||
|
@ -49,8 +49,6 @@ def run_jobs(exclude_indexing: bool) -> None:
|
||||
if not exclude_indexing:
|
||||
update_env = os.environ.copy()
|
||||
update_env["PYTHONPATH"] = "."
|
||||
update_env["DYNAMIC_CONFIG_DIR_PATH"] = "./dynamic_config_storage"
|
||||
update_env["FILE_CONNECTOR_TMP_STORAGE_PATH"] = "./dynamic_config_storage"
|
||||
cmd_indexing = ["python", "danswer/background/update.py"]
|
||||
|
||||
indexing_process = subprocess.Popen(
|
||||
|
@ -1,7 +1,7 @@
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||
|
||||
|
||||
class TestQAPostprocessing(unittest.TestCase):
|
||||
|
@ -81,9 +81,6 @@ services:
|
||||
# If set to `true` will enable additional logs about Vespa query performance
|
||||
# (time spent on finding the right docs + time spent fetching summaries from disk)
|
||||
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -181,9 +178,6 @@ services:
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
|
||||
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
|
||||
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -229,6 +223,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -256,6 +251,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -323,11 +319,6 @@ services:
|
||||
|
||||
|
||||
volumes:
|
||||
# local_dynamic_storage is legacy only now
|
||||
local_dynamic_storage:
|
||||
# used to store files uploaded by the user temporarily while we are indexing them
|
||||
# file_connector_tmp_storage is legacy only now
|
||||
file_connector_tmp_storage:
|
||||
db_volume:
|
||||
vespa_volume:
|
||||
# Created by the container itself
|
||||
|
@ -20,6 +20,7 @@ services:
|
||||
# Auth Settings
|
||||
- AUTH_TYPE=${AUTH_TYPE:-disabled}
|
||||
- SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400}
|
||||
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
|
||||
- VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-}
|
||||
- GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-}
|
||||
- GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-}
|
||||
@ -46,6 +47,7 @@ services:
|
||||
- DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
|
||||
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
|
||||
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
|
||||
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
|
||||
# if set, allows for the use of the token budget system
|
||||
- TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
|
||||
# Enables the use of bedrock models
|
||||
@ -79,9 +81,6 @@ services:
|
||||
# If set to `true` will enable additional logs about Vespa query performance
|
||||
# (time spent on finding the right docs + time spent fetching summaries from disk)
|
||||
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -104,6 +103,7 @@ services:
|
||||
- indexing_model_server
|
||||
restart: always
|
||||
environment:
|
||||
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
|
||||
# Gen AI Settings (Needed by DanswerBot)
|
||||
- GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-}
|
||||
- GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-}
|
||||
@ -122,6 +122,7 @@ services:
|
||||
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
|
||||
- GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
|
||||
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
|
||||
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
|
||||
# Query Options
|
||||
- DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
|
||||
- HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
|
||||
@ -177,9 +178,6 @@ services:
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
|
||||
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
|
||||
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -233,6 +231,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -268,6 +267,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -335,11 +335,6 @@ services:
|
||||
|
||||
|
||||
volumes:
|
||||
# local_dynamic_storage is legacy only now
|
||||
local_dynamic_storage:
|
||||
# used to store files uploaded by the user temporarily while we are indexing them
|
||||
# file_connector_tmp_storage is legacy only now
|
||||
file_connector_tmp_storage:
|
||||
db_volume:
|
||||
vespa_volume:
|
||||
# Created by the container itself
|
||||
|
@ -21,9 +21,6 @@ services:
|
||||
- POSTGRES_HOST=relational_db
|
||||
- VESPA_HOST=index
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -53,9 +50,6 @@ services:
|
||||
- VESPA_HOST=index
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
|
||||
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -107,6 +101,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -134,6 +129,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -205,11 +201,6 @@ services:
|
||||
|
||||
|
||||
volumes:
|
||||
# local_dynamic_storage is legacy only now
|
||||
local_dynamic_storage:
|
||||
# used to store files uploaded by the user temporarily while we are indexing them
|
||||
# file_connector_tmp_storage is legacy only now
|
||||
file_connector_tmp_storage:
|
||||
db_volume:
|
||||
vespa_volume:
|
||||
# Created by the container itself
|
||||
|
@ -21,9 +21,6 @@ services:
|
||||
- POSTGRES_HOST=relational_db
|
||||
- VESPA_HOST=index
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -53,9 +50,6 @@ services:
|
||||
- VESPA_HOST=index
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
|
||||
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
logging:
|
||||
@ -87,6 +81,8 @@ services:
|
||||
options:
|
||||
max-size: "50m"
|
||||
max-file: "6"
|
||||
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
restart: always
|
||||
@ -120,6 +116,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -147,6 +144,7 @@ services:
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
volumes:
|
||||
# Not necessary, this is just to reduce download time during startup
|
||||
- model_cache_huggingface:/root/.cache/huggingface/
|
||||
logging:
|
||||
driver: json-file
|
||||
@ -222,11 +220,6 @@ services:
|
||||
|
||||
|
||||
volumes:
|
||||
# local_dynamic_storage is legacy only now
|
||||
local_dynamic_storage:
|
||||
# used to store files uploaded by the user temporarily while we are indexing them
|
||||
# file_connector_tmp_storage is legacy only now
|
||||
file_connector_tmp_storage:
|
||||
db_volume:
|
||||
vespa_volume:
|
||||
# Created by the container itself
|
||||
|
@ -52,10 +52,12 @@ const Main = () => {
|
||||
{filesAreUploading && <Spinner />}
|
||||
<Text className="mb-2">
|
||||
Specify files below, click the <b>Upload</b> button, and the contents of
|
||||
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
|
||||
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlxs</i>, <i>.csv</i>,{" "}
|
||||
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
|
||||
file types) are supported.
|
||||
these files will be searchable via Danswer! Currently supported file
|
||||
types include <i>.txt</i>, <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>,{" "}
|
||||
<i>.xlsx</i>, <i>.csv</i>, <i>.md</i>, <i>.mdx</i>, <i>.conf</i>,{" "}
|
||||
<i>.log</i>, <i>.json</i>, <i>.tsv</i>, <i>.xml</i>, <i>.yml</i>,{" "}
|
||||
<i>.yaml</i>, <i>.eml</i>, <i>.epub</i>, and finally <i>.zip</i> files
|
||||
(containing supported file types).
|
||||
</Text>
|
||||
<Text className="mb-3">
|
||||
<b>NOTE:</b> if the original document is accessible via a link, you can