From 546815dc8cf462a8b8aedf729fbd2897804ea5e0 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sat, 11 May 2024 23:11:22 -0700 Subject: [PATCH] Consolidate File Processing (#1449) --- backend/danswer/background/celery/celery.py | 19 -- backend/danswer/configs/app_configs.py | 11 +- backend/danswer/connectors/axero/connector.py | 2 +- .../danswer/connectors/bookstack/connector.py | 2 +- .../connectors/confluence/connector.py | 2 +- .../cross_connector_utils/file_utils.py | 158 ---------- .../danswer/connectors/discourse/connector.py | 2 +- .../connectors/document360/connector.py | 2 +- backend/danswer/connectors/file/connector.py | 104 ++----- backend/danswer/connectors/file/utils.py | 66 ---- .../connectors/google_drive/connector.py | 14 +- .../connectors/google_site/connector.py | 8 +- backend/danswer/connectors/guru/connector.py | 2 +- .../danswer/connectors/loopio/connector.py | 6 +- .../connectors/sharepoint/connector.py | 56 +--- backend/danswer/connectors/web/connector.py | 8 +- .../danswer/connectors/zendesk/connector.py | 6 +- backend/danswer/dynamic_configs/factory.py | 3 +- .../danswer/dynamic_configs/port_configs.py | 3 +- backend/danswer/file_processing/__init__.py | 0 .../file_processing/extract_file_text.py | 283 ++++++++++++++++++ .../html_utils.py | 3 +- backend/danswer/main.py | 14 - backend/requirements/default.txt | 1 - backend/scripts/dev_run_background_jobs.py | 2 - .../cross_connector_utils/test_html_utils.py | 2 +- .../docker_compose/docker-compose.dev.yml | 13 +- .../docker_compose/docker-compose.gpu-dev.yml | 17 +- .../docker-compose.prod-no-letsencrypt.yml | 13 +- .../docker_compose/docker-compose.prod.yml | 15 +- web/src/app/admin/connectors/file/page.tsx | 10 +- 31 files changed, 366 insertions(+), 481 deletions(-) delete mode 100644 backend/danswer/connectors/cross_connector_utils/file_utils.py delete mode 100644 backend/danswer/connectors/file/utils.py create mode 100644 backend/danswer/file_processing/__init__.py create mode 100644 backend/danswer/file_processing/extract_file_text.py rename backend/danswer/{connectors/cross_connector_utils => file_processing}/html_utils.py (98%) diff --git a/backend/danswer/background/celery/celery.py b/backend/danswer/background/celery/celery.py index d66b0cfee..b112b9279 100644 --- a/backend/danswer/background/celery/celery.py +++ b/backend/danswer/background/celery/celery.py @@ -1,6 +1,4 @@ -import os from datetime import timedelta -from pathlib import Path from typing import cast from celery import Celery # type: ignore @@ -10,9 +8,7 @@ from danswer.background.connector_deletion import delete_connector_credential_pa from danswer.background.task_utils import build_celery_task_wrapper from danswer.background.task_utils import name_cc_cleanup_task from danswer.background.task_utils import name_document_set_sync_task -from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH from danswer.configs.app_configs import JOB_TIMEOUT -from danswer.connectors.file.utils import file_age_in_hours from danswer.db.connector_credential_pair import get_connector_credential_pair from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed from danswer.db.document import prepare_to_modify_documents @@ -203,21 +199,6 @@ def check_for_document_sets_sync_task() -> None: ) -@celery_app.task(name="clean_old_temp_files_task", soft_time_limit=JOB_TIMEOUT) -def clean_old_temp_files_task( - age_threshold_in_hours: float | int = 24 * 7, # 1 week, - base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH, -) -> None: - """Files 
added via the File connector need to be deleted after ingestion - Currently handled async of the indexing job""" - os.makedirs(base_path, exist_ok=True) - for file in os.listdir(base_path): - full_file_path = Path(base_path) / file - if file_age_in_hours(full_file_path) > age_threshold_in_hours: - logger.info(f"Cleaning up uploaded file: {full_file_path}") - os.remove(full_file_path) - - ##### # Celery Beat (Periodic Tasks) Settings ##### diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 88575300c..e33962971 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -148,10 +148,6 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False -FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get( - "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage" -) - # TODO these should be available for frontend configuration, via advanced options expandable WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get( "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer" @@ -237,10 +233,9 @@ DISABLE_DOCUMENT_CLEANUP = ( ##### # Miscellaneous ##### -DYNAMIC_CONFIG_STORE = ( - os.environ.get("DYNAMIC_CONFIG_STORE") or "PostgresBackedDynamicConfigStore" -) -DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage") +# File based Key Value store no longer used +DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore" + JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default # used to allow the background indexing jobs to use a different embedding # model server than the API server diff --git a/backend/danswer/connectors/axero/connector.py b/backend/danswer/connectors/axero/connector.py index f82c6b449..a4d5162b6 100644 --- a/backend/danswer/connectors/axero/connector.py +++ b/backend/danswer/connectors/axero/connector.py @@ -8,7 +8,6 @@ from pydantic import BaseModel from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.miscellaneous_utils import ( process_in_batches, ) @@ -23,6 +22,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index 606866b42..f2e692d2c 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -7,7 +7,6 @@ from typing import Any from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.bookstack.client import BookStackApiClient -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -16,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import 
Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic class BookstackConnector(LoadConnector, PollConnector): diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index b0272d44c..a20dd4779 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -19,7 +19,6 @@ from danswer.configs.constants import DocumentSource from danswer.connectors.confluence.rate_limit_handler import ( make_confluence_call_handle_rate_limit, ) -from danswer.connectors.cross_connector_utils.html_utils import format_document_soup from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -28,6 +27,7 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import format_document_soup from danswer.utils.logger import setup_logger logger = setup_logger() diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py deleted file mode 100644 index c7f662d9a..000000000 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ /dev/null @@ -1,158 +0,0 @@ -import json -import os -import re -import zipfile -from collections.abc import Iterator -from typing import Any -from typing import IO - -import chardet -from pypdf import PdfReader -from pypdf.errors import PdfStreamError - -from danswer.utils.logger import setup_logger - - -logger = setup_logger() - - -def extract_metadata(line: str) -> dict | None: - html_comment_pattern = r"" - hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}" - - html_comment_match = re.search(html_comment_pattern, line) - hashtag_match = re.search(hashtag_pattern, line) - - if html_comment_match: - json_str = html_comment_match.group(1) - elif hashtag_match: - json_str = hashtag_match.group(1) - else: - return None - - try: - return json.loads("{" + json_str + "}") - except json.JSONDecodeError: - return None - - -def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: - try: - pdf_reader = PdfReader(file) - - # If marked as encrypted and a password is provided, try to decrypt - if pdf_reader.is_encrypted and pdf_pass is not None: - decrypt_success = False - if pdf_pass is not None: - try: - decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 - except Exception: - logger.error(f"Unable to decrypt pdf {file_name}") - else: - logger.info(f"No Password available to to decrypt pdf {file_name}") - - if not decrypt_success: - # By user request, keep files that are unreadable just so they - # can be discoverable by title. 
- return "" - - return "\n".join(page.extract_text() for page in pdf_reader.pages) - except PdfStreamError: - logger.exception(f"PDF file {file_name} is not a valid PDF") - except Exception: - logger.exception(f"Failed to read PDF {file_name}") - - # File is still discoverable by title - # but the contents are not included as they cannot be parsed - return "" - - -def is_macos_resource_fork_file(file_name: str) -> bool: - return os.path.basename(file_name).startswith("._") and file_name.startswith( - "__MACOSX" - ) - - -# To include additional metadata in the search index, add a .danswer_metadata.json file -# to the zip file. This file should contain a list of objects with the following format: -# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }] -def load_files_from_zip( - zip_file_io: IO, - ignore_macos_resource_fork_files: bool = True, - ignore_dirs: bool = True, -) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]: - with zipfile.ZipFile(zip_file_io, "r") as zip_file: - zip_metadata = {} - try: - metadata_file_info = zip_file.getinfo(".danswer_metadata.json") - with zip_file.open(metadata_file_info, "r") as metadata_file: - try: - zip_metadata = json.load(metadata_file) - if isinstance(zip_metadata, list): - # convert list of dicts to dict of dicts - zip_metadata = {d["filename"]: d for d in zip_metadata} - except json.JSONDecodeError: - logger.warn("Unable to load .danswer_metadata.json") - except KeyError: - logger.info("No .danswer_metadata.json file") - - for file_info in zip_file.infolist(): - with zip_file.open(file_info.filename, "r") as file: - if ignore_dirs and file_info.is_dir(): - continue - - if ignore_macos_resource_fork_files and is_macos_resource_fork_file( - file_info.filename - ): - continue - yield file_info, file, zip_metadata.get(file_info.filename, {}) - - -def detect_encoding(file: IO[bytes]) -> str: - raw_data = file.read(50000) - encoding = chardet.detect(raw_data)["encoding"] or "utf-8" - file.seek(0) - return encoding - - -def read_file( - file: IO, encoding: str = "utf-8", errors: str = "replace" -) -> tuple[str, dict]: - metadata = {} - file_content_raw = "" - for ind, line in enumerate(file): - try: - line = line.decode(encoding) if isinstance(line, bytes) else line - except UnicodeDecodeError: - line = ( - line.decode(encoding, errors=errors) - if isinstance(line, bytes) - else line - ) - - if ind == 0: - metadata_or_none = extract_metadata(line) - if metadata_or_none is not None: - metadata = metadata_or_none - else: - file_content_raw += line - else: - file_content_raw += line - - return file_content_raw, metadata - - -def is_text_file_extension(file_name: str) -> bool: - extensions = ( - ".txt", - ".mdx", - ".md", - ".conf", - ".log", - ".json", - ".xml", - ".yaml", - ".yml", - ".json", - ) - return any(file_name.endswith(ext) for ext in extensions) diff --git a/backend/danswer/connectors/discourse/connector.py b/backend/danswer/connectors/discourse/connector.py index 471fc04ed..47d817d11 100644 --- a/backend/danswer/connectors/discourse/connector.py +++ b/backend/danswer/connectors/discourse/connector.py @@ -10,7 +10,6 @@ from requests import Response from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from 
danswer.connectors.interfaces import GenerateDocumentsOutput @@ -20,6 +19,7 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger logger = setup_logger() diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py index aab6d4dfa..6a9f4ba6a 100644 --- a/backend/danswer/connectors/document360/connector.py +++ b/backend/danswer/connectors/document360/connector.py @@ -8,7 +8,6 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) @@ -22,6 +21,7 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic # Limitations and Potential Improvements # 1. The "Categories themselves contain potentially relevant information" but they're not pulled in diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 37a916382..2e6a9081d 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -1,36 +1,30 @@ -import csv # type: ignore -import io import os -import zipfile from collections.abc import Iterator from datetime import datetime from datetime import timezone -from email.parser import Parser as EmailParser from pathlib import Path from typing import Any from typing import IO -import docx2txt # type: ignore -import openpyxl # type: ignore -import pptx # type: ignore -from bs4 import BeautifulSoup from sqlalchemy.orm import Session from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.file_utils import detect_encoding -from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip -from danswer.connectors.cross_connector_utils.file_utils import read_file -from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc -from danswer.connectors.file.utils import check_file_ext_is_valid -from danswer.connectors.file.utils import get_file_ext from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.db.engine import get_sqlalchemy_engine +from danswer.file_processing.extract_file_text import check_file_ext_is_valid +from danswer.file_processing.extract_file_text import detect_encoding +from danswer.file_processing.extract_file_text import extract_file_text +from danswer.file_processing.extract_file_text import get_file_ext +from danswer.file_processing.extract_file_text import is_text_file_extension +from danswer.file_processing.extract_file_text 
import load_files_from_zip +from danswer.file_processing.extract_file_text import pdf_to_text +from danswer.file_processing.extract_file_text import read_text_file from danswer.file_store.file_store import get_default_file_store from danswer.utils.logger import setup_logger @@ -54,18 +48,7 @@ def _read_files_and_metadata( file_content, ignore_dirs=True ): yield os.path.join(directory_path, file_info.filename), file, metadata - elif extension in [ - ".txt", - ".md", - ".mdx", - ".pdf", - ".docx", - ".pptx", - ".xlsx", - ".csv", - ".eml", - ".epub", - ]: + elif check_file_ext_is_valid(extension): yield file_name, file_content, metadata else: logger.warning(f"Skipping file '{file_name}' with extension '{extension}'") @@ -84,65 +67,20 @@ def _process_file( file_metadata: dict[str, Any] = {} - if extension == ".pdf": - file_content_raw = read_pdf_file( - file=file, file_name=file_name, pdf_pass=pdf_pass + if is_text_file_extension(file_name): + encoding = detect_encoding(file) + file_content_raw, file_metadata = read_text_file(file, encoding=encoding) + + # Using the PDF reader function directly to pass in password cleanly + elif extension == ".pdf": + file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass) + + else: + file_content_raw = extract_file_text( + file_name=file_name, + file=file, ) - elif extension == ".docx": - file_content_raw = docx2txt.process(file) - - elif extension == ".pptx": - presentation = pptx.Presentation(file) - text_content = [] - for slide_number, slide in enumerate(presentation.slides, start=1): - extracted_text = f"\nSlide {slide_number}:\n" - for shape in slide.shapes: - if hasattr(shape, "text"): - extracted_text += shape.text + "\n" - - text_content.append(extracted_text) - file_content_raw = "\n\n".join(text_content) - - elif extension == ".xlsx": - workbook = openpyxl.load_workbook(file) - text_content = [] - for sheet in workbook.worksheets: - sheet_string = "\n".join( - ",".join(map(str, row)) - for row in sheet.iter_rows(min_row=1, values_only=True) - ) - text_content.append(sheet_string) - file_content_raw = "\n\n".join(text_content) - - elif extension == ".csv": - text_file = io.TextIOWrapper(file, encoding=detect_encoding(file)) - reader = csv.reader(text_file) - file_content_raw = "\n".join([",".join(row) for row in reader]) - - elif extension == ".eml": - text_file = io.TextIOWrapper(file, encoding=detect_encoding(file)) - parser = EmailParser() - message = parser.parse(text_file) - - text_content = [] - for part in message.walk(): - if part.get_content_type().startswith("text/plain"): - text_content.append(part.get_payload()) - file_content_raw = "\n\n".join(text_content) - - elif extension == ".epub": - with zipfile.ZipFile(file) as epub: - text_content = [] - for item in epub.infolist(): - if item.filename.endswith(".xhtml") or item.filename.endswith(".html"): - with epub.open(item) as html_file: - soup = BeautifulSoup(html_file, "html.parser") - text_content.append(soup.get_text()) - file_content_raw = "\n\n".join(text_content) - else: - encoding = detect_encoding(file) - file_content_raw, file_metadata = read_file(file, encoding=encoding) all_metadata = {**metadata, **file_metadata} if metadata else file_metadata # If this is set, we will show this in the UI as the "name" of the file diff --git a/backend/danswer/connectors/file/utils.py b/backend/danswer/connectors/file/utils.py deleted file mode 100644 index e5f6d61a9..000000000 --- a/backend/danswer/connectors/file/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import shutil -import 
time -import uuid -from pathlib import Path -from typing import Any -from typing import IO - -from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH - -_VALID_FILE_EXTENSIONS = [ - ".txt", - ".zip", - ".pdf", - ".md", - ".mdx", - ".docx", - ".pptx", - ".xlsx", - ".csv", - ".eml", - ".epub", -] - - -def get_file_ext(file_path_or_name: str | Path) -> str: - _, extension = os.path.splitext(file_path_or_name) - return extension - - -def check_file_ext_is_valid(ext: str) -> bool: - return ext in _VALID_FILE_EXTENSIONS - - -def write_temp_files( - files: list[tuple[str, IO[Any]]], - base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH, -) -> list[str]: - """Writes temporary files to disk and returns their paths - - NOTE: need to pass in (file_name, File) tuples since FastAPI's `UploadFile` class - exposed SpooledTemporaryFile does not include a name. - """ - file_location = Path(base_path) / str(uuid.uuid4()) - os.makedirs(file_location, exist_ok=True) - - file_paths: list[str] = [] - for file_name, file in files: - extension = get_file_ext(file_name) - if not check_file_ext_is_valid(extension): - raise ValueError( - f"Invalid file extension for file: '{file_name}'. Must be one of {_VALID_FILE_EXTENSIONS}" - ) - - file_path = file_location / file_name - with open(file_path, "wb") as buffer: - # copy file content from uploaded file to the newly created file - shutil.copyfileobj(file, buffer) - - file_paths.append(str(file_path.absolute())) - - return file_paths - - -def file_age_in_hours(filepath: str | Path) -> float: - return (time.time() - os.path.getmtime(filepath)) / (60 * 60) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index ea7ef60db..73a541267 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -1,5 +1,4 @@ import io -import tempfile from collections.abc import Iterator from collections.abc import Sequence from datetime import datetime @@ -9,7 +8,6 @@ from itertools import chain from typing import Any from typing import cast -import docx2txt # type:ignore from google.auth.credentials import Credentials # type: ignore from googleapiclient import discovery # type: ignore from googleapiclient.errors import HttpError # type: ignore @@ -21,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.configs.constants import IGNORE_FOR_QA -from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.google_drive.connector_auth import ( get_google_drive_creds_for_authorized_user, @@ -42,6 +39,8 @@ from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.extract_file_text import docx_to_text +from danswer.file_processing.extract_file_text import pdf_to_text from danswer.utils.batching import batch_generator from danswer.utils.logger import setup_logger @@ -321,15 +320,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: ) elif mime_type == GDriveMimeType.WORD_DOC.value: response = service.files().get_media(fileId=file["id"]).execute() - word_stream = 
io.BytesIO(response) - with tempfile.NamedTemporaryFile(delete=False) as temp: - temp.write(word_stream.getvalue()) - temp_path = temp.name - return docx2txt.process(temp_path) + return docx_to_text(file=io.BytesIO(response)) elif mime_type == GDriveMimeType.PDF.value: response = service.files().get_media(fileId=file["id"]).execute() - file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"]) - return file_contents + return pdf_to_text(file=io.BytesIO(response)) return UNSUPPORTED_FILE_TYPE_CONTENT diff --git a/backend/danswer/connectors/google_site/connector.py b/backend/danswer/connectors/google_site/connector.py index b963c7c02..9cfcf224e 100644 --- a/backend/danswer/connectors/google_site/connector.py +++ b/backend/danswer/connectors/google_site/connector.py @@ -9,14 +9,14 @@ from sqlalchemy.orm import Session from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip -from danswer.connectors.cross_connector_utils.file_utils import read_file -from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.db.engine import get_sqlalchemy_engine +from danswer.file_processing.extract_file_text import load_files_from_zip +from danswer.file_processing.extract_file_text import read_text_file +from danswer.file_processing.html_utils import web_html_cleanup from danswer.file_store.file_store import get_default_file_store from danswer.utils.logger import setup_logger @@ -86,7 +86,7 @@ class GoogleSitesConnector(LoadConnector): if extension != ".html": continue - file_content, _ = read_file(file_io) + file_content, _ = read_text_file(file_io) soup = BeautifulSoup(file_content, "html.parser") # get the link out of the navbar diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py index b51b1d4e8..3c3c873b1 100644 --- a/backend/danswer/connectors/guru/connector.py +++ b/backend/danswer/connectors/guru/connector.py @@ -7,7 +7,6 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -17,6 +16,7 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger # Potential Improvements diff --git a/backend/danswer/connectors/loopio/connector.py b/backend/danswer/connectors/loopio/connector.py index 503d6bd3f..e10bed876 100644 --- a/backend/danswer/connectors/loopio/connector.py +++ b/backend/danswer/connectors/loopio/connector.py @@ -9,10 +9,6 @@ from requests_oauthlib import OAuth2Session # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from 
danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic -from danswer.connectors.cross_connector_utils.html_utils import ( - strip_excessive_newlines_and_spaces, -) from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -22,6 +18,8 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.file_processing.html_utils import strip_excessive_newlines_and_spaces from danswer.utils.logger import setup_logger LOOPIO_API_BASE = "https://api.loopio.com/" diff --git a/backend/danswer/connectors/sharepoint/connector.py b/backend/danswer/connectors/sharepoint/connector.py index 61502011a..d0e98f2fd 100644 --- a/backend/danswer/connectors/sharepoint/connector.py +++ b/backend/danswer/connectors/sharepoint/connector.py @@ -1,22 +1,16 @@ import io import os -import tempfile from datetime import datetime from datetime import timezone from typing import Any -import docx # type: ignore import msal # type: ignore -import openpyxl # type: ignore -import pptx # type: ignore from office365.graph_client import GraphClient # type: ignore from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore from office365.onedrive.sites.site import Site # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.file_utils import is_text_file_extension -from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -25,6 +19,12 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.extract_file_text import docx_to_text +from danswer.file_processing.extract_file_text import file_io_to_text +from danswer.file_processing.extract_file_text import is_text_file_extension +from danswer.file_processing.extract_file_text import pdf_to_text +from danswer.file_processing.extract_file_text import pptx_to_text +from danswer.file_processing.extract_file_text import xlsx_to_text from danswer.utils.logger import setup_logger UNSUPPORTED_FILE_TYPE_CONTENT = "" # idea copied from the google drive side of things @@ -35,62 +35,28 @@ logger = setup_logger() def get_text_from_xlsx_driveitem(driveitem_object: DriveItem) -> str: file_content = driveitem_object.get_content().execute_query().value - excel_file = io.BytesIO(file_content) - workbook = openpyxl.load_workbook(excel_file, read_only=True) - - full_text = [] - for sheet in workbook.worksheets: - sheet_string = "\n".join( - ",".join(map(str, row)) - for row in sheet.iter_rows(min_row=1, values_only=True) - ) - full_text.append(sheet_string) - - return "\n".join(full_text) + return xlsx_to_text(file=io.BytesIO(file_content)) def get_text_from_docx_driveitem(driveitem_object: DriveItem) -> str: file_content = driveitem_object.get_content().execute_query().value 
- full_text = [] - - with tempfile.TemporaryDirectory() as local_path: - with open(os.path.join(local_path, driveitem_object.name), "wb") as local_file: - local_file.write(file_content) - doc = docx.Document(local_file.name) - for para in doc.paragraphs: - full_text.append(para.text) - return "\n".join(full_text) + return docx_to_text(file=io.BytesIO(file_content)) def get_text_from_pdf_driveitem(driveitem_object: DriveItem) -> str: file_content = driveitem_object.get_content().execute_query().value - file_text = read_pdf_file( - file=io.BytesIO(file_content), file_name=driveitem_object.name - ) + file_text = pdf_to_text(file=io.BytesIO(file_content)) return file_text def get_text_from_txt_driveitem(driveitem_object: DriveItem) -> str: file_content: bytes = driveitem_object.get_content().execute_query().value - text_string = file_content.decode("utf-8") - return text_string + return file_io_to_text(file=io.BytesIO(file_content)) def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str: file_content = driveitem_object.get_content().execute_query().value - pptx_stream = io.BytesIO(file_content) - with tempfile.NamedTemporaryFile() as temp: - temp.write(pptx_stream.getvalue()) - presentation = pptx.Presentation(temp.name) - extracted_text = "" - for slide_number, slide in enumerate(presentation.slides, start=1): - extracted_text += f"\nSlide {slide_number}:\n" - - for shape in slide.shapes: - if hasattr(shape, "text"): - extracted_text += shape.text + "\n" - - return extracted_text + return pptx_to_text(file=io.BytesIO(file_content)) class SharepointConnector(LoadConnector, PollConnector): diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index 355b8392d..1a0c7e39d 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -22,12 +22,12 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file -from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.extract_file_text import pdf_to_text +from danswer.file_processing.html_utils import web_html_cleanup from danswer.utils.logger import setup_logger logger = setup_logger() @@ -247,9 +247,7 @@ class WebConnector(LoadConnector): if current_url.split(".")[-1] == "pdf": # PDF files are not checked for links response = requests.get(current_url) - page_text = read_pdf_file( - file=io.BytesIO(response.content), file_name=current_url - ) + page_text = pdf_to_text(file=io.BytesIO(response.content)) doc_batch.append( Document( diff --git a/backend/danswer/connectors/zendesk/connector.py b/backend/danswer/connectors/zendesk/connector.py index da4c8e5b9..fc9b703c6 100644 --- a/backend/danswer/connectors/zendesk/connector.py +++ b/backend/danswer/connectors/zendesk/connector.py @@ -5,8 +5,9 @@ from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from 
danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic -from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc +from danswer.connectors.cross_connector_utils.miscellaneous_utils import ( + time_str_to_utc, +) from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -14,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic def _article_to_document(article: Article) -> Document: diff --git a/backend/danswer/dynamic_configs/factory.py b/backend/danswer/dynamic_configs/factory.py index a82bc315c..44b6e096b 100644 --- a/backend/danswer/dynamic_configs/factory.py +++ b/backend/danswer/dynamic_configs/factory.py @@ -1,4 +1,3 @@ -from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE from danswer.dynamic_configs.interface import DynamicConfigStore from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore @@ -8,7 +7,7 @@ from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore def get_dynamic_config_store() -> DynamicConfigStore: dynamic_config_store_type = DYNAMIC_CONFIG_STORE if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__: - return FileSystemBackedDynamicConfigStore(DYNAMIC_CONFIG_DIR_PATH) + raise NotImplementedError("File based config store no longer supported") if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__: return PostgresBackedDynamicConfigStore() diff --git a/backend/danswer/dynamic_configs/port_configs.py b/backend/danswer/dynamic_configs/port_configs.py index b28615a62..809c06cbf 100644 --- a/backend/danswer/dynamic_configs/port_configs.py +++ b/backend/danswer/dynamic_configs/port_configs.py @@ -2,7 +2,6 @@ import json from pathlib import Path from typing import cast -from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION from danswer.configs.model_configs import GEN_AI_API_ENDPOINT @@ -53,7 +52,7 @@ def insert_into_postgres(store_data: dict) -> None: config_store.store(port_once_key, True) -def port_filesystem_to_postgres(directory_path: str = DYNAMIC_CONFIG_DIR_PATH) -> None: +def port_filesystem_to_postgres(directory_path: str) -> None: store_data = read_file_system_store(directory_path) insert_into_postgres(store_data) diff --git a/backend/danswer/file_processing/__init__.py b/backend/danswer/file_processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py new file mode 100644 index 000000000..05989a539 --- /dev/null +++ b/backend/danswer/file_processing/extract_file_text.py @@ -0,0 +1,283 @@ +import io +import json +import os +import re +import zipfile +from collections.abc import Iterator +from email.parser import Parser as EmailParser +from pathlib import Path +from typing import Any +from typing import IO + +import chardet +import docx # type: ignore +import openpyxl # type: ignore +import pptx # type: ignore +from pypdf import PdfReader 
+from pypdf.errors import PdfStreamError
+
+from danswer.file_processing.html_utils import parse_html_page_basic
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+TEXT_SECTION_SEPARATOR = "\n\n"
+
+
+PLAIN_TEXT_FILE_EXTENSIONS = [
+    ".txt",
+    ".md",
+    ".mdx",
+    ".conf",
+    ".log",
+    ".json",
+    ".csv",
+    ".tsv",
+    ".xml",
+    ".yml",
+    ".yaml",
+]
+
+
+VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
+    ".pdf",
+    ".docx",
+    ".pptx",
+    ".xlsx",
+    ".eml",
+    ".epub",
+]
+
+
+def is_text_file_extension(file_name: str) -> bool:
+    return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
+
+
+def get_file_ext(file_path_or_name: str | Path) -> str:
+    _, extension = os.path.splitext(file_path_or_name)
+    return extension
+
+
+def check_file_ext_is_valid(ext: str) -> bool:
+    return ext in VALID_FILE_EXTENSIONS
+
+
+def detect_encoding(file: IO[bytes]) -> str:
+    raw_data = file.read(50000)
+    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
+    file.seek(0)
+    return encoding
+
+
+def is_macos_resource_fork_file(file_name: str) -> bool:
+    return os.path.basename(file_name).startswith("._") and file_name.startswith(
+        "__MACOSX"
+    )
+
+
+# To include additional metadata in the search index, add a .danswer_metadata.json file
+# to the zip file. This file should contain a list of objects with the following format:
+# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
+def load_files_from_zip(
+    zip_file_io: IO,
+    ignore_macos_resource_fork_files: bool = True,
+    ignore_dirs: bool = True,
+) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
+    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
+        zip_metadata = {}
+        try:
+            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
+            with zip_file.open(metadata_file_info, "r") as metadata_file:
+                try:
+                    zip_metadata = json.load(metadata_file)
+                    if isinstance(zip_metadata, list):
+                        # convert list of dicts to dict of dicts
+                        zip_metadata = {d["filename"]: d for d in zip_metadata}
+                except json.JSONDecodeError:
+                    logger.warn("Unable to load .danswer_metadata.json")
+        except KeyError:
+            logger.info("No .danswer_metadata.json file")
+
+        for file_info in zip_file.infolist():
+            with zip_file.open(file_info.filename, "r") as file:
+                if ignore_dirs and file_info.is_dir():
+                    continue
+
+                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
+                    file_info.filename
+                ):
+                    continue
+                yield file_info, file, zip_metadata.get(file_info.filename, {})
+
+
+def _extract_danswer_metadata(line: str) -> dict | None:
+    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
+    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
+
+    html_comment_match = re.search(html_comment_pattern, line)
+    hashtag_match = re.search(hashtag_pattern, line)
+
+    if html_comment_match:
+        json_str = html_comment_match.group(1)
+    elif hashtag_match:
+        json_str = hashtag_match.group(1)
+    else:
+        return None
+
+    try:
+        return json.loads("{" + json_str + "}")
+    except json.JSONDecodeError:
+        return None
+
+
+def read_text_file(
+    file: IO,
+    encoding: str = "utf-8",
+    errors: str = "replace",
+    ignore_danswer_metadata: bool = True,
+) -> tuple[str, dict]:
+    metadata = {}
+    file_content_raw = ""
+    for ind, line in enumerate(file):
+        try:
+            line = line.decode(encoding) if isinstance(line, bytes) else line
+        except UnicodeDecodeError:
+            line = (
+                line.decode(encoding, errors=errors)
+                if isinstance(line, bytes)
+                else line
+            )
+
+        if ind == 0:
+            metadata_or_none = (
+                None if ignore_danswer_metadata else _extract_danswer_metadata(line)
+            )
+            if metadata_or_none is not None:
+                metadata = metadata_or_none
+            else:
+                file_content_raw += line
+        else:
+            file_content_raw += line
+
+    return file_content_raw, metadata
+
+
+def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+    try:
+        pdf_reader = PdfReader(file)
+
+        # If marked as encrypted and a password is provided, try to decrypt
+        if pdf_reader.is_encrypted and pdf_pass is not None:
+            decrypt_success = False
+            if pdf_pass is not None:
+                try:
+                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
+                except Exception:
+                    logger.error("Unable to decrypt pdf")
+            else:
+                logger.info("No password available to decrypt pdf")
+
+            if not decrypt_success:
+                # By user request, keep files that are unreadable just so they
+                # can be discoverable by title.
+                return ""
+
+        return TEXT_SECTION_SEPARATOR.join(
+            page.extract_text() for page in pdf_reader.pages
+        )
+    except PdfStreamError:
+        logger.exception("PDF file is not a valid PDF")
+    except Exception:
+        logger.exception("Failed to read PDF")
+
+    # File is still discoverable by title
+    # but the contents are not included as they cannot be parsed
+    return ""
+
+
+def docx_to_text(file: IO[Any]) -> str:
+    doc = docx.Document(file)
+    full_text = [para.text for para in doc.paragraphs]
+    return TEXT_SECTION_SEPARATOR.join(full_text)
+
+
+def pptx_to_text(file: IO[Any]) -> str:
+    presentation = pptx.Presentation(file)
+    text_content = []
+    for slide_number, slide in enumerate(presentation.slides, start=1):
+        extracted_text = f"\nSlide {slide_number}:\n"
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                extracted_text += shape.text + "\n"
+        text_content.append(extracted_text)
+    return TEXT_SECTION_SEPARATOR.join(text_content)
+
+
+def xlsx_to_text(file: IO[Any]) -> str:
+    workbook = openpyxl.load_workbook(file)
+    text_content = []
+    for sheet in workbook.worksheets:
+        sheet_string = "\n".join(
+            ",".join(map(str, row))
+            for row in sheet.iter_rows(min_row=1, values_only=True)
+        )
+        text_content.append(sheet_string)
+    return TEXT_SECTION_SEPARATOR.join(text_content)
+
+
+def eml_to_text(file: IO[Any]) -> str:
+    text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+    parser = EmailParser()
+    message = parser.parse(text_file)
+    text_content = []
+    for part in message.walk():
+        if part.get_content_type().startswith("text/plain"):
+            text_content.append(part.get_payload())
+    return TEXT_SECTION_SEPARATOR.join(text_content)
+
+
+def epub_to_text(file: IO[Any]) -> str:
+    with zipfile.ZipFile(file) as epub:
+        text_content = []
+        for item in epub.infolist():
+            if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
+                with epub.open(item) as html_file:
+                    text_content.append(parse_html_page_basic(html_file))
+        return TEXT_SECTION_SEPARATOR.join(text_content)
+
+
+def file_io_to_text(file: IO[Any]) -> str:
+    encoding = detect_encoding(file)
+    file_content_raw, _ = read_text_file(file, encoding=encoding)
+    return file_content_raw
+
+
+def extract_file_text(
+    file_name: str,
+    file: IO[Any],
+) -> str:
+    extension = get_file_ext(file_name)
+    if not check_file_ext_is_valid(extension):
+        raise RuntimeError("Unprocessable file type")
+
+    if extension == ".pdf":
+        return pdf_to_text(file=file)
+
+    elif extension == ".docx":
+        return docx_to_text(file)
+
+    elif extension == ".pptx":
+        return pptx_to_text(file)
+
+    elif extension == ".xlsx":
+        return xlsx_to_text(file)
+
+    elif extension == ".eml":
+        return eml_to_text(file)
+
+    elif extension == ".epub":
+        return epub_to_text(file)
+
+    else:
+        return
file_io_to_text(file) diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/file_processing/html_utils.py similarity index 98% rename from backend/danswer/connectors/cross_connector_utils/html_utils.py rename to backend/danswer/file_processing/html_utils.py index 0b4e9fade..9b5875227 100644 --- a/backend/danswer/connectors/cross_connector_utils/html_utils.py +++ b/backend/danswer/file_processing/html_utils.py @@ -1,6 +1,7 @@ import re from copy import copy from dataclasses import dataclass +from typing import IO import bs4 @@ -118,7 +119,7 @@ def format_document_soup( return strip_excessive_newlines_and_spaces(text) -def parse_html_page_basic(text: str) -> str: +def parse_html_page_basic(text: str | IO[bytes]) -> str: soup = bs4.BeautifulSoup(text, "html.parser") return format_document_soup(soup) diff --git a/backend/danswer/main.py b/backend/danswer/main.py index e07dd95ba..81dcd9bb6 100644 --- a/backend/danswer/main.py +++ b/backend/danswer/main.py @@ -46,8 +46,6 @@ from danswer.db.index_attempt import cancel_indexing_attempts_past_model from danswer.db.index_attempt import expire_index_attempts from danswer.db.swap_index import check_index_swap from danswer.document_index.factory import get_default_document_index -from danswer.dynamic_configs.port_configs import port_api_key_to_postgres -from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres from danswer.search.retrieval.search_runner import download_nltk_data from danswer.search.search_nlp_models import warm_up_encoders from danswer.server.auth_check import check_router_auth @@ -162,18 +160,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: f"Using multilingual flow with languages: {MULTILINGUAL_QUERY_EXPANSION}" ) - try: - port_filesystem_to_postgres() - except Exception: - logger.debug( - "Skipping port of persistent volumes. Maybe these have already been removed?" - ) - - try: - port_api_key_to_postgres() - except Exception as e: - logger.debug(f"Failed to port API keys. Exception: {e}. Continuing...") - with Session(engine) as db_session: check_index_swap(db_session=db_session) db_embedding_model = get_current_db_embedding_model(db_session) diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 48505cbe4..b51d1bb53 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -30,7 +30,6 @@ llama-index==0.9.45 Mako==1.2.4 msal==1.26.0 nltk==3.8.1 -docx2txt==0.8 Office365-REST-Python-Client==2.5.4 oauthlib==3.2.2 openai==1.3.5 diff --git a/backend/scripts/dev_run_background_jobs.py b/backend/scripts/dev_run_background_jobs.py index 30fb4bf6f..c9b91b00c 100644 --- a/backend/scripts/dev_run_background_jobs.py +++ b/backend/scripts/dev_run_background_jobs.py @@ -49,8 +49,6 @@ def run_jobs(exclude_indexing: bool) -> None: if not exclude_indexing: update_env = os.environ.copy() update_env["PYTHONPATH"] = "." 
- update_env["DYNAMIC_CONFIG_DIR_PATH"] = "./dynamic_config_storage" - update_env["FILE_CONNECTOR_TMP_STORAGE_PATH"] = "./dynamic_config_storage" cmd_indexing = ["python", "danswer/background/update.py"] indexing_process = subprocess.Popen( diff --git a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py index 8c502269d..860001e15 100644 --- a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py +++ b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py @@ -1,7 +1,7 @@ import pathlib import unittest -from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic +from danswer.file_processing.html_utils import parse_html_page_basic class TestQAPostprocessing(unittest.TestCase): diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index 939d3f578..02d5e7947 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -81,9 +81,6 @@ services: # If set to `true` will enable additional logs about Vespa query performance # (time spent on finding the right docs + time spent fetching summaries from disk) - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -181,9 +178,6 @@ services: - LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -229,6 +223,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -256,6 +251,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -323,11 +319,6 @@ services: volumes: - # local_dynamic_storage is legacy only now - local_dynamic_storage: - # used to store files uploaded by the user temporarily while we are indexing them - # file_connector_tmp_storage is legacy only now - file_connector_tmp_storage: db_volume: vespa_volume: # Created by the container itself diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml b/deployment/docker_compose/docker-compose.gpu-dev.yml index 85c921c0a..f6c2c7fdd 100644 --- a/deployment/docker_compose/docker-compose.gpu-dev.yml +++ b/deployment/docker_compose/docker-compose.gpu-dev.yml @@ -20,6 +20,7 @@ services: # Auth Settings - AUTH_TYPE=${AUTH_TYPE:-disabled} - SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400} + - ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-} - VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-} - GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-} - GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-} @@ -46,6 +47,7 @@ services: - 
DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-} - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-} - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-} + - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-} # if set, allows for the use of the token budget system - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-} # Enables the use of bedrock models @@ -79,9 +81,6 @@ services: # If set to `true` will enable additional logs about Vespa query performance # (time spent on finding the right docs + time spent fetching summaries from disk) - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -104,6 +103,7 @@ services: - indexing_model_server restart: always environment: + - ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-} # Gen AI Settings (Needed by DanswerBot) - GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-} - GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-} @@ -122,6 +122,7 @@ services: - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-} - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-} - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-} + - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-} # Query Options - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years) - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector) @@ -177,9 +178,6 @@ services: - LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -233,6 +231,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -268,6 +267,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -335,11 +335,6 @@ services: volumes: - # local_dynamic_storage is legacy only now - local_dynamic_storage: - # used to store files uploaded by the user temporarily while we are indexing them - # file_connector_tmp_storage is legacy only now - file_connector_tmp_storage: db_volume: vespa_volume: # Created by the container itself diff --git a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml index 255bf6feb..87e9d615a 100644 --- a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml +++ b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml @@ -21,9 +21,6 @@ services: - POSTGRES_HOST=relational_db - VESPA_HOST=index - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -53,9 +50,6 @@ services: - 
VESPA_HOST=index - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -107,6 +101,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -134,6 +129,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -205,11 +201,6 @@ services: volumes: - # local_dynamic_storage is legacy only now - local_dynamic_storage: - # used to store files uploaded by the user temporarily while we are indexing them - # file_connector_tmp_storage is legacy only now - file_connector_tmp_storage: db_volume: vespa_volume: # Created by the container itself diff --git a/deployment/docker_compose/docker-compose.prod.yml b/deployment/docker_compose/docker-compose.prod.yml index 082401ffc..146f803aa 100644 --- a/deployment/docker_compose/docker-compose.prod.yml +++ b/deployment/docker_compose/docker-compose.prod.yml @@ -21,9 +21,6 @@ services: - POSTGRES_HOST=relational_db - VESPA_HOST=index - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -53,9 +50,6 @@ services: - VESPA_HOST=index - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server} - volumes: - - local_dynamic_storage:/home/storage - - file_connector_tmp_storage:/home/file_connector_storage extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -87,6 +81,8 @@ services: options: max-size: "50m" max-file: "6" + + relational_db: image: postgres:15.2-alpine restart: always @@ -120,6 +116,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -147,6 +144,7 @@ services: # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} volumes: + # Not necessary, this is just to reduce download time during startup - model_cache_huggingface:/root/.cache/huggingface/ logging: driver: json-file @@ -222,11 +220,6 @@ services: volumes: - # local_dynamic_storage is legacy only now - local_dynamic_storage: - # used to store files uploaded by the user temporarily while we are indexing them - # file_connector_tmp_storage is legacy only now - file_connector_tmp_storage: db_volume: vespa_volume: # Created by the container itself diff --git a/web/src/app/admin/connectors/file/page.tsx b/web/src/app/admin/connectors/file/page.tsx index a92ff8a16..a8193729e 100644 --- a/web/src/app/admin/connectors/file/page.tsx +++ b/web/src/app/admin/connectors/file/page.tsx @@ -52,10 +52,12 @@ const Main = () => { {filesAreUploading && } Specify files below, click the Upload button, and the contents of - these files will be searchable via Danswer! 
Currently .txt,{" "} - .pdf, .docx, .pptx, .xlxs, .csv,{" "} - .eml, .epub, and .zip files (containing supported - file types) are supported. + these files will be searchable via Danswer! Currently supported file + types include .txt, .pdf, .docx, .pptx,{" "} + .xlsx, .csv, .md, .mdx, .conf,{" "} + .log, .json, .tsv, .xml, .yml,{" "} + .yaml, .eml, .epub, and finally .zip files + (containing supported file types). NOTE: if the original document is accessible via a link, you can
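
Reviewer note: the sketch below illustrates how the consolidated helpers added in backend/danswer/file_processing/extract_file_text.py are intended to be called, mirroring the dispatch in the updated file connector (_process_file). It is an illustrative example only, built from the signatures introduced in this patch; the local file path and optional PDF password are placeholder assumptions, not part of the change, and connectors receive file-like objects from the file store rather than opening paths directly.

from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import detect_encoding
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_text_file


def text_from_local_file(file_name: str, pdf_pass: str | None = None) -> str:
    # Placeholder helper for illustration only.
    extension = get_file_ext(file_name)
    if not check_file_ext_is_valid(extension):
        raise RuntimeError(f"Unsupported extension: {extension}")

    with open(file_name, "rb") as file:
        if is_text_file_extension(file_name):
            # Plain-text formats: detect the encoding, then read the contents
            # (optionally parsing a leading DANSWER_METADATA line).
            content, _metadata = read_text_file(file, encoding=detect_encoding(file))
            return content
        if extension == ".pdf":
            # PDFs are called directly so an optional password can be passed through.
            return pdf_to_text(file=file, pdf_pass=pdf_pass)
        # Everything else (.docx, .pptx, .xlsx, .eml, .epub) goes through the
        # generic dispatcher added by this patch.
        return extract_file_text(file_name=file_name, file=file)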