Consolidate File Processing (#1449)

Author: Yuhong Sun (committed by GitHub)
Date: 2024-05-11 23:11:22 -07:00
Parent: e89c81de76
Commit: 546815dc8c
31 changed files with 366 additions and 481 deletions
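
This commit replaces the per-connector parsing helpers (pypdf, docx2txt, openpyxl, pptx and friends scattered across connectors) with a single danswer.file_processing package. A minimal sketch of how the consolidated entry points introduced below are meant to be called; the file names are placeholders, not paths from the repository:

# Sketch only, assuming the new module layout from this commit.
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import pdf_to_text

with open("example.docx", "rb") as f:  # placeholder file
    text = extract_file_text(file_name="example.docx", file=f)  # dispatches on extension

with open("example.pdf", "rb") as f:  # placeholder file
    # pdf_pass is only consulted when the PDF is marked as encrypted
    pdf_text = pdf_to_text(file=f, pdf_pass=None)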

View File

@@ -1,6 +1,4 @@
import os
from datetime import timedelta
from pathlib import Path
from typing import cast
from celery import Celery  # type: ignore
@@ -10,9 +8,7 @@ from danswer.background.connector_deletion import delete_connector_credential_pa
from danswer.background.task_utils import build_celery_task_wrapper
from danswer.background.task_utils import name_cc_cleanup_task
from danswer.background.task_utils import name_document_set_sync_task
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.connectors.file.utils import file_age_in_hours
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
from danswer.db.document import prepare_to_modify_documents
@@ -203,21 +199,6 @@ def check_for_document_sets_sync_task() -> None:
    )

@celery_app.task(name="clean_old_temp_files_task", soft_time_limit=JOB_TIMEOUT)
def clean_old_temp_files_task(
    age_threshold_in_hours: float | int = 24 * 7,  # 1 week
    base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
) -> None:
    """Files added via the File connector need to be deleted after ingestion
    Currently handled async of the indexing job"""
    os.makedirs(base_path, exist_ok=True)
    for file in os.listdir(base_path):
        full_file_path = Path(base_path) / file
        if file_age_in_hours(full_file_path) > age_threshold_in_hours:
            logger.info(f"Cleaning up uploaded file: {full_file_path}")
            os.remove(full_file_path)

#####
# Celery Beat (Periodic Tasks) Settings
#####

View File

@@ -148,10 +148,6 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
    "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
# TODO these should be available for frontend configuration, via advanced options expandable
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
@@ -237,10 +233,9 @@ DISABLE_DOCUMENT_CLEANUP = (
#####
# Miscellaneous
#####
DYNAMIC_CONFIG_STORE = (
    os.environ.get("DYNAMIC_CONFIG_STORE") or "PostgresBackedDynamicConfigStore"
)
DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage")
# File based Key Value store no longer used
DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore"
JOB_TIMEOUT = 60 * 60 * 6  # 6 hours default
# used to allow the background indexing jobs to use a different embedding
# model server than the API server

View File

@@ -8,7 +8,6 @@ from pydantic import BaseModel
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    process_in_batches,
)
@@ -23,6 +22,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger

View File

@@ -7,7 +7,6 @@ from typing import Any
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.bookstack.client import BookStackApiClient
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -16,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic

class BookstackConnector(LoadConnector, PollConnector):

View File

@@ -19,7 +19,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.rate_limit_handler import (
    make_confluence_call_handle_rate_limit,
)
from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -28,6 +27,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import format_document_soup
from danswer.utils.logger import setup_logger

logger = setup_logger()

View File

@@ -1,158 +0,0 @@
import json
import os
import re
import zipfile
from collections.abc import Iterator
from typing import Any
from typing import IO

import chardet
from pypdf import PdfReader
from pypdf.errors import PdfStreamError

from danswer.utils.logger import setup_logger

logger = setup_logger()


def extract_metadata(line: str) -> dict | None:
    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
    html_comment_match = re.search(html_comment_pattern, line)
    hashtag_match = re.search(hashtag_pattern, line)
    if html_comment_match:
        json_str = html_comment_match.group(1)
    elif hashtag_match:
        json_str = hashtag_match.group(1)
    else:
        return None
    try:
        return json.loads("{" + json_str + "}")
    except json.JSONDecodeError:
        return None
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    try:
        pdf_reader = PdfReader(file)

        # If marked as encrypted and a password is provided, try to decrypt
        if pdf_reader.is_encrypted and pdf_pass is not None:
            decrypt_success = False
            if pdf_pass is not None:
                try:
                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
                except Exception:
                    logger.error(f"Unable to decrypt pdf {file_name}")
            else:
                logger.info(f"No password available to decrypt pdf {file_name}")

            if not decrypt_success:
                # By user request, keep files that are unreadable just so they
                # can be discoverable by title.
                return ""

        return "\n".join(page.extract_text() for page in pdf_reader.pages)
    except PdfStreamError:
        logger.exception(f"PDF file {file_name} is not a valid PDF")
    except Exception:
        logger.exception(f"Failed to read PDF {file_name}")

    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return ""
def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


# To include additional metadata in the search index, add a .danswer_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
    zip_file_io: IO,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        zip_metadata = {}
        try:
            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
            with zip_file.open(metadata_file_info, "r") as metadata_file:
                try:
                    zip_metadata = json.load(metadata_file)
                    if isinstance(zip_metadata, list):
                        # convert list of dicts to dict of dicts
                        zip_metadata = {d["filename"]: d for d in zip_metadata}
                except json.JSONDecodeError:
                    logger.warn("Unable to load .danswer_metadata.json")
        except KeyError:
            logger.info("No .danswer_metadata.json file")

        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file, zip_metadata.get(file_info.filename, {})


def detect_encoding(file: IO[bytes]) -> str:
    raw_data = file.read(50000)
    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    file.seek(0)
    return encoding


def read_file(
    file: IO, encoding: str = "utf-8", errors: str = "replace"
) -> tuple[str, dict]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file):
        try:
            line = line.decode(encoding) if isinstance(line, bytes) else line
        except UnicodeDecodeError:
            line = (
                line.decode(encoding, errors=errors)
                if isinstance(line, bytes)
                else line
            )

        if ind == 0:
            metadata_or_none = extract_metadata(line)
            if metadata_or_none is not None:
                metadata = metadata_or_none
            else:
                file_content_raw += line
        else:
            file_content_raw += line

    return file_content_raw, metadata


def is_text_file_extension(file_name: str) -> bool:
    extensions = (
        ".txt",
        ".mdx",
        ".md",
        ".conf",
        ".log",
        ".json",
        ".xml",
        ".yaml",
        ".yml",
        ".json",
    )
    return any(file_name.endswith(ext) for ext in extensions)

View File

@@ -10,7 +10,6 @@ from requests import Response
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -20,6 +19,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger

logger = setup_logger()

View File

@@ -8,7 +8,6 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
@@ -22,6 +21,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic

# Limitations and Potential Improvements
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in

View File

@@ -1,36 +1,30 @@
import csv  # type: ignore
import io
import os
import zipfile
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import IO

import docx2txt  # type: ignore
import openpyxl  # type: ignore
import pptx  # type: ignore
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.file.utils import check_file_ext_is_valid
from danswer.connectors.file.utils import get_file_ext
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.db.engine import get_sqlalchemy_engine
from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import detect_encoding
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
@@ -54,18 +48,7 @@ def _read_files_and_metadata(
            file_content, ignore_dirs=True
        ):
            yield os.path.join(directory_path, file_info.filename), file, metadata
    elif extension in [
        ".txt",
        ".md",
        ".mdx",
        ".pdf",
        ".docx",
        ".pptx",
        ".xlsx",
        ".csv",
        ".eml",
        ".epub",
    ]:
    elif check_file_ext_is_valid(extension):
        yield file_name, file_content, metadata
    else:
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@@ -84,65 +67,20 @@ def _process_file(
    file_metadata: dict[str, Any] = {}

    if extension == ".pdf":
        file_content_raw = read_pdf_file(
            file=file, file_name=file_name, pdf_pass=pdf_pass
        )
    elif extension == ".docx":
        file_content_raw = docx2txt.process(file)
    elif extension == ".pptx":
        presentation = pptx.Presentation(file)
        text_content = []
        for slide_number, slide in enumerate(presentation.slides, start=1):
            extracted_text = f"\nSlide {slide_number}:\n"
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    extracted_text += shape.text + "\n"
            text_content.append(extracted_text)
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".xlsx":
        workbook = openpyxl.load_workbook(file)
        text_content = []
        for sheet in workbook.worksheets:
            sheet_string = "\n".join(
                ",".join(map(str, row))
                for row in sheet.iter_rows(min_row=1, values_only=True)
            )
            text_content.append(sheet_string)
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".csv":
        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
        reader = csv.reader(text_file)
        file_content_raw = "\n".join([",".join(row) for row in reader])
    elif extension == ".eml":
        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
        parser = EmailParser()
        message = parser.parse(text_file)
        text_content = []
        for part in message.walk():
            if part.get_content_type().startswith("text/plain"):
                text_content.append(part.get_payload())
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".epub":
        with zipfile.ZipFile(file) as epub:
            text_content = []
            for item in epub.infolist():
                if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
                    with epub.open(item) as html_file:
                        soup = BeautifulSoup(html_file, "html.parser")
                        text_content.append(soup.get_text())
            file_content_raw = "\n\n".join(text_content)
    else:
        encoding = detect_encoding(file)
        file_content_raw, file_metadata = read_file(file, encoding=encoding)
    if is_text_file_extension(file_name):
        encoding = detect_encoding(file)
        file_content_raw, file_metadata = read_text_file(file, encoding=encoding)

    # Using the PDF reader function directly to pass in password cleanly
    elif extension == ".pdf":
        file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)

    else:
        file_content_raw = extract_file_text(
            file_name=file_name,
            file=file,
        )

    all_metadata = {**metadata, **file_metadata} if metadata else file_metadata

    # If this is set, we will show this in the UI as the "name" of the file

View File

@@ -1,66 +0,0 @@
import os
import shutil
import time
import uuid
from pathlib import Path
from typing import Any
from typing import IO

from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH

_VALID_FILE_EXTENSIONS = [
    ".txt",
    ".zip",
    ".pdf",
    ".md",
    ".mdx",
    ".docx",
    ".pptx",
    ".xlsx",
    ".csv",
    ".eml",
    ".epub",
]


def get_file_ext(file_path_or_name: str | Path) -> str:
    _, extension = os.path.splitext(file_path_or_name)
    return extension


def check_file_ext_is_valid(ext: str) -> bool:
    return ext in _VALID_FILE_EXTENSIONS


def write_temp_files(
    files: list[tuple[str, IO[Any]]],
    base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
) -> list[str]:
    """Writes temporary files to disk and returns their paths

    NOTE: need to pass in (file_name, File) tuples since FastAPI's `UploadFile` class
    exposed SpooledTemporaryFile does not include a name.
    """
    file_location = Path(base_path) / str(uuid.uuid4())
    os.makedirs(file_location, exist_ok=True)

    file_paths: list[str] = []
    for file_name, file in files:
        extension = get_file_ext(file_name)
        if not check_file_ext_is_valid(extension):
            raise ValueError(
                f"Invalid file extension for file: '{file_name}'. Must be one of {_VALID_FILE_EXTENSIONS}"
            )

        file_path = file_location / file_name
        with open(file_path, "wb") as buffer:
            # copy file content from uploaded file to the newly created file
            shutil.copyfileobj(file, buffer)
        file_paths.append(str(file_path.absolute()))

    return file_paths


def file_age_in_hours(filepath: str | Path) -> float:
    return (time.time() - os.path.getmtime(filepath)) / (60 * 60)

View File

@@ -1,5 +1,4 @@
import io
import tempfile
from collections.abc import Iterator
from collections.abc import Sequence
from datetime import datetime
@@ -9,7 +8,6 @@ from itertools import chain
from typing import Any
from typing import cast

import docx2txt  # type:ignore
from google.auth.credentials import Credentials  # type: ignore
from googleapiclient import discovery  # type: ignore
from googleapiclient.errors import HttpError  # type: ignore
@@ -21,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import (
    get_google_drive_creds_for_authorized_user,
@@ -42,6 +39,8 @@ from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
@@ -321,15 +320,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
        )
    elif mime_type == GDriveMimeType.WORD_DOC.value:
        response = service.files().get_media(fileId=file["id"]).execute()
        word_stream = io.BytesIO(response)
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            temp.write(word_stream.getvalue())
            temp_path = temp.name
        return docx2txt.process(temp_path)
        return docx_to_text(file=io.BytesIO(response))
    elif mime_type == GDriveMimeType.PDF.value:
        response = service.files().get_media(fileId=file["id"]).execute()
        file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
        return file_contents
        return pdf_to_text(file=io.BytesIO(response))

    return UNSUPPORTED_FILE_TYPE_CONTENT

View File

@@ -9,14 +9,14 @@ from sqlalchemy.orm import Session
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.db.engine import get_sqlalchemy_engine
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
@@ -86,7 +86,7 @@ class GoogleSitesConnector(LoadConnector):
            if extension != ".html":
                continue

            file_content, _ = read_file(file_io)
            file_content, _ = read_text_file(file_io)
            soup = BeautifulSoup(file_content, "html.parser")

            # get the link out of the navbar

View File

@@ -7,7 +7,6 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -17,6 +16,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger

# Potential Improvements

View File

@@ -9,10 +9,6 @@ from requests_oauthlib import OAuth2Session  # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.html_utils import (
    strip_excessive_newlines_and_spaces,
)
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -22,6 +18,8 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.file_processing.html_utils import strip_excessive_newlines_and_spaces
from danswer.utils.logger import setup_logger

LOOPIO_API_BASE = "https://api.loopio.com/"

View File

@@ -1,22 +1,16 @@
import io
import os
import tempfile
from datetime import datetime
from datetime import timezone
from typing import Any

import docx  # type: ignore
import msal  # type: ignore
import openpyxl  # type: ignore
import pptx  # type: ignore
from office365.graph_client import GraphClient  # type: ignore
from office365.onedrive.driveitems.driveItem import DriveItem  # type: ignore
from office365.onedrive.sites.site import Site  # type: ignore

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import is_text_file_extension
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -25,6 +19,12 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import file_io_to_text
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import xlsx_to_text
from danswer.utils.logger import setup_logger

UNSUPPORTED_FILE_TYPE_CONTENT = ""  # idea copied from the google drive side of things
@@ -35,62 +35,28 @@ logger = setup_logger()
def get_text_from_xlsx_driveitem(driveitem_object: DriveItem) -> str:
    file_content = driveitem_object.get_content().execute_query().value
    excel_file = io.BytesIO(file_content)
    workbook = openpyxl.load_workbook(excel_file, read_only=True)
    full_text = []
    for sheet in workbook.worksheets:
        sheet_string = "\n".join(
            ",".join(map(str, row))
            for row in sheet.iter_rows(min_row=1, values_only=True)
        )
        full_text.append(sheet_string)
    return "\n".join(full_text)
    return xlsx_to_text(file=io.BytesIO(file_content))


def get_text_from_docx_driveitem(driveitem_object: DriveItem) -> str:
    file_content = driveitem_object.get_content().execute_query().value
    full_text = []
    with tempfile.TemporaryDirectory() as local_path:
        with open(os.path.join(local_path, driveitem_object.name), "wb") as local_file:
            local_file.write(file_content)
            doc = docx.Document(local_file.name)
            for para in doc.paragraphs:
                full_text.append(para.text)
    return "\n".join(full_text)
    return docx_to_text(file=io.BytesIO(file_content))


def get_text_from_pdf_driveitem(driveitem_object: DriveItem) -> str:
    file_content = driveitem_object.get_content().execute_query().value
    file_text = read_pdf_file(
        file=io.BytesIO(file_content), file_name=driveitem_object.name
    )
    file_text = pdf_to_text(file=io.BytesIO(file_content))
    return file_text


def get_text_from_txt_driveitem(driveitem_object: DriveItem) -> str:
    file_content: bytes = driveitem_object.get_content().execute_query().value
    text_string = file_content.decode("utf-8")
    return text_string
    return file_io_to_text(file=io.BytesIO(file_content))


def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
    file_content = driveitem_object.get_content().execute_query().value
    pptx_stream = io.BytesIO(file_content)
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(pptx_stream.getvalue())
        presentation = pptx.Presentation(temp.name)
        extracted_text = ""
        for slide_number, slide in enumerate(presentation.slides, start=1):
            extracted_text += f"\nSlide {slide_number}:\n"
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    extracted_text += shape.text + "\n"
        return extracted_text
    return pptx_to_text(file=io.BytesIO(file_content))


class SharepointConnector(LoadConnector, PollConnector):

View File

@@ -22,12 +22,12 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.utils.logger import setup_logger

logger = setup_logger()
@@ -247,9 +247,7 @@ class WebConnector(LoadConnector):
            if current_url.split(".")[-1] == "pdf":
                # PDF files are not checked for links
                response = requests.get(current_url)
                page_text = read_pdf_file(
                    file=io.BytesIO(response.content), file_name=current_url
                )
                page_text = pdf_to_text(file=io.BytesIO(response.content))
                doc_batch.append(
                    Document(

View File

@@ -5,8 +5,9 @@ from zenpy.lib.api_objects.help_centre_objects import Article  # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    time_str_to_utc,
)
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -14,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic


def _article_to_document(article: Article) -> Document:

View File

@@ -1,4 +1,3 @@
from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE
from danswer.dynamic_configs.interface import DynamicConfigStore
from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore
@@ -8,7 +7,7 @@ from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore
def get_dynamic_config_store() -> DynamicConfigStore:
    dynamic_config_store_type = DYNAMIC_CONFIG_STORE
    if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__:
        return FileSystemBackedDynamicConfigStore(DYNAMIC_CONFIG_DIR_PATH)
        raise NotImplementedError("File based config store no longer supported")
    if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__:
        return PostgresBackedDynamicConfigStore()

View File

@@ -2,7 +2,6 @@ import json
from pathlib import Path
from typing import cast

from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
@@ -53,7 +52,7 @@ def insert_into_postgres(store_data: dict) -> None:
    config_store.store(port_once_key, True)


def port_filesystem_to_postgres(directory_path: str = DYNAMIC_CONFIG_DIR_PATH) -> None:
def port_filesystem_to_postgres(directory_path: str) -> None:
    store_data = read_file_system_store(directory_path)
    insert_into_postgres(store_data)

View File

@@ -0,0 +1,283 @@
import io
import json
import os
import re
import zipfile
from collections.abc import Iterator
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import IO

import chardet
import docx  # type: ignore
import openpyxl  # type: ignore
import pptx  # type: ignore
from pypdf import PdfReader
from pypdf.errors import PdfStreamError

from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger

logger = setup_logger()


TEXT_SECTION_SEPARATOR = "\n\n"


PLAIN_TEXT_FILE_EXTENSIONS = [
    ".txt",
    ".md",
    ".mdx",
    ".conf",
    ".log",
    ".json",
    ".csv",
    ".tsv",
    ".xml",
    ".yml",
    ".yaml",
]


VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".eml",
    ".epub",
]


def is_text_file_extension(file_name: str) -> bool:
    return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)


def get_file_ext(file_path_or_name: str | Path) -> str:
    _, extension = os.path.splitext(file_path_or_name)
    return extension


def check_file_ext_is_valid(ext: str) -> bool:
    return ext in VALID_FILE_EXTENSIONS
def detect_encoding(file: IO[bytes]) -> str:
    raw_data = file.read(50000)
    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    file.seek(0)
    return encoding


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


# To include additional metadata in the search index, add a .danswer_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
    zip_file_io: IO,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        zip_metadata = {}
        try:
            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
            with zip_file.open(metadata_file_info, "r") as metadata_file:
                try:
                    zip_metadata = json.load(metadata_file)
                    if isinstance(zip_metadata, list):
                        # convert list of dicts to dict of dicts
                        zip_metadata = {d["filename"]: d for d in zip_metadata}
                except json.JSONDecodeError:
                    logger.warn("Unable to load .danswer_metadata.json")
        except KeyError:
            logger.info("No .danswer_metadata.json file")

        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file, zip_metadata.get(file_info.filename, {})


def _extract_danswer_metadata(line: str) -> dict | None:
    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
    html_comment_match = re.search(html_comment_pattern, line)
    hashtag_match = re.search(hashtag_pattern, line)
    if html_comment_match:
        json_str = html_comment_match.group(1)
    elif hashtag_match:
        json_str = hashtag_match.group(1)
    else:
        return None
    try:
        return json.loads("{" + json_str + "}")
    except json.JSONDecodeError:
        return None


def read_text_file(
    file: IO,
    encoding: str = "utf-8",
    errors: str = "replace",
    ignore_danswer_metadata: bool = True,
) -> tuple[str, dict]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file):
        try:
            line = line.decode(encoding) if isinstance(line, bytes) else line
        except UnicodeDecodeError:
            line = (
                line.decode(encoding, errors=errors)
                if isinstance(line, bytes)
                else line
            )

        if ind == 0:
            metadata_or_none = (
                None if ignore_danswer_metadata else _extract_danswer_metadata(line)
            )
            if metadata_or_none is not None:
                metadata = metadata_or_none
            else:
                file_content_raw += line
        else:
            file_content_raw += line

    return file_content_raw, metadata
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    try:
        pdf_reader = PdfReader(file)

        # If marked as encrypted and a password is provided, try to decrypt
        if pdf_reader.is_encrypted and pdf_pass is not None:
            decrypt_success = False
            if pdf_pass is not None:
                try:
                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
                except Exception:
                    logger.error("Unable to decrypt pdf")
            else:
                logger.info("No password available to decrypt pdf")

            if not decrypt_success:
                # By user request, keep files that are unreadable just so they
                # can be discoverable by title.
                return ""

        return TEXT_SECTION_SEPARATOR.join(
            page.extract_text() for page in pdf_reader.pages
        )
    except PdfStreamError:
        logger.exception("PDF file is not a valid PDF")
    except Exception:
        logger.exception("Failed to read PDF")

    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return ""
def docx_to_text(file: IO[Any]) -> str:
    doc = docx.Document(file)
    full_text = [para.text for para in doc.paragraphs]
    return TEXT_SECTION_SEPARATOR.join(full_text)


def pptx_to_text(file: IO[Any]) -> str:
    presentation = pptx.Presentation(file)
    text_content = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
        extracted_text = f"\nSlide {slide_number}:\n"
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                extracted_text += shape.text + "\n"
        text_content.append(extracted_text)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def xlsx_to_text(file: IO[Any]) -> str:
    workbook = openpyxl.load_workbook(file)
    text_content = []
    for sheet in workbook.worksheets:
        sheet_string = "\n".join(
            ",".join(map(str, row))
            for row in sheet.iter_rows(min_row=1, values_only=True)
        )
        text_content.append(sheet_string)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def eml_to_text(file: IO[Any]) -> str:
    text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
    parser = EmailParser()
    message = parser.parse(text_file)
    text_content = []
    for part in message.walk():
        if part.get_content_type().startswith("text/plain"):
            text_content.append(part.get_payload())
    return TEXT_SECTION_SEPARATOR.join(text_content)


def epub_to_text(file: IO[Any]) -> str:
    with zipfile.ZipFile(file) as epub:
        text_content = []
        for item in epub.infolist():
            if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
                with epub.open(item) as html_file:
                    text_content.append(parse_html_page_basic(html_file))
        return TEXT_SECTION_SEPARATOR.join(text_content)


def file_io_to_text(file: IO[Any]) -> str:
    encoding = detect_encoding(file)
    file_content_raw, _ = read_text_file(file, encoding=encoding)
    return file_content_raw


def extract_file_text(
    file_name: str,
    file: IO[Any],
) -> str:
    extension = get_file_ext(file_name)
    if not check_file_ext_is_valid(extension):
        raise RuntimeError("Unprocessable file type")

    if extension == ".pdf":
        return pdf_to_text(file=file)
    elif extension == ".docx":
        return docx_to_text(file)
    elif extension == ".pptx":
        return pptx_to_text(file)
    elif extension == ".xlsx":
        return xlsx_to_text(file)
    elif extension == ".eml":
        return eml_to_text(file)
    elif extension == ".epub":
        return epub_to_text(file)
    else:
        return file_io_to_text(file)
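
A small usage sketch of the zip handling added above, assuming an uploaded archive that carries the optional .danswer_metadata.json described in the comment; the archive name is a placeholder:

from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import load_files_from_zip

# "docs.zip" is a placeholder; each member is parsed according to its extension
with open("docs.zip", "rb") as zip_io:
    for file_info, file, metadata in load_files_from_zip(zip_io):
        if not check_file_ext_is_valid(get_file_ext(file_info.filename)):
            continue
        text = extract_file_text(file_name=file_info.filename, file=file)
        link = metadata.get("link")  # populated from .danswer_metadata.json, if present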

View File

@@ -1,6 +1,7 @@
import re
from copy import copy
from dataclasses import dataclass
from typing import IO

import bs4
@@ -118,7 +119,7 @@ def format_document_soup(
    return strip_excessive_newlines_and_spaces(text)


def parse_html_page_basic(text: str) -> str:
def parse_html_page_basic(text: str | IO[bytes]) -> str:
    soup = bs4.BeautifulSoup(text, "html.parser")
    return format_document_soup(soup)
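
With the widened signature, parse_html_page_basic accepts either an HTML string or a bytes file handle, which is what the new epub_to_text path relies on. A quick sketch with a made-up snippet:

import io

from danswer.file_processing.html_utils import parse_html_page_basic

html = "<html><body><p>Hello</p></body></html>"  # made-up snippet
text_from_str = parse_html_page_basic(html)
text_from_bytes = parse_html_page_basic(io.BytesIO(html.encode("utf-8")))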

View File

@@ -46,8 +46,6 @@ from danswer.db.index_attempt import cancel_indexing_attempts_past_model
from danswer.db.index_attempt import expire_index_attempts
from danswer.db.swap_index import check_index_swap
from danswer.document_index.factory import get_default_document_index
from danswer.dynamic_configs.port_configs import port_api_key_to_postgres
from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.search.search_nlp_models import warm_up_encoders
from danswer.server.auth_check import check_router_auth
@@ -162,18 +160,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
            f"Using multilingual flow with languages: {MULTILINGUAL_QUERY_EXPANSION}"
        )

    try:
        port_filesystem_to_postgres()
    except Exception:
        logger.debug(
            "Skipping port of persistent volumes. Maybe these have already been removed?"
        )

    try:
        port_api_key_to_postgres()
    except Exception as e:
        logger.debug(f"Failed to port API keys. Exception: {e}. Continuing...")

    with Session(engine) as db_session:
        check_index_swap(db_session=db_session)
        db_embedding_model = get_current_db_embedding_model(db_session)

View File

@@ -30,7 +30,6 @@ llama-index==0.9.45
Mako==1.2.4
msal==1.26.0
nltk==3.8.1
docx2txt==0.8
Office365-REST-Python-Client==2.5.4
oauthlib==3.2.2
openai==1.3.5

View File

@@ -49,8 +49,6 @@ def run_jobs(exclude_indexing: bool) -> None:
    if not exclude_indexing:
        update_env = os.environ.copy()
        update_env["PYTHONPATH"] = "."
        update_env["DYNAMIC_CONFIG_DIR_PATH"] = "./dynamic_config_storage"
        update_env["FILE_CONNECTOR_TMP_STORAGE_PATH"] = "./dynamic_config_storage"
        cmd_indexing = ["python", "danswer/background/update.py"]

        indexing_process = subprocess.Popen(
View File

@@ -1,7 +1,7 @@
import pathlib
import unittest

from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.file_processing.html_utils import parse_html_page_basic


class TestQAPostprocessing(unittest.TestCase):

View File

@@ -81,9 +81,6 @@ services:
      # If set to `true` will enable additional logs about Vespa query performance
      # (time spent on finding the right docs + time spent fetching summaries from disk)
      - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
    volumes:
      - local_dynamic_storage:/home/storage
      - file_connector_tmp_storage:/home/file_connector_storage
    extra_hosts:
      - "host.docker.internal:host-gateway"
    logging:
@@ -181,9 +178,6 @@ services:
      - LOG_LEVEL=${LOG_LEVEL:-info}  # Set to debug to get more fine-grained logs
      - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-}  # Log all of the prompts to the LLM
      - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
    volumes:
      - local_dynamic_storage:/home/storage
      - file_connector_tmp_storage:/home/file_connector_storage
    extra_hosts:
      - "host.docker.internal:host-gateway"
    logging:
@@ -229,6 +223,7 @@ services:
      # Set to debug to get more fine-grained logs
      - LOG_LEVEL=${LOG_LEVEL:-info}
    volumes:
      # Not necessary, this is just to reduce download time during startup
      - model_cache_huggingface:/root/.cache/huggingface/
    logging:
      driver: json-file
@@ -256,6 +251,7 @@ services:
      # Set to debug to get more fine-grained logs
      - LOG_LEVEL=${LOG_LEVEL:-info}
    volumes:
      # Not necessary, this is just to reduce download time during startup
      - model_cache_huggingface:/root/.cache/huggingface/
    logging:
      driver: json-file
@@ -323,11 +319,6 @@ services:
volumes:
  # local_dynamic_storage is legacy only now
  local_dynamic_storage:
  # used to store files uploaded by the user temporarily while we are indexing them
  # file_connector_tmp_storage is legacy only now
  file_connector_tmp_storage:
  db_volume:
  vespa_volume:
  # Created by the container itself

View File

@ -20,6 +20,7 @@ services:
# Auth Settings # Auth Settings
- AUTH_TYPE=${AUTH_TYPE:-disabled} - AUTH_TYPE=${AUTH_TYPE:-disabled}
- SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400} - SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400}
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
- VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-} - VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-}
- GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-} - GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-}
- GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-} - GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-}
@ -46,6 +47,7 @@ services:
- DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-} - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-} - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-} - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
# if set, allows for the use of the token budget system # if set, allows for the use of the token budget system
- TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-} - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
# Enables the use of bedrock models # Enables the use of bedrock models
@ -79,9 +81,6 @@ services:
# If set to `true` will enable additional logs about Vespa query performance # If set to `true` will enable additional logs about Vespa query performance
# (time spent on finding the right docs + time spent fetching summaries from disk) # (time spent on finding the right docs + time spent fetching summaries from disk)
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -104,6 +103,7 @@ services:
- indexing_model_server - indexing_model_server
restart: always restart: always
environment: environment:
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
# Gen AI Settings (Needed by DanswerBot) # Gen AI Settings (Needed by DanswerBot)
- GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-} - GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-}
- GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-} - GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-}
@ -122,6 +122,7 @@ services:
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-} - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
- GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-} - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-} - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
# Query Options # Query Options
- DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years) - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
- HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector) - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
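
The two query options above are the ones with the most direct tuning impact. Per the formula in the DOC_TIME_DECAY comment, a value of 1.0 means a one-year-old document keeps 1 / (1 + 1.0 * 1) = 0.5 of its recency weight and a two-year-old document roughly 0.33, while HYBRID_ALPHA blends keyword (0) against vector (1) scoring. A sketch of a deployment that prefers fresher documents and leans toward keyword matching; the service name is assumed and the values are illustrative, not defaults.

services:
  background:    # assumed service name; apply wherever search queries are served
    environment:
      - DOC_TIME_DECAY=1.0   # 1-year-old doc -> 1 / (1 + 1.0 * 1) = 0.5 recency multiplier
      - HYBRID_ALPHA=0.4     # 0 = all keyword, 1 = all vector; 0.4 leans keyword
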
@ -177,9 +178,6 @@ services:
- LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs - LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-} - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -233,6 +231,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -268,6 +267,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -335,11 +335,6 @@ services:
volumes: volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume: db_volume:
vespa_volume: vespa_volume:
# Created by the container itself # Created by the container itself

View File

@ -21,9 +21,6 @@ services:
- POSTGRES_HOST=relational_db - POSTGRES_HOST=relational_db
- VESPA_HOST=index - VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -53,9 +50,6 @@ services:
- VESPA_HOST=index - VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server} - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -107,6 +101,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -134,6 +129,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -205,11 +201,6 @@ services:
volumes: volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume: db_volume:
vespa_volume: vespa_volume:
# Created by the container itself # Created by the container itself

View File

@ -21,9 +21,6 @@ services:
- POSTGRES_HOST=relational_db - POSTGRES_HOST=relational_db
- VESPA_HOST=index - VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -53,9 +50,6 @@ services:
- VESPA_HOST=index - VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server} - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server} - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
logging: logging:
@ -87,6 +81,8 @@ services:
options: options:
max-size: "50m" max-size: "50m"
max-file: "6" max-file: "6"
relational_db: relational_db:
image: postgres:15.2-alpine image: postgres:15.2-alpine
restart: always restart: always
@ -120,6 +116,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -147,6 +144,7 @@ services:
# Set to debug to get more fine-grained logs # Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info} - LOG_LEVEL=${LOG_LEVEL:-info}
volumes: volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/ - model_cache_huggingface:/root/.cache/huggingface/
logging: logging:
driver: json-file driver: json-file
@ -222,11 +220,6 @@ services:
volumes: volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume: db_volume:
vespa_volume: vespa_volume:
# Created by the container itself # Created by the container itself

View File

@ -52,10 +52,12 @@ const Main = () => {
{filesAreUploading && <Spinner />} {filesAreUploading && <Spinner />}
<Text className="mb-2"> <Text className="mb-2">
Specify files below, click the <b>Upload</b> button, and the contents of Specify files below, click the <b>Upload</b> button, and the contents of
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlxs</i>, <i>.csv</i>,{" "}
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
file types) are supported.
these files will be searchable via Danswer! Currently supported file
types include <i>.txt</i>, <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>,{" "}
<i>.xlsx</i>, <i>.csv</i>, <i>.md</i>, <i>.mdx</i>, <i>.conf</i>,{" "}
<i>.log</i>, <i>.json</i>, <i>.tsv</i>, <i>.xml</i>, <i>.yml</i>,{" "}
<i>.yaml</i>, <i>.eml</i>, <i>.epub</i>, and finally <i>.zip</i> files
(containing supported file types).
</Text> </Text>
<Text className="mb-3"> <Text className="mb-3">
<b>NOTE:</b> if the original document is accessible via a link, you can <b>NOTE:</b> if the original document is accessible via a link, you can