Consolidate File Processing (#1449)

Yuhong Sun 2024-05-11 23:11:22 -07:00 committed by GitHub
parent e89c81de76
commit 546815dc8c
31 changed files with 366 additions and 481 deletions

View File

@ -1,6 +1,4 @@
import os
from datetime import timedelta
from pathlib import Path
from typing import cast
from celery import Celery # type: ignore
@ -10,9 +8,7 @@ from danswer.background.connector_deletion import delete_connector_credential_pa
from danswer.background.task_utils import build_celery_task_wrapper
from danswer.background.task_utils import name_cc_cleanup_task
from danswer.background.task_utils import name_document_set_sync_task
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.connectors.file.utils import file_age_in_hours
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
from danswer.db.document import prepare_to_modify_documents
@ -203,21 +199,6 @@ def check_for_document_sets_sync_task() -> None:
)
@celery_app.task(name="clean_old_temp_files_task", soft_time_limit=JOB_TIMEOUT)
def clean_old_temp_files_task(
age_threshold_in_hours: float | int = 24 * 7,  # 1 week
base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
) -> None:
"""Files added via the File connector need to be deleted after ingestion
Currently handled async of the indexing job"""
os.makedirs(base_path, exist_ok=True)
for file in os.listdir(base_path):
full_file_path = Path(base_path) / file
if file_age_in_hours(full_file_path) > age_threshold_in_hours:
logger.info(f"Cleaning up uploaded file: {full_file_path}")
os.remove(full_file_path)
#####
# Celery Beat (Periodic Tasks) Settings
#####

View File

@ -148,10 +148,6 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
# TODO these should be available for frontend configuration, via advanced options expandable
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
@ -237,10 +233,9 @@ DISABLE_DOCUMENT_CLEANUP = (
#####
# Miscellaneous
#####
DYNAMIC_CONFIG_STORE = (
os.environ.get("DYNAMIC_CONFIG_STORE") or "PostgresBackedDynamicConfigStore"
)
DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage")
# File based Key Value store no longer used
DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore"
JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default
# used to allow the background indexing jobs to use a different embedding
# model server than the API server

View File

@ -8,7 +8,6 @@ from pydantic import BaseModel
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
process_in_batches,
)
@ -23,6 +22,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger

View File

@ -7,7 +7,6 @@ from typing import Any
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.bookstack.client import BookStackApiClient
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -16,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
class BookstackConnector(LoadConnector, PollConnector):

View File

@ -19,7 +19,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.rate_limit_handler import (
make_confluence_call_handle_rate_limit,
)
from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -28,6 +27,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import format_document_soup
from danswer.utils.logger import setup_logger
logger = setup_logger()

View File

@ -1,158 +0,0 @@
import json
import os
import re
import zipfile
from collections.abc import Iterator
from typing import Any
from typing import IO
import chardet
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from danswer.utils.logger import setup_logger
logger = setup_logger()
def extract_metadata(line: str) -> dict | None:
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
html_comment_match = re.search(html_comment_pattern, line)
hashtag_match = re.search(hashtag_pattern, line)
if html_comment_match:
json_str = html_comment_match.group(1)
elif hashtag_match:
json_str = hashtag_match.group(1)
else:
return None
try:
return json.loads("{" + json_str + "}")
except json.JSONDecodeError:
return None
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
try:
pdf_reader = PdfReader(file)
# If marked as encrypted and a password is provided, try to decrypt
if pdf_reader.is_encrypted and pdf_pass is not None:
decrypt_success = False
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error(f"Unable to decrypt pdf {file_name}")
else:
logger.info(f"No Password available to to decrypt pdf {file_name}")
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return ""
return "\n".join(page.extract_text() for page in pdf_reader.pages)
except PdfStreamError:
logger.exception(f"PDF file {file_name} is not a valid PDF")
except Exception:
logger.exception(f"Failed to read PDF {file_name}")
# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return ""
def is_macos_resource_fork_file(file_name: str) -> bool:
return os.path.basename(file_name).startswith("._") and file_name.startswith(
"__MACOSX"
)
# To include additional metadata in the search index, add a .danswer_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
zip_file_io: IO,
ignore_macos_resource_fork_files: bool = True,
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
with zip_file.open(metadata_file_info, "r") as metadata_file:
try:
zip_metadata = json.load(metadata_file)
if isinstance(zip_metadata, list):
# convert list of dicts to dict of dicts
zip_metadata = {d["filename"]: d for d in zip_metadata}
except json.JSONDecodeError:
logger.warn("Unable to load .danswer_metadata.json")
except KeyError:
logger.info("No .danswer_metadata.json file")
for file_info in zip_file.infolist():
with zip_file.open(file_info.filename, "r") as file:
if ignore_dirs and file_info.is_dir():
continue
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
file_info.filename
):
continue
yield file_info, file, zip_metadata.get(file_info.filename, {})
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
file.seek(0)
return encoding
def read_file(
file: IO, encoding: str = "utf-8", errors: str = "replace"
) -> tuple[str, dict]:
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file):
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
line = (
line.decode(encoding, errors=errors)
if isinstance(line, bytes)
else line
)
if ind == 0:
metadata_or_none = extract_metadata(line)
if metadata_or_none is not None:
metadata = metadata_or_none
else:
file_content_raw += line
else:
file_content_raw += line
return file_content_raw, metadata
def is_text_file_extension(file_name: str) -> bool:
extensions = (
".txt",
".mdx",
".md",
".conf",
".log",
".json",
".xml",
".yaml",
".yml",
".json",
)
return any(file_name.endswith(ext) for ext in extensions)

View File

@ -10,7 +10,6 @@ from requests import Response
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
@ -20,6 +19,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
logger = setup_logger()

View File

@ -8,7 +8,6 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
@ -22,6 +21,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
# Limitations and Potential Improvements
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in

View File

@ -1,36 +1,30 @@
import csv # type: ignore
import io
import os
import zipfile
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import IO
import docx2txt # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.file.utils import check_file_ext_is_valid
from danswer.connectors.file.utils import get_file_ext
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.db.engine import get_sqlalchemy_engine
from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import detect_encoding
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
@ -54,18 +48,7 @@ def _read_files_and_metadata(
file_content, ignore_dirs=True
):
yield os.path.join(directory_path, file_info.filename), file, metadata
elif extension in [
".txt",
".md",
".mdx",
".pdf",
".docx",
".pptx",
".xlsx",
".csv",
".eml",
".epub",
]:
elif check_file_ext_is_valid(extension):
yield file_name, file_content, metadata
else:
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@ -84,65 +67,20 @@ def _process_file(
file_metadata: dict[str, Any] = {}
if extension == ".pdf":
file_content_raw = read_pdf_file(
file=file, file_name=file_name, pdf_pass=pdf_pass
if is_text_file_extension(file_name):
encoding = detect_encoding(file)
file_content_raw, file_metadata = read_text_file(file, encoding=encoding)
# Using the PDF reader function directly to pass in password cleanly
elif extension == ".pdf":
file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
else:
file_content_raw = extract_file_text(
file_name=file_name,
file=file,
)
elif extension == ".docx":
file_content_raw = docx2txt.process(file)
elif extension == ".pptx":
presentation = pptx.Presentation(file)
text_content = []
for slide_number, slide in enumerate(presentation.slides, start=1):
extracted_text = f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
extracted_text += shape.text + "\n"
text_content.append(extracted_text)
file_content_raw = "\n\n".join(text_content)
elif extension == ".xlsx":
workbook = openpyxl.load_workbook(file)
text_content = []
for sheet in workbook.worksheets:
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
text_content.append(sheet_string)
file_content_raw = "\n\n".join(text_content)
elif extension == ".csv":
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
reader = csv.reader(text_file)
file_content_raw = "\n".join([",".join(row) for row in reader])
elif extension == ".eml":
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
parser = EmailParser()
message = parser.parse(text_file)
text_content = []
for part in message.walk():
if part.get_content_type().startswith("text/plain"):
text_content.append(part.get_payload())
file_content_raw = "\n\n".join(text_content)
elif extension == ".epub":
with zipfile.ZipFile(file) as epub:
text_content = []
for item in epub.infolist():
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
with epub.open(item) as html_file:
soup = BeautifulSoup(html_file, "html.parser")
text_content.append(soup.get_text())
file_content_raw = "\n\n".join(text_content)
else:
encoding = detect_encoding(file)
file_content_raw, file_metadata = read_file(file, encoding=encoding)
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
# If this is set, we will show this in the UI as the "name" of the file
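
With the extension handling consolidated, the hard-coded list above becomes a single check against VALID_FILE_EXTENSIONS. A minimal, illustrative sketch (the file names are invented; both helpers are defined in danswer.file_processing.extract_file_text, shown later in this commit):

from danswer.file_processing.extract_file_text import check_file_ext_is_valid
from danswer.file_processing.extract_file_text import get_file_ext

# ".pdf" is part of VALID_FILE_EXTENSIONS, ".gz" is not
assert check_file_ext_is_valid(get_file_ext("report.pdf"))
assert not check_file_ext_is_valid(get_file_ext("archive.tar.gz"))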

View File

@ -1,66 +0,0 @@
import os
import shutil
import time
import uuid
from pathlib import Path
from typing import Any
from typing import IO
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
_VALID_FILE_EXTENSIONS = [
".txt",
".zip",
".pdf",
".md",
".mdx",
".docx",
".pptx",
".xlsx",
".csv",
".eml",
".epub",
]
def get_file_ext(file_path_or_name: str | Path) -> str:
_, extension = os.path.splitext(file_path_or_name)
return extension
def check_file_ext_is_valid(ext: str) -> bool:
return ext in _VALID_FILE_EXTENSIONS
def write_temp_files(
files: list[tuple[str, IO[Any]]],
base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
) -> list[str]:
"""Writes temporary files to disk and returns their paths
NOTE: need to pass in (file_name, File) tuples since the SpooledTemporaryFile
exposed by FastAPI's `UploadFile` class does not include a name.
"""
file_location = Path(base_path) / str(uuid.uuid4())
os.makedirs(file_location, exist_ok=True)
file_paths: list[str] = []
for file_name, file in files:
extension = get_file_ext(file_name)
if not check_file_ext_is_valid(extension):
raise ValueError(
f"Invalid file extension for file: '{file_name}'. Must be one of {_VALID_FILE_EXTENSIONS}"
)
file_path = file_location / file_name
with open(file_path, "wb") as buffer:
# copy file content from uploaded file to the newly created file
shutil.copyfileobj(file, buffer)
file_paths.append(str(file_path.absolute()))
return file_paths
def file_age_in_hours(filepath: str | Path) -> float:
return (time.time() - os.path.getmtime(filepath)) / (60 * 60)

View File

@ -1,5 +1,4 @@
import io
import tempfile
from collections.abc import Iterator
from collections.abc import Sequence
from datetime import datetime
@ -9,7 +8,6 @@ from itertools import chain
from typing import Any
from typing import cast
import docx2txt # type:ignore
from google.auth.credentials import Credentials # type: ignore
from googleapiclient import discovery # type: ignore
from googleapiclient.errors import HttpError # type: ignore
@ -21,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import (
get_google_drive_creds_for_authorized_user,
@ -42,6 +39,8 @@ from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
@ -321,15 +320,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
)
elif mime_type == GDriveMimeType.WORD_DOC.value:
response = service.files().get_media(fileId=file["id"]).execute()
word_stream = io.BytesIO(response)
with tempfile.NamedTemporaryFile(delete=False) as temp:
temp.write(word_stream.getvalue())
temp_path = temp.name
return docx2txt.process(temp_path)
return docx_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
return file_contents
return pdf_to_text(file=io.BytesIO(response))
return UNSUPPORTED_FILE_TYPE_CONTENT
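
The SharePoint and web connector changes later in this commit follow the same pattern: fetch the raw bytes, wrap them in io.BytesIO, and pass the stream to the matching danswer.file_processing helper instead of writing a temporary file. A hedged sketch of that shared flow (bytes_to_text and the MIME strings are illustrative, not part of the commit):

import io

from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text


def bytes_to_text(raw: bytes, mime_type: str) -> str:
    # Every helper accepts a file-like object, so no temp files are needed
    if mime_type == "application/pdf":
        return pdf_to_text(file=io.BytesIO(raw))
    if mime_type.endswith("wordprocessingml.document"):
        return docx_to_text(file=io.BytesIO(raw))
    # Fall back to treating the payload as plain text
    return raw.decode("utf-8", errors="replace")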

View File

@ -9,14 +9,14 @@ from sqlalchemy.orm import Session
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.db.engine import get_sqlalchemy_engine
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
@ -86,7 +86,7 @@ class GoogleSitesConnector(LoadConnector):
if extension != ".html":
continue
file_content, _ = read_file(file_io)
file_content, _ = read_text_file(file_io)
soup = BeautifulSoup(file_content, "html.parser")
# get the link out of the navbar

View File

@ -7,7 +7,6 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -17,6 +16,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
# Potential Improvements

View File

@ -9,10 +9,6 @@ from requests_oauthlib import OAuth2Session # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.html_utils import (
strip_excessive_newlines_and_spaces,
)
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -22,6 +18,8 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.file_processing.html_utils import strip_excessive_newlines_and_spaces
from danswer.utils.logger import setup_logger
LOOPIO_API_BASE = "https://api.loopio.com/"

View File

@ -1,22 +1,16 @@
import io
import os
import tempfile
from datetime import datetime
from datetime import timezone
from typing import Any
import docx # type: ignore
import msal # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from office365.graph_client import GraphClient # type: ignore
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore
from office365.onedrive.sites.site import Site # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import is_text_file_extension
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -25,6 +19,12 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import file_io_to_text
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import xlsx_to_text
from danswer.utils.logger import setup_logger
UNSUPPORTED_FILE_TYPE_CONTENT = "" # idea copied from the google drive side of things
@ -35,62 +35,28 @@ logger = setup_logger()
def get_text_from_xlsx_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value
excel_file = io.BytesIO(file_content)
workbook = openpyxl.load_workbook(excel_file, read_only=True)
full_text = []
for sheet in workbook.worksheets:
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
full_text.append(sheet_string)
return "\n".join(full_text)
return xlsx_to_text(file=io.BytesIO(file_content))
def get_text_from_docx_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value
full_text = []
with tempfile.TemporaryDirectory() as local_path:
with open(os.path.join(local_path, driveitem_object.name), "wb") as local_file:
local_file.write(file_content)
doc = docx.Document(local_file.name)
for para in doc.paragraphs:
full_text.append(para.text)
return "\n".join(full_text)
return docx_to_text(file=io.BytesIO(file_content))
def get_text_from_pdf_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value
file_text = read_pdf_file(
file=io.BytesIO(file_content), file_name=driveitem_object.name
)
file_text = pdf_to_text(file=io.BytesIO(file_content))
return file_text
def get_text_from_txt_driveitem(driveitem_object: DriveItem) -> str:
file_content: bytes = driveitem_object.get_content().execute_query().value
text_string = file_content.decode("utf-8")
return text_string
return file_io_to_text(file=io.BytesIO(file_content))
def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value
pptx_stream = io.BytesIO(file_content)
with tempfile.NamedTemporaryFile() as temp:
temp.write(pptx_stream.getvalue())
presentation = pptx.Presentation(temp.name)
extracted_text = ""
for slide_number, slide in enumerate(presentation.slides, start=1):
extracted_text += f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
extracted_text += shape.text + "\n"
return extracted_text
return pptx_to_text(file=io.BytesIO(file_content))
class SharepointConnector(LoadConnector, PollConnector):

View File

@ -22,12 +22,12 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.utils.logger import setup_logger
logger = setup_logger()
@ -247,9 +247,7 @@ class WebConnector(LoadConnector):
if current_url.split(".")[-1] == "pdf":
# PDF files are not checked for links
response = requests.get(current_url)
page_text = read_pdf_file(
file=io.BytesIO(response.content), file_name=current_url
)
page_text = pdf_to_text(file=io.BytesIO(response.content))
doc_batch.append(
Document(

View File

@ -5,8 +5,9 @@ from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
time_str_to_utc,
)
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -14,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
def _article_to_document(article: Article) -> Document:

View File

@ -1,4 +1,3 @@
from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE
from danswer.dynamic_configs.interface import DynamicConfigStore
from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore
@ -8,7 +7,7 @@ from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore
def get_dynamic_config_store() -> DynamicConfigStore:
dynamic_config_store_type = DYNAMIC_CONFIG_STORE
if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__:
return FileSystemBackedDynamicConfigStore(DYNAMIC_CONFIG_DIR_PATH)
raise NotImplementedError("File based config store no longer supported")
if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__:
return PostgresBackedDynamicConfigStore()

View File

@ -2,7 +2,6 @@ import json
from pathlib import Path
from typing import cast
from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
@ -53,7 +52,7 @@ def insert_into_postgres(store_data: dict) -> None:
config_store.store(port_once_key, True)
def port_filesystem_to_postgres(directory_path: str = DYNAMIC_CONFIG_DIR_PATH) -> None:
def port_filesystem_to_postgres(directory_path: str) -> None:
store_data = read_file_system_store(directory_path)
insert_into_postgres(store_data)
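
Since the directory default was dropped, any remaining migration from the old file-backed store must name the directory explicitly. A small sketch; the path is the previous DYNAMIC_CONFIG_DIR_PATH default removed earlier in this commit:

from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres

# One-off migration of a legacy /home/storage volume into Postgres
port_filesystem_to_postgres("/home/storage")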

View File

@ -0,0 +1,283 @@
import io
import json
import os
import re
import zipfile
from collections.abc import Iterator
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import IO
import chardet
import docx # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
logger = setup_logger()
TEXT_SECTION_SEPARATOR = "\n\n"
PLAIN_TEXT_FILE_EXTENSIONS = [
".txt",
".md",
".mdx",
".conf",
".log",
".json",
".csv",
".tsv",
".xml",
".yml",
".yaml",
]
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".pdf",
".docx",
".pptx",
".xlsx",
".eml",
".epub",
]
def is_text_file_extension(file_name: str) -> bool:
return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
def get_file_ext(file_path_or_name: str | Path) -> str:
_, extension = os.path.splitext(file_path_or_name)
return extension
def check_file_ext_is_valid(ext: str) -> bool:
return ext in VALID_FILE_EXTENSIONS
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
file.seek(0)
return encoding
def is_macos_resource_fork_file(file_name: str) -> bool:
return os.path.basename(file_name).startswith("._") and file_name.startswith(
"__MACOSX"
)
# To include additional metadata in the search index, add a .danswer_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
zip_file_io: IO,
ignore_macos_resource_fork_files: bool = True,
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
with zip_file.open(metadata_file_info, "r") as metadata_file:
try:
zip_metadata = json.load(metadata_file)
if isinstance(zip_metadata, list):
# convert list of dicts to dict of dicts
zip_metadata = {d["filename"]: d for d in zip_metadata}
except json.JSONDecodeError:
logger.warn("Unable to load .danswer_metadata.json")
except KeyError:
logger.info("No .danswer_metadata.json file")
for file_info in zip_file.infolist():
with zip_file.open(file_info.filename, "r") as file:
if ignore_dirs and file_info.is_dir():
continue
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
file_info.filename
):
continue
yield file_info, file, zip_metadata.get(file_info.filename, {})
def _extract_danswer_metadata(line: str) -> dict | None:
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
html_comment_match = re.search(html_comment_pattern, line)
hashtag_match = re.search(hashtag_pattern, line)
if html_comment_match:
json_str = html_comment_match.group(1)
elif hashtag_match:
json_str = hashtag_match.group(1)
else:
return None
try:
return json.loads("{" + json_str + "}")
except json.JSONDecodeError:
return None
def read_text_file(
file: IO,
encoding: str = "utf-8",
errors: str = "replace",
ignore_danswer_metadata: bool = True,
) -> tuple[str, dict]:
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file):
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
line = (
line.decode(encoding, errors=errors)
if isinstance(line, bytes)
else line
)
if ind == 0:
metadata_or_none = (
None if ignore_danswer_metadata else _extract_danswer_metadata(line)
)
if metadata_or_none is not None:
metadata = metadata_or_none
else:
file_content_raw += line
else:
file_content_raw += line
return file_content_raw, metadata
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
try:
pdf_reader = PdfReader(file)
# If marked as encrypted and a password is provided, try to decrypt
if pdf_reader.is_encrypted and pdf_pass is not None:
decrypt_success = False
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error("Unable to decrypt pdf")
else:
logger.info("No Password available to to decrypt pdf")
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return ""
return TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
)
except PdfStreamError:
logger.exception("PDF file is not a valid PDF")
except Exception:
logger.exception("Failed to read PDF")
# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return ""
def docx_to_text(file: IO[Any]) -> str:
doc = docx.Document(file)
full_text = [para.text for para in doc.paragraphs]
return TEXT_SECTION_SEPARATOR.join(full_text)
def pptx_to_text(file: IO[Any]) -> str:
presentation = pptx.Presentation(file)
text_content = []
for slide_number, slide in enumerate(presentation.slides, start=1):
extracted_text = f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
extracted_text += shape.text + "\n"
text_content.append(extracted_text)
return TEXT_SECTION_SEPARATOR.join(text_content)
def xlsx_to_text(file: IO[Any]) -> str:
workbook = openpyxl.load_workbook(file)
text_content = []
for sheet in workbook.worksheets:
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
text_content.append(sheet_string)
return TEXT_SECTION_SEPARATOR.join(text_content)
def eml_to_text(file: IO[Any]) -> str:
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
parser = EmailParser()
message = parser.parse(text_file)
text_content = []
for part in message.walk():
if part.get_content_type().startswith("text/plain"):
text_content.append(part.get_payload())
return TEXT_SECTION_SEPARATOR.join(text_content)
def epub_to_text(file: IO[Any]) -> str:
with zipfile.ZipFile(file) as epub:
text_content = []
for item in epub.infolist():
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
with epub.open(item) as html_file:
text_content.append(parse_html_page_basic(html_file))
return TEXT_SECTION_SEPARATOR.join(text_content)
def file_io_to_text(file: IO[Any]) -> str:
encoding = detect_encoding(file)
file_content_raw, _ = read_text_file(file, encoding=encoding)
return file_content_raw
def extract_file_text(
file_name: str,
file: IO[Any],
) -> str:
extension = get_file_ext(file_name)
if not check_file_ext_is_valid(extension):
raise RuntimeError("Unprocessable file type")
if extension == ".pdf":
return pdf_to_text(file=file)
elif extension == ".docx":
return docx_to_text(file)
elif extension == ".pptx":
return pptx_to_text(file)
elif extension == ".xlsx":
return xlsx_to_text(file)
elif extension == ".eml":
return eml_to_text(file)
elif extension == ".epub":
return epub_to_text(file)
else:
return file_io_to_text(file)
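
A hedged usage sketch tying the new module together, combining the .danswer_metadata.json handling with the generic dispatcher; the file names, contents, and links are invented for illustration:

import io
import json
import zipfile

from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import load_files_from_zip

# Build an in-memory zip with a .danswer_metadata.json manifest, as described above
zip_buf = io.BytesIO()
with zipfile.ZipFile(zip_buf, "w") as zf:
    zf.writestr("notes.txt", "Quarterly planning notes")
    zf.writestr("faq.md", "# FAQ\nHow do I reset my password?")
    zf.writestr(
        ".danswer_metadata.json",
        json.dumps(
            [
                {"filename": "notes.txt", "link": "https://example.com/notes"},
                {"filename": "faq.md", "link": "https://example.com/faq"},
            ]
        ),
    )
zip_buf.seek(0)

for file_info, file_io, metadata in load_files_from_zip(zip_buf):
    if file_info.filename == ".danswer_metadata.json":
        continue  # the manifest itself is not indexed
    text = extract_file_text(file_name=file_info.filename, file=file_io)
    print(file_info.filename, metadata.get("link"), len(text))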

View File

@ -1,6 +1,7 @@
import re
from copy import copy
from dataclasses import dataclass
from typing import IO
import bs4
@ -118,7 +119,7 @@ def format_document_soup(
return strip_excessive_newlines_and_spaces(text)
def parse_html_page_basic(text: str) -> str:
def parse_html_page_basic(text: str | IO[bytes]) -> str:
soup = bs4.BeautifulSoup(text, "html.parser")
return format_document_soup(soup)
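
Widening parse_html_page_basic to accept str | IO[bytes] lets callers hand over either raw HTML text or a binary stream (as epub_to_text now does with .xhtml entries), since BeautifulSoup accepts both. Illustrative only:

import io

from danswer.file_processing.html_utils import parse_html_page_basic

html = "<html><body><p>Hello</p></body></html>"
print(parse_html_page_basic(html))                       # from a string
print(parse_html_page_basic(io.BytesIO(html.encode())))  # from a binary stream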

View File

@ -46,8 +46,6 @@ from danswer.db.index_attempt import cancel_indexing_attempts_past_model
from danswer.db.index_attempt import expire_index_attempts
from danswer.db.swap_index import check_index_swap
from danswer.document_index.factory import get_default_document_index
from danswer.dynamic_configs.port_configs import port_api_key_to_postgres
from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.search.search_nlp_models import warm_up_encoders
from danswer.server.auth_check import check_router_auth
@ -162,18 +160,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
f"Using multilingual flow with languages: {MULTILINGUAL_QUERY_EXPANSION}"
)
try:
port_filesystem_to_postgres()
except Exception:
logger.debug(
"Skipping port of persistent volumes. Maybe these have already been removed?"
)
try:
port_api_key_to_postgres()
except Exception as e:
logger.debug(f"Failed to port API keys. Exception: {e}. Continuing...")
with Session(engine) as db_session:
check_index_swap(db_session=db_session)
db_embedding_model = get_current_db_embedding_model(db_session)

View File

@ -30,7 +30,6 @@ llama-index==0.9.45
Mako==1.2.4
msal==1.26.0
nltk==3.8.1
docx2txt==0.8
Office365-REST-Python-Client==2.5.4
oauthlib==3.2.2
openai==1.3.5

View File

@ -49,8 +49,6 @@ def run_jobs(exclude_indexing: bool) -> None:
if not exclude_indexing:
update_env = os.environ.copy()
update_env["PYTHONPATH"] = "."
update_env["DYNAMIC_CONFIG_DIR_PATH"] = "./dynamic_config_storage"
update_env["FILE_CONNECTOR_TMP_STORAGE_PATH"] = "./dynamic_config_storage"
cmd_indexing = ["python", "danswer/background/update.py"]
indexing_process = subprocess.Popen(

View File

@ -1,7 +1,7 @@
import pathlib
import unittest
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.file_processing.html_utils import parse_html_page_basic
class TestQAPostprocessing(unittest.TestCase):

View File

@ -81,9 +81,6 @@ services:
# If set to `true` will enable additional logs about Vespa query performance
# (time spent on finding the right docs + time spent fetching summaries from disk)
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -181,9 +178,6 @@ services:
- LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -229,6 +223,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -256,6 +251,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -323,11 +319,6 @@ services:
volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume:
vespa_volume:
# Created by the container itself

View File

@ -20,6 +20,7 @@ services:
# Auth Settings
- AUTH_TYPE=${AUTH_TYPE:-disabled}
- SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400}
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
- VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-}
- GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-}
- GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-}
@ -46,6 +47,7 @@ services:
- DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
# if set, allows for the use of the token budget system
- TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
# Enables the use of bedrock models
@ -79,9 +81,6 @@ services:
# If set to `true` will enable additional logs about Vespa query performance
# (time spent on finding the right docs + time spent fetching summaries from disk)
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -104,6 +103,7 @@ services:
- indexing_model_server
restart: always
environment:
- ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
# Gen AI Settings (Needed by DanswerBot)
- GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-}
- GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-}
@ -122,6 +122,7 @@ services:
- DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
- GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
- DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
- LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
# Query Options
- DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
- HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
@ -177,9 +178,6 @@ services:
- LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
- LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -233,6 +231,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -268,6 +267,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -335,11 +335,6 @@ services:
volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume:
vespa_volume:
# Created by the container itself

View File

@ -21,9 +21,6 @@ services:
- POSTGRES_HOST=relational_db
- VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -53,9 +50,6 @@ services:
- VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -107,6 +101,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -134,6 +129,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -205,11 +201,6 @@ services:
volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume:
vespa_volume:
# Created by the container itself

View File

@ -21,9 +21,6 @@ services:
- POSTGRES_HOST=relational_db
- VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -53,9 +50,6 @@ services:
- VESPA_HOST=index
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
- INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
@ -87,6 +81,8 @@ services:
options:
max-size: "50m"
max-file: "6"
relational_db:
image: postgres:15.2-alpine
restart: always
@ -120,6 +116,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -147,6 +144,7 @@ services:
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
volumes:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
logging:
driver: json-file
@ -222,11 +220,6 @@ services:
volumes:
# local_dynamic_storage is legacy only now
local_dynamic_storage:
# used to store files uploaded by the user temporarily while we are indexing them
# file_connector_tmp_storage is legacy only now
file_connector_tmp_storage:
db_volume:
vespa_volume:
# Created by the container itself

View File

@ -52,10 +52,12 @@ const Main = () => {
{filesAreUploading && <Spinner />}
<Text className="mb-2">
Specify files below, click the <b>Upload</b> button, and the contents of
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlxs</i>, <i>.csv</i>,{" "}
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
file types) are supported.
these files will be searchable via Danswer! Currently supported file
types include <i>.txt</i>, <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>,{" "}
<i>.xlsx</i>, <i>.csv</i>, <i>.md</i>, <i>.mdx</i>, <i>.conf</i>,{" "}
<i>.log</i>, <i>.json</i>, <i>.tsv</i>, <i>.xml</i>, <i>.yml</i>,{" "}
<i>.yaml</i>, <i>.eml</i>, <i>.epub</i>, and finally <i>.zip</i> files
(containing supported file types).
</Text>
<Text className="mb-3">
<b>NOTE:</b> if the original document is accessible via a link, you can