Consolidate File Processing (#1449)
Repository: https://github.com/danswer-ai/danswer.git
@@ -1,6 +1,4 @@
-import os
 from datetime import timedelta
-from pathlib import Path
 from typing import cast

 from celery import Celery  # type: ignore

@@ -10,9 +8,7 @@ from danswer.background.connector_deletion import delete_connector_credential_pa
 from danswer.background.task_utils import build_celery_task_wrapper
 from danswer.background.task_utils import name_cc_cleanup_task
 from danswer.background.task_utils import name_document_set_sync_task
-from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
 from danswer.configs.app_configs import JOB_TIMEOUT
-from danswer.connectors.file.utils import file_age_in_hours
 from danswer.db.connector_credential_pair import get_connector_credential_pair
 from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
 from danswer.db.document import prepare_to_modify_documents

@@ -203,21 +199,6 @@ def check_for_document_sets_sync_task() -> None:
         )


-@celery_app.task(name="clean_old_temp_files_task", soft_time_limit=JOB_TIMEOUT)
-def clean_old_temp_files_task(
-    age_threshold_in_hours: float | int = 24 * 7,  # 1 week,
-    base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
-) -> None:
-    """Files added via the File connector need to be deleted after ingestion
-    Currently handled async of the indexing job"""
-    os.makedirs(base_path, exist_ok=True)
-    for file in os.listdir(base_path):
-        full_file_path = Path(base_path) / file
-        if file_age_in_hours(full_file_path) > age_threshold_in_hours:
-            logger.info(f"Cleaning up uploaded file: {full_file_path}")
-            os.remove(full_file_path)
-
-
 #####
 # Celery Beat (Periodic Tasks) Settings
 #####
@@ -148,10 +148,6 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
 GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
 GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False

-FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
-    "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
-)
-
 # TODO these should be available for frontend configuration, via advanced options expandable
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
     "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"

@@ -237,10 +233,9 @@ DISABLE_DOCUMENT_CLEANUP = (
 #####
 # Miscellaneous
 #####
-DYNAMIC_CONFIG_STORE = (
-    os.environ.get("DYNAMIC_CONFIG_STORE") or "PostgresBackedDynamicConfigStore"
-)
-DYNAMIC_CONFIG_DIR_PATH = os.environ.get("DYNAMIC_CONFIG_DIR_PATH", "/home/storage")
+# File based Key Value store no longer used
+DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore"
 JOB_TIMEOUT = 60 * 60 * 6  # 6 hours default
 # used to allow the background indexing jobs to use a different embedding
 # model server than the API server
@@ -8,7 +8,6 @@ from pydantic import BaseModel

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
     process_in_batches,
 )

@@ -23,6 +22,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.utils.logger import setup_logger

@@ -7,7 +7,6 @@ from typing import Any
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.bookstack.client import BookStackApiClient
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector

@@ -16,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic


 class BookstackConnector(LoadConnector, PollConnector):
@@ -19,7 +19,6 @@ from danswer.configs.constants import DocumentSource
 from danswer.connectors.confluence.rate_limit_handler import (
     make_confluence_call_handle_rate_limit,
 )
-from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector

@@ -28,6 +27,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import format_document_soup
 from danswer.utils.logger import setup_logger

 logger = setup_logger()
@@ -1,158 +0,0 @@
-import json
-import os
-import re
-import zipfile
-from collections.abc import Iterator
-from typing import Any
-from typing import IO
-
-import chardet
-from pypdf import PdfReader
-from pypdf.errors import PdfStreamError
-
-from danswer.utils.logger import setup_logger
-
-
-logger = setup_logger()
-
-
-def extract_metadata(line: str) -> dict | None:
-    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
-    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
-
-    html_comment_match = re.search(html_comment_pattern, line)
-    hashtag_match = re.search(hashtag_pattern, line)
-
-    if html_comment_match:
-        json_str = html_comment_match.group(1)
-    elif hashtag_match:
-        json_str = hashtag_match.group(1)
-    else:
-        return None
-
-    try:
-        return json.loads("{" + json_str + "}")
-    except json.JSONDecodeError:
-        return None
-
-
-def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
-    try:
-        pdf_reader = PdfReader(file)
-
-        # If marked as encrypted and a password is provided, try to decrypt
-        if pdf_reader.is_encrypted and pdf_pass is not None:
-            decrypt_success = False
-            if pdf_pass is not None:
-                try:
-                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
-                except Exception:
-                    logger.error(f"Unable to decrypt pdf {file_name}")
-            else:
-                logger.info(f"No Password available to to decrypt pdf {file_name}")
-
-            if not decrypt_success:
-                # By user request, keep files that are unreadable just so they
-                # can be discoverable by title.
-                return ""
-
-        return "\n".join(page.extract_text() for page in pdf_reader.pages)
-    except PdfStreamError:
-        logger.exception(f"PDF file {file_name} is not a valid PDF")
-    except Exception:
-        logger.exception(f"Failed to read PDF {file_name}")
-
-    # File is still discoverable by title
-    # but the contents are not included as they cannot be parsed
-    return ""
-
-
-def is_macos_resource_fork_file(file_name: str) -> bool:
-    return os.path.basename(file_name).startswith("._") and file_name.startswith(
-        "__MACOSX"
-    )
-
-
-# To include additional metadata in the search index, add a .danswer_metadata.json file
-# to the zip file. This file should contain a list of objects with the following format:
-# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
-def load_files_from_zip(
-    zip_file_io: IO,
-    ignore_macos_resource_fork_files: bool = True,
-    ignore_dirs: bool = True,
-) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
-    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
-        zip_metadata = {}
-        try:
-            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
-            with zip_file.open(metadata_file_info, "r") as metadata_file:
-                try:
-                    zip_metadata = json.load(metadata_file)
-                    if isinstance(zip_metadata, list):
-                        # convert list of dicts to dict of dicts
-                        zip_metadata = {d["filename"]: d for d in zip_metadata}
-                except json.JSONDecodeError:
-                    logger.warn("Unable to load .danswer_metadata.json")
-        except KeyError:
-            logger.info("No .danswer_metadata.json file")
-
-        for file_info in zip_file.infolist():
-            with zip_file.open(file_info.filename, "r") as file:
-                if ignore_dirs and file_info.is_dir():
-                    continue
-
-                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
-                    file_info.filename
-                ):
-                    continue
-                yield file_info, file, zip_metadata.get(file_info.filename, {})
-
-
-def detect_encoding(file: IO[bytes]) -> str:
-    raw_data = file.read(50000)
-    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
-    file.seek(0)
-    return encoding
-
-
-def read_file(
-    file: IO, encoding: str = "utf-8", errors: str = "replace"
-) -> tuple[str, dict]:
-    metadata = {}
-    file_content_raw = ""
-    for ind, line in enumerate(file):
-        try:
-            line = line.decode(encoding) if isinstance(line, bytes) else line
-        except UnicodeDecodeError:
-            line = (
-                line.decode(encoding, errors=errors)
-                if isinstance(line, bytes)
-                else line
-            )
-
-        if ind == 0:
-            metadata_or_none = extract_metadata(line)
-            if metadata_or_none is not None:
-                metadata = metadata_or_none
-            else:
-                file_content_raw += line
-        else:
-            file_content_raw += line
-
-    return file_content_raw, metadata
-
-
-def is_text_file_extension(file_name: str) -> bool:
-    extensions = (
-        ".txt",
-        ".mdx",
-        ".md",
-        ".conf",
-        ".log",
-        ".json",
-        ".xml",
-        ".yaml",
-        ".yml",
-        ".json",
-    )
-    return any(file_name.endswith(ext) for ext in extensions)
@@ -10,7 +10,6 @@ from requests import Response

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
 from danswer.connectors.interfaces import GenerateDocumentsOutput

@@ -20,6 +19,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.utils.logger import setup_logger

 logger = setup_logger()
@@ -8,7 +8,6 @@ import requests

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
     rate_limit_builder,
 )

@@ -22,6 +21,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic

 # Limitations and Potential Improvements
 # 1. The "Categories themselves contain potentially relevant information" but they're not pulled in
@@ -1,36 +1,30 @@
-import csv  # type: ignore
-import io
 import os
-import zipfile
 from collections.abc import Iterator
 from datetime import datetime
 from datetime import timezone
-from email.parser import Parser as EmailParser
 from pathlib import Path
 from typing import Any
 from typing import IO

-import docx2txt  # type: ignore
-import openpyxl  # type: ignore
-import pptx  # type: ignore
-from bs4 import BeautifulSoup
 from sqlalchemy.orm import Session

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
-from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
-from danswer.connectors.cross_connector_utils.file_utils import read_file
-from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
-from danswer.connectors.file.utils import check_file_ext_is_valid
-from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.db.engine import get_sqlalchemy_engine
+from danswer.file_processing.extract_file_text import check_file_ext_is_valid
+from danswer.file_processing.extract_file_text import detect_encoding
+from danswer.file_processing.extract_file_text import extract_file_text
+from danswer.file_processing.extract_file_text import get_file_ext
+from danswer.file_processing.extract_file_text import is_text_file_extension
+from danswer.file_processing.extract_file_text import load_files_from_zip
+from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import read_text_file
 from danswer.file_store.file_store import get_default_file_store
 from danswer.utils.logger import setup_logger

@@ -54,18 +48,7 @@ def _read_files_and_metadata(
             file_content, ignore_dirs=True
         ):
             yield os.path.join(directory_path, file_info.filename), file, metadata
-    elif extension in [
-        ".txt",
-        ".md",
-        ".mdx",
-        ".pdf",
-        ".docx",
-        ".pptx",
-        ".xlsx",
-        ".csv",
-        ".eml",
-        ".epub",
-    ]:
+    elif check_file_ext_is_valid(extension):
         yield file_name, file_content, metadata
     else:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
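
Illustrative sketch (not part of the commit): the hardcoded extension list above is replaced by the shared validity helpers in danswer.file_processing.extract_file_text, which behave roughly like this (file names are placeholders):

    from danswer.file_processing.extract_file_text import check_file_ext_is_valid
    from danswer.file_processing.extract_file_text import get_file_ext

    # ".md" is one of the plain-text extensions, so it is accepted
    assert check_file_ext_is_valid(get_file_ext("notes.md")) is True
    # ".gz" is not in VALID_FILE_EXTENSIONS, so it is rejected
    assert check_file_ext_is_valid(get_file_ext("archive.tar.gz")) is False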
@@ -84,65 +67,20 @@ def _process_file(

     file_metadata: dict[str, Any] = {}

-    if extension == ".pdf":
-        file_content_raw = read_pdf_file(
-            file=file, file_name=file_name, pdf_pass=pdf_pass
-        )
-
-    elif extension == ".docx":
-        file_content_raw = docx2txt.process(file)
-
-    elif extension == ".pptx":
-        presentation = pptx.Presentation(file)
-        text_content = []
-        for slide_number, slide in enumerate(presentation.slides, start=1):
-            extracted_text = f"\nSlide {slide_number}:\n"
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    extracted_text += shape.text + "\n"
-
-            text_content.append(extracted_text)
-        file_content_raw = "\n\n".join(text_content)
-
-    elif extension == ".xlsx":
-        workbook = openpyxl.load_workbook(file)
-        text_content = []
-        for sheet in workbook.worksheets:
-            sheet_string = "\n".join(
-                ",".join(map(str, row))
-                for row in sheet.iter_rows(min_row=1, values_only=True)
-            )
-            text_content.append(sheet_string)
-        file_content_raw = "\n\n".join(text_content)
-
-    elif extension == ".csv":
-        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
-        reader = csv.reader(text_file)
-        file_content_raw = "\n".join([",".join(row) for row in reader])
-
-    elif extension == ".eml":
-        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
-        parser = EmailParser()
-        message = parser.parse(text_file)
-
-        text_content = []
-        for part in message.walk():
-            if part.get_content_type().startswith("text/plain"):
-                text_content.append(part.get_payload())
-        file_content_raw = "\n\n".join(text_content)
-
-    elif extension == ".epub":
-        with zipfile.ZipFile(file) as epub:
-            text_content = []
-            for item in epub.infolist():
-                if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
-                    with epub.open(item) as html_file:
-                        soup = BeautifulSoup(html_file, "html.parser")
-                        text_content.append(soup.get_text())
-            file_content_raw = "\n\n".join(text_content)
-    else:
-        encoding = detect_encoding(file)
-        file_content_raw, file_metadata = read_file(file, encoding=encoding)
+    if is_text_file_extension(file_name):
+        encoding = detect_encoding(file)
+        file_content_raw, file_metadata = read_text_file(file, encoding=encoding)
+
+    # Using the PDF reader function directly to pass in password cleanly
+    elif extension == ".pdf":
+        file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
+
+    else:
+        file_content_raw = extract_file_text(
+            file_name=file_name,
+            file=file,
+        )

     all_metadata = {**metadata, **file_metadata} if metadata else file_metadata

     # If this is set, we will show this in the UI as the "name" of the file
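For reference, a minimal sketch of the consolidated dispatch the File connector now uses; the function name and the file_name/file/pdf_pass parameters are chosen here for illustration and are not part of the diff:

    from typing import IO, Any

    from danswer.file_processing.extract_file_text import detect_encoding
    from danswer.file_processing.extract_file_text import extract_file_text
    from danswer.file_processing.extract_file_text import get_file_ext
    from danswer.file_processing.extract_file_text import is_text_file_extension
    from danswer.file_processing.extract_file_text import pdf_to_text
    from danswer.file_processing.extract_file_text import read_text_file


    def extract_like_file_connector(
        file_name: str, file: IO[Any], pdf_pass: str | None = None
    ) -> str:
        if is_text_file_extension(file_name):
            # plain-text formats keep the encoding-aware reader
            text, _metadata = read_text_file(file, encoding=detect_encoding(file))
            return text
        if get_file_ext(file_name) == ".pdf":
            # PDFs are handled directly so an optional password can be passed through
            return pdf_to_text(file=file, pdf_pass=pdf_pass)
        # everything else (.docx, .pptx, .xlsx, .eml, .epub) goes through the generic extractor
        return extract_file_text(file_name=file_name, file=file)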
@@ -1,66 +0,0 @@
-import os
-import shutil
-import time
-import uuid
-from pathlib import Path
-from typing import Any
-from typing import IO
-
-from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
-
-_VALID_FILE_EXTENSIONS = [
-    ".txt",
-    ".zip",
-    ".pdf",
-    ".md",
-    ".mdx",
-    ".docx",
-    ".pptx",
-    ".xlsx",
-    ".csv",
-    ".eml",
-    ".epub",
-]
-
-
-def get_file_ext(file_path_or_name: str | Path) -> str:
-    _, extension = os.path.splitext(file_path_or_name)
-    return extension
-
-
-def check_file_ext_is_valid(ext: str) -> bool:
-    return ext in _VALID_FILE_EXTENSIONS
-
-
-def write_temp_files(
-    files: list[tuple[str, IO[Any]]],
-    base_path: Path | str = FILE_CONNECTOR_TMP_STORAGE_PATH,
-) -> list[str]:
-    """Writes temporary files to disk and returns their paths
-
-    NOTE: need to pass in (file_name, File) tuples since FastAPI's `UploadFile` class
-    exposed SpooledTemporaryFile does not include a name.
-    """
-    file_location = Path(base_path) / str(uuid.uuid4())
-    os.makedirs(file_location, exist_ok=True)
-
-    file_paths: list[str] = []
-    for file_name, file in files:
-        extension = get_file_ext(file_name)
-        if not check_file_ext_is_valid(extension):
-            raise ValueError(
-                f"Invalid file extension for file: '{file_name}'. Must be one of {_VALID_FILE_EXTENSIONS}"
-            )
-
-        file_path = file_location / file_name
-        with open(file_path, "wb") as buffer:
-            # copy file content from uploaded file to the newly created file
-            shutil.copyfileobj(file, buffer)
-
-        file_paths.append(str(file_path.absolute()))
-
-    return file_paths
-
-
-def file_age_in_hours(filepath: str | Path) -> float:
-    return (time.time() - os.path.getmtime(filepath)) / (60 * 60)
@@ -1,5 +1,4 @@
 import io
-import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
 from datetime import datetime

@@ -9,7 +8,6 @@ from itertools import chain
 from typing import Any
 from typing import cast

-import docx2txt  # type:ignore
 from google.auth.credentials import Credentials  # type: ignore
 from googleapiclient import discovery  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore

@@ -21,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.configs.constants import IGNORE_FOR_QA
-from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
 from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
 from danswer.connectors.google_drive.connector_auth import (
     get_google_drive_creds_for_authorized_user,
@@ -42,6 +39,8 @@ from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.extract_file_text import docx_to_text
+from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger

@@ -321,15 +320,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
         )
     elif mime_type == GDriveMimeType.WORD_DOC.value:
         response = service.files().get_media(fileId=file["id"]).execute()
-        word_stream = io.BytesIO(response)
-        with tempfile.NamedTemporaryFile(delete=False) as temp:
-            temp.write(word_stream.getvalue())
-            temp_path = temp.name
-        return docx2txt.process(temp_path)
+        return docx_to_text(file=io.BytesIO(response))
     elif mime_type == GDriveMimeType.PDF.value:
         response = service.files().get_media(fileId=file["id"]).execute()
-        file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
-        return file_contents
+        return pdf_to_text(file=io.BytesIO(response))

     return UNSUPPORTED_FILE_TYPE_CONTENT

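A hedged usage sketch of the pattern introduced here: the Drive media bytes are wrapped in io.BytesIO and handed to the shared helpers. The function and its raw_bytes parameter are illustrative stand-ins for service.files().get_media(fileId=...).execute():

    import io

    from danswer.file_processing.extract_file_text import docx_to_text
    from danswer.file_processing.extract_file_text import pdf_to_text


    def drive_bytes_to_text(raw_bytes: bytes, is_pdf: bool) -> str:
        # raw_bytes stands in for the Drive API response body
        if is_pdf:
            return pdf_to_text(file=io.BytesIO(raw_bytes))
        return docx_to_text(file=io.BytesIO(raw_bytes))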
@@ -9,14 +9,14 @@ from sqlalchemy.orm import Session

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
-from danswer.connectors.cross_connector_utils.file_utils import read_file
-from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.db.engine import get_sqlalchemy_engine
+from danswer.file_processing.extract_file_text import load_files_from_zip
+from danswer.file_processing.extract_file_text import read_text_file
+from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.file_store.file_store import get_default_file_store
 from danswer.utils.logger import setup_logger

@@ -86,7 +86,7 @@ class GoogleSitesConnector(LoadConnector):
             if extension != ".html":
                 continue

-            file_content, _ = read_file(file_io)
+            file_content, _ = read_text_file(file_io)
             soup = BeautifulSoup(file_content, "html.parser")

             # get the link out of the navbar
@@ -7,7 +7,6 @@ import requests

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector

@@ -17,6 +16,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.utils.logger import setup_logger

 # Potential Improvements
@@ -9,10 +9,6 @@ from requests_oauthlib import OAuth2Session  # type: ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
-from danswer.connectors.cross_connector_utils.html_utils import (
-    strip_excessive_newlines_and_spaces,
-)
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector

@@ -22,6 +18,8 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic
+from danswer.file_processing.html_utils import strip_excessive_newlines_and_spaces
 from danswer.utils.logger import setup_logger

 LOOPIO_API_BASE = "https://api.loopio.com/"
@@ -1,22 +1,16 @@
 import io
 import os
-import tempfile
 from datetime import datetime
 from datetime import timezone
 from typing import Any

-import docx  # type: ignore
 import msal  # type: ignore
-import openpyxl  # type: ignore
-import pptx  # type: ignore
 from office365.graph_client import GraphClient  # type: ignore
 from office365.onedrive.driveitems.driveItem import DriveItem  # type: ignore
 from office365.onedrive.sites.site import Site  # type: ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.file_utils import is_text_file_extension
-from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -25,6 +19,12 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.extract_file_text import docx_to_text
+from danswer.file_processing.extract_file_text import file_io_to_text
+from danswer.file_processing.extract_file_text import is_text_file_extension
+from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import pptx_to_text
+from danswer.file_processing.extract_file_text import xlsx_to_text
 from danswer.utils.logger import setup_logger

 UNSUPPORTED_FILE_TYPE_CONTENT = ""  # idea copied from the google drive side of things
@@ -35,62 +35,28 @@ logger = setup_logger()

 def get_text_from_xlsx_driveitem(driveitem_object: DriveItem) -> str:
     file_content = driveitem_object.get_content().execute_query().value
-    excel_file = io.BytesIO(file_content)
-    workbook = openpyxl.load_workbook(excel_file, read_only=True)
-
-    full_text = []
-    for sheet in workbook.worksheets:
-        sheet_string = "\n".join(
-            ",".join(map(str, row))
-            for row in sheet.iter_rows(min_row=1, values_only=True)
-        )
-        full_text.append(sheet_string)
-
-    return "\n".join(full_text)
+    return xlsx_to_text(file=io.BytesIO(file_content))


 def get_text_from_docx_driveitem(driveitem_object: DriveItem) -> str:
     file_content = driveitem_object.get_content().execute_query().value
-    full_text = []
-
-    with tempfile.TemporaryDirectory() as local_path:
-        with open(os.path.join(local_path, driveitem_object.name), "wb") as local_file:
-            local_file.write(file_content)
-            doc = docx.Document(local_file.name)
-            for para in doc.paragraphs:
-                full_text.append(para.text)
-            return "\n".join(full_text)
+    return docx_to_text(file=io.BytesIO(file_content))


 def get_text_from_pdf_driveitem(driveitem_object: DriveItem) -> str:
     file_content = driveitem_object.get_content().execute_query().value
-    file_text = read_pdf_file(
-        file=io.BytesIO(file_content), file_name=driveitem_object.name
-    )
+    file_text = pdf_to_text(file=io.BytesIO(file_content))
     return file_text


 def get_text_from_txt_driveitem(driveitem_object: DriveItem) -> str:
     file_content: bytes = driveitem_object.get_content().execute_query().value
-    text_string = file_content.decode("utf-8")
-    return text_string
+    return file_io_to_text(file=io.BytesIO(file_content))


 def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
     file_content = driveitem_object.get_content().execute_query().value
-    pptx_stream = io.BytesIO(file_content)
-    with tempfile.NamedTemporaryFile() as temp:
-        temp.write(pptx_stream.getvalue())
-        presentation = pptx.Presentation(temp.name)
-        extracted_text = ""
-        for slide_number, slide in enumerate(presentation.slides, start=1):
-            extracted_text += f"\nSlide {slide_number}:\n"
-
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    extracted_text += shape.text + "\n"
-
-        return extracted_text
+    return pptx_to_text(file=io.BytesIO(file_content))


 class SharepointConnector(LoadConnector, PollConnector):
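The same BytesIO pattern collapses each SharePoint DriveItem helper to a one-liner. As a hedged illustration only (the connector keeps separate get_text_from_*_driveitem functions rather than a lookup table), the mapping could be expressed as:

    import io
    from typing import IO, Any, Callable

    from danswer.file_processing.extract_file_text import docx_to_text
    from danswer.file_processing.extract_file_text import file_io_to_text
    from danswer.file_processing.extract_file_text import pdf_to_text
    from danswer.file_processing.extract_file_text import pptx_to_text
    from danswer.file_processing.extract_file_text import xlsx_to_text

    # extension -> shared extraction helper (illustrative names, not from the diff)
    EXTRACTORS: dict[str, Callable[[IO[Any]], str]] = {
        ".docx": docx_to_text,
        ".pdf": pdf_to_text,
        ".pptx": pptx_to_text,
        ".xlsx": xlsx_to_text,
        ".txt": file_io_to_text,
    }


    def driveitem_bytes_to_text(extension: str, content: bytes) -> str:
        # content stands in for driveitem_object.get_content().execute_query().value
        return EXTRACTORS[extension](io.BytesIO(content))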
@@ -22,12 +22,12 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
-from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger

 logger = setup_logger()
@@ -247,9 +247,7 @@ class WebConnector(LoadConnector):
             if current_url.split(".")[-1] == "pdf":
                 # PDF files are not checked for links
                 response = requests.get(current_url)
-                page_text = read_pdf_file(
-                    file=io.BytesIO(response.content), file_name=current_url
-                )
+                page_text = pdf_to_text(file=io.BytesIO(response.content))

                 doc_batch.append(
                     Document(
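A self-contained sketch of the new PDF path in the web connector (the URL is a placeholder):

    import io

    import requests

    from danswer.file_processing.extract_file_text import pdf_to_text

    response = requests.get("https://example.com/whitepaper.pdf")  # placeholder URL
    page_text = pdf_to_text(file=io.BytesIO(response.content))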
@@ -5,8 +5,9 @@ from zenpy.lib.api_objects.help_centre_objects import Article  # type: ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
-from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
+    time_str_to_utc,
+)
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector

@@ -14,6 +15,7 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.html_utils import parse_html_page_basic


 def _article_to_document(article: Article) -> Document:
@@ -1,4 +1,3 @@
-from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
 from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE
 from danswer.dynamic_configs.interface import DynamicConfigStore
 from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore

@@ -8,7 +7,7 @@ from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore
 def get_dynamic_config_store() -> DynamicConfigStore:
     dynamic_config_store_type = DYNAMIC_CONFIG_STORE
     if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__:
-        return FileSystemBackedDynamicConfigStore(DYNAMIC_CONFIG_DIR_PATH)
+        raise NotImplementedError("File based config store no longer supported")
     if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__:
         return PostgresBackedDynamicConfigStore()

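A hedged sketch of the effect of this change, assuming the factory is exposed from the danswer.dynamic_configs package as the surrounding imports suggest:

    from danswer.dynamic_configs import get_dynamic_config_store  # import path assumed

    # DYNAMIC_CONFIG_STORE is now pinned to "PostgresBackedDynamicConfigStore",
    # so this returns the Postgres-backed store; selecting the file-system store
    # would raise NotImplementedError("File based config store no longer supported").
    config_store = get_dynamic_config_store()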
@@ -2,7 +2,6 @@ import json
 from pathlib import Path
 from typing import cast

-from danswer.configs.app_configs import DYNAMIC_CONFIG_DIR_PATH
 from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
 from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION
 from danswer.configs.model_configs import GEN_AI_API_ENDPOINT

@@ -53,7 +52,7 @@ def insert_into_postgres(store_data: dict) -> None:
     config_store.store(port_once_key, True)


-def port_filesystem_to_postgres(directory_path: str = DYNAMIC_CONFIG_DIR_PATH) -> None:
+def port_filesystem_to_postgres(directory_path: str) -> None:
     store_data = read_file_system_store(directory_path)
     insert_into_postgres(store_data)

New files:
  backend/danswer/file_processing/__init__.py (0 lines)
  backend/danswer/file_processing/extract_file_text.py (283 lines)
@ -0,0 +1,283 @@
|
|||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import zipfile
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from email.parser import Parser as EmailParser
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from typing import IO
|
||||||
|
|
||||||
|
import chardet
|
||||||
|
import docx # type: ignore
|
||||||
|
import openpyxl # type: ignore
|
||||||
|
import pptx # type: ignore
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfStreamError
|
||||||
|
|
||||||
|
from danswer.file_processing.html_utils import parse_html_page_basic
|
||||||
|
from danswer.utils.logger import setup_logger
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
TEXT_SECTION_SEPARATOR = "\n\n"
|
||||||
|
|
||||||
|
|
||||||
|
PLAIN_TEXT_FILE_EXTENSIONS = [
|
||||||
|
".txt",
|
||||||
|
".md",
|
||||||
|
".mdx",
|
||||||
|
".conf",
|
||||||
|
".log",
|
||||||
|
".json",
|
||||||
|
".csv",
|
||||||
|
".tsv",
|
||||||
|
".xml",
|
||||||
|
".yml",
|
||||||
|
".yaml",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".pptx",
|
||||||
|
".xlsx",
|
||||||
|
".eml",
|
||||||
|
".epub",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def is_text_file_extension(file_name: str) -> bool:
|
||||||
|
return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||||
|
_, extension = os.path.splitext(file_path_or_name)
|
||||||
|
return extension
|
||||||
|
|
||||||
|
|
||||||
|
def check_file_ext_is_valid(ext: str) -> bool:
|
||||||
|
return ext in VALID_FILE_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def detect_encoding(file: IO[bytes]) -> str:
|
||||||
|
raw_data = file.read(50000)
|
||||||
|
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||||
|
file.seek(0)
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
|
||||||
|
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||||
|
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
||||||
|
"__MACOSX"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# To include additional metadata in the search index, add a .danswer_metadata.json file
|
||||||
|
# to the zip file. This file should contain a list of objects with the following format:
|
||||||
|
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
|
||||||
|
def load_files_from_zip(
|
||||||
|
zip_file_io: IO,
|
||||||
|
ignore_macos_resource_fork_files: bool = True,
|
||||||
|
ignore_dirs: bool = True,
|
||||||
|
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
|
||||||
|
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
|
||||||
|
zip_metadata = {}
|
||||||
|
try:
|
||||||
|
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
|
||||||
|
with zip_file.open(metadata_file_info, "r") as metadata_file:
|
||||||
|
try:
|
||||||
|
zip_metadata = json.load(metadata_file)
|
||||||
|
if isinstance(zip_metadata, list):
|
||||||
|
# convert list of dicts to dict of dicts
|
||||||
|
zip_metadata = {d["filename"]: d for d in zip_metadata}
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warn("Unable to load .danswer_metadata.json")
|
||||||
|
except KeyError:
|
||||||
|
logger.info("No .danswer_metadata.json file")
|
||||||
|
|
||||||
|
for file_info in zip_file.infolist():
|
||||||
|
with zip_file.open(file_info.filename, "r") as file:
|
||||||
|
if ignore_dirs and file_info.is_dir():
|
||||||
|
continue
|
||||||
|
|
||||||
|
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
|
||||||
|
file_info.filename
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
yield file_info, file, zip_metadata.get(file_info.filename, {})
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_danswer_metadata(line: str) -> dict | None:
|
||||||
|
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
|
||||||
|
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
|
||||||
|
|
||||||
|
html_comment_match = re.search(html_comment_pattern, line)
|
||||||
|
hashtag_match = re.search(hashtag_pattern, line)
|
||||||
|
|
||||||
|
if html_comment_match:
|
||||||
|
json_str = html_comment_match.group(1)
|
||||||
|
elif hashtag_match:
|
||||||
|
json_str = hashtag_match.group(1)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return json.loads("{" + json_str + "}")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def read_text_file(
|
||||||
|
file: IO,
|
||||||
|
encoding: str = "utf-8",
|
||||||
|
errors: str = "replace",
|
||||||
|
ignore_danswer_metadata: bool = True,
|
||||||
|
) -> tuple[str, dict]:
|
||||||
|
metadata = {}
|
||||||
|
file_content_raw = ""
|
||||||
|
for ind, line in enumerate(file):
|
||||||
|
try:
|
||||||
|
line = line.decode(encoding) if isinstance(line, bytes) else line
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
line = (
|
||||||
|
line.decode(encoding, errors=errors)
|
||||||
|
if isinstance(line, bytes)
|
||||||
|
else line
|
||||||
|
)
|
||||||
|
|
||||||
|
if ind == 0:
|
||||||
|
metadata_or_none = (
|
||||||
|
None if ignore_danswer_metadata else _extract_danswer_metadata(line)
|
||||||
|
)
|
||||||
|
if metadata_or_none is not None:
|
||||||
|
metadata = metadata_or_none
|
||||||
|
else:
|
||||||
|
file_content_raw += line
|
||||||
|
else:
|
||||||
|
file_content_raw += line
|
||||||
|
|
||||||
|
return file_content_raw, metadata
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    try:
        pdf_reader = PdfReader(file)

        # If marked as encrypted and a password is provided, try to decrypt
        if pdf_reader.is_encrypted and pdf_pass is not None:
            decrypt_success = False
            if pdf_pass is not None:
                try:
                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
                except Exception:
                    logger.error("Unable to decrypt pdf")
            else:
                logger.info("No password available to decrypt pdf")

            if not decrypt_success:
                # By user request, keep files that are unreadable just so they
                # can be discoverable by title.
                return ""

        return TEXT_SECTION_SEPARATOR.join(
            page.extract_text() for page in pdf_reader.pages
        )
    except PdfStreamError:
        logger.exception("PDF file is not a valid PDF")
    except Exception:
        logger.exception("Failed to read PDF")

    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return ""
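A minimal call sketch; the path and password below are placeholders, and an encrypted PDF that cannot be decrypted simply yields an empty string:

with open("uploaded.pdf", "rb") as pdf_file:  # placeholder path
    text = pdf_to_text(pdf_file, pdf_pass="example-password")  # pdf_pass is optional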
def docx_to_text(file: IO[Any]) -> str:
    doc = docx.Document(file)
    full_text = [para.text for para in doc.paragraphs]
    return TEXT_SECTION_SEPARATOR.join(full_text)


def pptx_to_text(file: IO[Any]) -> str:
    presentation = pptx.Presentation(file)
    text_content = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
        extracted_text = f"\nSlide {slide_number}:\n"
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                extracted_text += shape.text + "\n"
        text_content.append(extracted_text)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def xlsx_to_text(file: IO[Any]) -> str:
    workbook = openpyxl.load_workbook(file)
    text_content = []
    for sheet in workbook.worksheets:
        sheet_string = "\n".join(
            ",".join(map(str, row))
            for row in sheet.iter_rows(min_row=1, values_only=True)
        )
        text_content.append(sheet_string)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def eml_to_text(file: IO[Any]) -> str:
    text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
    parser = EmailParser()
    message = parser.parse(text_file)
    text_content = []
    for part in message.walk():
        if part.get_content_type().startswith("text/plain"):
            text_content.append(part.get_payload())
    return TEXT_SECTION_SEPARATOR.join(text_content)


def epub_to_text(file: IO[Any]) -> str:
    with zipfile.ZipFile(file) as epub:
        text_content = []
        for item in epub.infolist():
            if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
                with epub.open(item) as html_file:
                    text_content.append(parse_html_page_basic(html_file))
        return TEXT_SECTION_SEPARATOR.join(text_content)


def file_io_to_text(file: IO[Any]) -> str:
    encoding = detect_encoding(file)
    file_content_raw, _ = read_text_file(file, encoding=encoding)
    return file_content_raw
def extract_file_text(
    file_name: str,
    file: IO[Any],
) -> str:
    extension = get_file_ext(file_name)
    if not check_file_ext_is_valid(extension):
        raise RuntimeError("Unprocessable file type")

    if extension == ".pdf":
        return pdf_to_text(file=file)

    elif extension == ".docx":
        return docx_to_text(file)

    elif extension == ".pptx":
        return pptx_to_text(file)

    elif extension == ".xlsx":
        return xlsx_to_text(file)

    elif extension == ".eml":
        return eml_to_text(file)

    elif extension == ".epub":
        return epub_to_text(file)

    else:
        return file_io_to_text(file)
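A short end-to-end sketch of the dispatcher; get_file_ext and check_file_ext_is_valid are defined earlier in the module, and the path is a placeholder:

with open("report.docx", "rb") as f:  # placeholder path
    text = extract_file_text(file_name="report.docx", file=f)
# Extensions that fail check_file_ext_is_valid raise
# RuntimeError("Unprocessable file type"); anything else that is valid but not
# matched above falls through to file_io_to_text.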
@ -1,6 +1,7 @@
 import re
 from copy import copy
 from dataclasses import dataclass
+from typing import IO

 import bs4

@ -118,7 +119,7 @@ def format_document_soup(
     return strip_excessive_newlines_and_spaces(text)


-def parse_html_page_basic(text: str) -> str:
+def parse_html_page_basic(text: str | IO[bytes]) -> str:
     soup = bs4.BeautifulSoup(text, "html.parser")
     return format_document_soup(soup)
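With the widened signature, parse_html_page_basic accepts either an HTML string or a binary file handle (BeautifulSoup handles both), which is what lets epub_to_text above pass the opened zip member straight through. A small sketch with a placeholder path:

from danswer.file_processing.html_utils import parse_html_page_basic

text_from_str = parse_html_page_basic("<html><body><p>hello</p></body></html>")
with open("page.html", "rb") as html_file:  # placeholder path
    text_from_file = parse_html_page_basic(html_file)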
@ -46,8 +46,6 @@ from danswer.db.index_attempt import cancel_indexing_attempts_past_model
 from danswer.db.index_attempt import expire_index_attempts
 from danswer.db.swap_index import check_index_swap
 from danswer.document_index.factory import get_default_document_index
-from danswer.dynamic_configs.port_configs import port_api_key_to_postgres
-from danswer.dynamic_configs.port_configs import port_filesystem_to_postgres
 from danswer.search.retrieval.search_runner import download_nltk_data
 from danswer.search.search_nlp_models import warm_up_encoders
 from danswer.server.auth_check import check_router_auth
@ -162,18 +160,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
             f"Using multilingual flow with languages: {MULTILINGUAL_QUERY_EXPANSION}"
         )

-    try:
-        port_filesystem_to_postgres()
-    except Exception:
-        logger.debug(
-            "Skipping port of persistent volumes. Maybe these have already been removed?"
-        )
-
-    try:
-        port_api_key_to_postgres()
-    except Exception as e:
-        logger.debug(f"Failed to port API keys. Exception: {e}. Continuing...")
-
     with Session(engine) as db_session:
         check_index_swap(db_session=db_session)
         db_embedding_model = get_current_db_embedding_model(db_session)
@ -30,7 +30,6 @@ llama-index==0.9.45
 Mako==1.2.4
 msal==1.26.0
 nltk==3.8.1
-docx2txt==0.8
 Office365-REST-Python-Client==2.5.4
 oauthlib==3.2.2
 openai==1.3.5
@ -49,8 +49,6 @@ def run_jobs(exclude_indexing: bool) -> None:
     if not exclude_indexing:
         update_env = os.environ.copy()
         update_env["PYTHONPATH"] = "."
-        update_env["DYNAMIC_CONFIG_DIR_PATH"] = "./dynamic_config_storage"
-        update_env["FILE_CONNECTOR_TMP_STORAGE_PATH"] = "./dynamic_config_storage"
         cmd_indexing = ["python", "danswer/background/update.py"]

         indexing_process = subprocess.Popen(
@ -1,7 +1,7 @@
 import pathlib
 import unittest

-from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
+from danswer.file_processing.html_utils import parse_html_page_basic


 class TestQAPostprocessing(unittest.TestCase):
@ -81,9 +81,6 @@ services:
       # If set to `true` will enable additional logs about Vespa query performance
       # (time spent on finding the right docs + time spent fetching summaries from disk)
       - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -181,9 +178,6 @@ services:
       - LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
       - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
       - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -229,6 +223,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -256,6 +251,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -323,11 +319,6 @@ services:


 volumes:
-  # local_dynamic_storage is legacy only now
-  local_dynamic_storage:
-  # used to store files uploaded by the user temporarily while we are indexing them
-  # file_connector_tmp_storage is legacy only now
-  file_connector_tmp_storage:
   db_volume:
   vespa_volume:
   # Created by the container itself
@ -20,6 +20,7 @@ services:
       # Auth Settings
       - AUTH_TYPE=${AUTH_TYPE:-disabled}
       - SESSION_EXPIRE_TIME_SECONDS=${SESSION_EXPIRE_TIME_SECONDS:-86400}
+      - ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
       - VALID_EMAIL_DOMAINS=${VALID_EMAIL_DOMAINS:-}
       - GOOGLE_OAUTH_CLIENT_ID=${GOOGLE_OAUTH_CLIENT_ID:-}
       - GOOGLE_OAUTH_CLIENT_SECRET=${GOOGLE_OAUTH_CLIENT_SECRET:-}
@ -46,6 +47,7 @@ services:
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
       - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
+      - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
       # if set, allows for the use of the token budget system
       - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
       # Enables the use of bedrock models
@ -79,9 +81,6 @@ services:
       # If set to `true` will enable additional logs about Vespa query performance
       # (time spent on finding the right docs + time spent fetching summaries from disk)
       - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -104,6 +103,7 @@ services:
       - indexing_model_server
     restart: always
     environment:
+      - ENCRYPTION_KEY_SECRET=${ENCRYPTION_KEY_SECRET:-}
       # Gen AI Settings (Needed by DanswerBot)
       - GEN_AI_MODEL_PROVIDER=${GEN_AI_MODEL_PROVIDER:-}
       - GEN_AI_MODEL_VERSION=${GEN_AI_MODEL_VERSION:-}
@ -122,6 +122,7 @@ services:
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
       - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
       - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
+      - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
       # Query Options
       - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
       - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
@ -177,9 +178,6 @@ services:
       - LOG_LEVEL=${LOG_LEVEL:-info} # Set to debug to get more fine-grained logs
       - LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-} # Log all of the prompts to the LLM
       - LOG_VESPA_TIMING_INFORMATION=${LOG_VESPA_TIMING_INFORMATION:-}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -233,6 +231,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -268,6 +267,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -335,11 +335,6 @@ services:


 volumes:
-  # local_dynamic_storage is legacy only now
-  local_dynamic_storage:
-  # used to store files uploaded by the user temporarily while we are indexing them
-  # file_connector_tmp_storage is legacy only now
-  file_connector_tmp_storage:
   db_volume:
   vespa_volume:
   # Created by the container itself
@ -21,9 +21,6 @@ services:
       - POSTGRES_HOST=relational_db
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -53,9 +50,6 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -107,6 +101,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -134,6 +129,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -205,11 +201,6 @@ services:


 volumes:
-  # local_dynamic_storage is legacy only now
-  local_dynamic_storage:
-  # used to store files uploaded by the user temporarily while we are indexing them
-  # file_connector_tmp_storage is legacy only now
-  file_connector_tmp_storage:
   db_volume:
   vespa_volume:
   # Created by the container itself
@ -21,9 +21,6 @@ services:
       - POSTGRES_HOST=relational_db
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -53,9 +50,6 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
-    volumes:
-      - local_dynamic_storage:/home/storage
-      - file_connector_tmp_storage:/home/file_connector_storage
     extra_hosts:
       - "host.docker.internal:host-gateway"
     logging:
@ -87,6 +81,8 @@ services:
       options:
         max-size: "50m"
         max-file: "6"


   relational_db:
     image: postgres:15.2-alpine
     restart: always
@ -120,6 +116,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -147,6 +144,7 @@ services:
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
+      # Not necessary, this is just to reduce download time during startup
       - model_cache_huggingface:/root/.cache/huggingface/
     logging:
       driver: json-file
@ -222,11 +220,6 @@ services:


 volumes:
-  # local_dynamic_storage is legacy only now
-  local_dynamic_storage:
-  # used to store files uploaded by the user temporarily while we are indexing them
-  # file_connector_tmp_storage is legacy only now
-  file_connector_tmp_storage:
   db_volume:
   vespa_volume:
   # Created by the container itself
@ -52,10 +52,12 @@ const Main = () => {
       {filesAreUploading && <Spinner />}
       <Text className="mb-2">
         Specify files below, click the <b>Upload</b> button, and the contents of
-        these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
-        <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlxs</i>, <i>.csv</i>,{" "}
-        <i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
-        file types) are supported.
+        these files will be searchable via Danswer! Currently supported file
+        types include <i>.txt</i>, <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>,{" "}
+        <i>.xlsx</i>, <i>.csv</i>, <i>.md</i>, <i>.mdx</i>, <i>.conf</i>,{" "}
+        <i>.log</i>, <i>.json</i>, <i>.tsv</i>, <i>.xml</i>, <i>.yml</i>,{" "}
+        <i>.yaml</i>, <i>.eml</i>, <i>.epub</i>, and finally <i>.zip</i> files
+        (containing supported file types).
       </Text>
       <Text className="mb-3">
         <b>NOTE:</b> if the original document is accessible via a link, you can