diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 2c28299cc..9e0d318c5 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -52,6 +52,8 @@ SECTION_SEPARATOR = "\n\n" # For combining attributes, doesn't have to be unique/perfect to work INDEX_SEPARATOR = "===" +# For File Connector Metadata override file +DANSWER_METADATA_FILENAME = ".danswer_metadata.json" # Messages DISABLED_GEN_AI_MSG = ( diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 7bed47208..77d01394d 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -86,7 +86,12 @@ def _process_file( all_metadata = {**metadata, **file_metadata} if metadata else file_metadata # If this is set, we will show this in the UI as the "name" of the file - file_display_name_override = all_metadata.get("file_display_name") + file_display_name = all_metadata.get("file_display_name") or os.path.basename( + file_name + ) + title = ( + all_metadata["title"] or "" if "title" in all_metadata else file_display_name + ) time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc)) if isinstance(time_updated, str): @@ -108,6 +113,7 @@ def _process_file( "secondary_owners", "filename", "file_display_name", + "title", ] } @@ -131,8 +137,8 @@ def _process_file( Section(link=all_metadata.get("link"), text=file_content_raw.strip()) ], source=DocumentSource.FILE, - semantic_identifier=file_display_name_override - or os.path.basename(file_name), + semantic_identifier=file_display_name, + title=title, doc_updated_at=final_time_updated, primary_owners=p_owners, secondary_owners=s_owners, diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index bb964141c..14cca7f6f 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -16,6 +16,7 @@ import pptx # type: ignore from pypdf import PdfReader from pypdf.errors import PdfStreamError +from danswer.configs.constants import DANSWER_METADATA_FILENAME from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger @@ -88,7 +89,7 @@ def load_files_from_zip( with zipfile.ZipFile(zip_file_io, "r") as zip_file: zip_metadata = {} try: - metadata_file_info = zip_file.getinfo(".danswer_metadata.json") + metadata_file_info = zip_file.getinfo(DANSWER_METADATA_FILENAME) with zip_file.open(metadata_file_info, "r") as metadata_file: try: zip_metadata = json.load(metadata_file) @@ -96,18 +97,19 @@ def load_files_from_zip( # convert list of dicts to dict of dicts zip_metadata = {d["filename"]: d for d in zip_metadata} except json.JSONDecodeError: - logger.warn("Unable to load .danswer_metadata.json") + logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}") except KeyError: - logger.info("No .danswer_metadata.json file") + logger.info(f"No {DANSWER_METADATA_FILENAME} file") for file_info in zip_file.infolist(): with zip_file.open(file_info.filename, "r") as file: if ignore_dirs and file_info.is_dir(): continue - if ignore_macos_resource_fork_files and is_macos_resource_fork_file( - file_info.filename - ): + if ( + ignore_macos_resource_fork_files + and is_macos_resource_fork_file(file_info.filename) + ) or file_info.filename == DANSWER_METADATA_FILENAME: continue yield file_info, file, zip_metadata.get(file_info.filename, {}) diff --git a/backend/scripts/restart_containers.sh b/backend/scripts/restart_containers.sh index bfd3bd745..c60d1905e 100755 --- a/backend/scripts/restart_containers.sh +++ b/backend/scripts/restart_containers.sh @@ -1,23 +1,40 @@ #!/bin/bash +# Usage of the script with optional volume arguments +# ./restart_containers.sh [vespa_volume] [postgres_volume] + +VESPA_VOLUME=${1:-""} # Default is empty if not provided +POSTGRES_VOLUME=${2:-""} # Default is empty if not provided + # Stop and remove the existing containers echo "Stopping and removing existing containers..." docker stop danswer_postgres danswer_vespa docker rm danswer_postgres danswer_vespa -# Start the PostgreSQL container +# Start the PostgreSQL container with optional volume echo "Starting PostgreSQL container..." -docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres +if [[ -n "$POSTGRES_VOLUME" ]]; then + docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres +else + docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres +fi -# Start the Vespa container +# Start the Vespa container with optional volume echo "Starting Vespa container..." -docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8 +if [[ -n "$VESPA_VOLUME" ]]; then + docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v $VESPA_VOLUME:/opt/vespa/var vespaengine/vespa:8 +else + docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8 +fi # Ensure alembic runs in the correct directory SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" PARENT_DIR="$(dirname "$SCRIPT_DIR")" cd "$PARENT_DIR" +# Give Postgres a second to start +sleep 1 + # Run Alembic upgrade echo "Running Alembic migration..." alembic upgrade head