Code for ease of eval (#1656)

This commit is contained in:
Yuhong Sun 2024-06-17 20:32:12 -07:00 committed by GitHub
parent 93cc5a9e77
commit c798ade127
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 40 additions and 13 deletions

View File

@ -52,6 +52,8 @@ SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="
# For File Connector Metadata override file
DANSWER_METADATA_FILENAME = ".danswer_metadata.json"
# Messages
DISABLED_GEN_AI_MSG = (

View File

@ -86,7 +86,12 @@ def _process_file(
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
# If this is set, we will show this in the UI as the "name" of the file
file_display_name_override = all_metadata.get("file_display_name")
file_display_name = all_metadata.get("file_display_name") or os.path.basename(
file_name
)
title = (
all_metadata["title"] or "" if "title" in all_metadata else file_display_name
)
time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc))
if isinstance(time_updated, str):
@ -108,6 +113,7 @@ def _process_file(
"secondary_owners",
"filename",
"file_display_name",
"title",
]
}
@ -131,8 +137,8 @@ def _process_file(
Section(link=all_metadata.get("link"), text=file_content_raw.strip())
],
source=DocumentSource.FILE,
semantic_identifier=file_display_name_override
or os.path.basename(file_name),
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,

View File

@ -16,6 +16,7 @@ import pptx # type: ignore
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from danswer.configs.constants import DANSWER_METADATA_FILENAME
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
@ -88,7 +89,7 @@ def load_files_from_zip(
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
metadata_file_info = zip_file.getinfo(DANSWER_METADATA_FILENAME)
with zip_file.open(metadata_file_info, "r") as metadata_file:
try:
zip_metadata = json.load(metadata_file)
@ -96,18 +97,19 @@ def load_files_from_zip(
# convert list of dicts to dict of dicts
zip_metadata = {d["filename"]: d for d in zip_metadata}
except json.JSONDecodeError:
logger.warn("Unable to load .danswer_metadata.json")
logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
except KeyError:
logger.info("No .danswer_metadata.json file")
logger.info(f"No {DANSWER_METADATA_FILENAME} file")
for file_info in zip_file.infolist():
with zip_file.open(file_info.filename, "r") as file:
if ignore_dirs and file_info.is_dir():
continue
if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
file_info.filename
):
if (
ignore_macos_resource_fork_files
and is_macos_resource_fork_file(file_info.filename)
) or file_info.filename == DANSWER_METADATA_FILENAME:
continue
yield file_info, file, zip_metadata.get(file_info.filename, {})

View File

@ -1,23 +1,40 @@
#!/bin/bash
#
# Recreate the local Postgres and Vespa containers, then run the Alembic
# migrations from the parent directory of this script.
#
# Usage of the script with optional volume arguments
#   ./restart_containers.sh [vespa_volume] [postgres_volume]
#
# Arguments:
#   vespa_volume     optional; mounted at /opt/vespa/var in the Vespa container
#   postgres_volume  optional; mounted at /var/lib/postgresql/data in the
#                    Postgres container
# When an argument is omitted or empty, the container starts without a volume.

VESPA_VOLUME=${1:-""}     # Default is empty if not provided
POSTGRES_VOLUME=${2:-""}  # Default is empty if not provided

# Stop and remove the existing containers
echo "Stopping and removing existing containers..."
docker stop danswer_postgres danswer_vespa
docker rm danswer_postgres danswer_vespa

# Start the PostgreSQL container with optional volume.
# Each container is started exactly once: a second `docker run` with the same
# --name would fail with a name conflict and the volume would never be applied.
echo "Starting PostgreSQL container..."
if [[ -n "$POSTGRES_VOLUME" ]]; then
    # Quote the expansion so paths containing spaces or globs stay one argument
    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v "$POSTGRES_VOLUME":/var/lib/postgresql/data postgres
else
    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres
fi

# Start the Vespa container with optional volume
echo "Starting Vespa container..."
if [[ -n "$VESPA_VOLUME" ]]; then
    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v "$VESPA_VOLUME":/opt/vespa/var vespaengine/vespa:8
else
    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
fi

# Ensure alembic runs in the correct directory
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
PARENT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PARENT_DIR"

# Give Postgres a second to start
sleep 1

# Run Alembic upgrade
echo "Running Alembic migration..."
alembic upgrade head