Code for ease of eval (#1656)

Yuhong Sun
2024-06-17 20:32:12 -07:00
committed by GitHub
parent 93cc5a9e77
commit c798ade127
4 changed files with 40 additions and 13 deletions

View File

@@ -52,6 +52,8 @@ SECTION_SEPARATOR = "\n\n"
 # For combining attributes, doesn't have to be unique/perfect to work
 INDEX_SEPARATOR = "==="
+# For File Connector Metadata override file
+DANSWER_METADATA_FILENAME = ".danswer_metadata.json"
 
 # Messages
 DISABLED_GEN_AI_MSG = (
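
The new DANSWER_METADATA_FILENAME constant names the per-zip metadata override file the File Connector reads. For reference, a small script like the one below could generate such a file. This is a sketch only: the key names all appear in this commit's diffs, but the sample values, file paths, and the exact accepted time_updated format are assumptions.

# Sketch: write a .danswer_metadata.json override file for a zip upload.
# Keys mirror those read in the diffs below; values are invented.
import json

metadata = [
    {
        "filename": "docs/handbook.txt",            # path of the file inside the zip
        "file_display_name": "Employee Handbook",   # shown as the doc name in the UI
        "title": "Employee Handbook 2024",          # used for the new title field
        "link": "https://example.com/handbook",     # attached to the document section
        "time_updated": "2024-06-17T20:32:12+00:00",  # strings are parsed per the diff
    }
]

with open(".danswer_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)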

View File

@@ -86,7 +86,12 @@ def _process_file(
     all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
 
     # If this is set, we will show this in the UI as the "name" of the file
-    file_display_name_override = all_metadata.get("file_display_name")
+    file_display_name = all_metadata.get("file_display_name") or os.path.basename(
+        file_name
+    )
+    title = (
+        all_metadata["title"] or "" if "title" in all_metadata else file_display_name
+    )
 
     time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc))
     if isinstance(time_updated, str):
@@ -108,6 +113,7 @@ def _process_file(
"secondary_owners", "secondary_owners",
"filename", "filename",
"file_display_name", "file_display_name",
"title",
] ]
} }
@@ -131,8 +137,8 @@ def _process_file(
             Section(link=all_metadata.get("link"), text=file_content_raw.strip())
         ],
         source=DocumentSource.FILE,
-        semantic_identifier=file_display_name_override
-        or os.path.basename(file_name),
+        semantic_identifier=file_display_name,
+        title=title,
         doc_updated_at=final_time_updated,
         primary_owners=p_owners,
         secondary_owners=s_owners,
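
Note the precedence in the new title expression: "or" binds tighter than the conditional expression, so it parses as (all_metadata["title"] or "") if "title" in all_metadata else file_display_name. A minimal standalone sketch of that behavior (the helper name and sample values are invented for illustration):

# Sketch of the title-derivation precedence used above; not connector code.
def derive_title(all_metadata: dict, file_display_name: str) -> str:
    # Parses as: (all_metadata["title"] or "") if "title" in all_metadata
    # else file_display_name
    return all_metadata["title"] or "" if "title" in all_metadata else file_display_name

assert derive_title({"title": "Handbook"}, "x.txt") == "Handbook"
assert derive_title({"title": None}, "x.txt") == ""   # title key present but falsy
assert derive_title({}, "x.txt") == "x.txt"           # fall back to display name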

View File

@@ -16,6 +16,7 @@ import pptx # type: ignore
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
 
+from danswer.configs.constants import DANSWER_METADATA_FILENAME
 from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.utils.logger import setup_logger
@@ -88,7 +89,7 @@ def load_files_from_zip(
     with zipfile.ZipFile(zip_file_io, "r") as zip_file:
         zip_metadata = {}
         try:
-            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
+            metadata_file_info = zip_file.getinfo(DANSWER_METADATA_FILENAME)
             with zip_file.open(metadata_file_info, "r") as metadata_file:
                 try:
                     zip_metadata = json.load(metadata_file)
@@ -96,18 +97,19 @@ def load_files_from_zip(
                     # convert list of dicts to dict of dicts
                     zip_metadata = {d["filename"]: d for d in zip_metadata}
                 except json.JSONDecodeError:
-                    logger.warn("Unable to load .danswer_metadata.json")
+                    logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
         except KeyError:
-            logger.info("No .danswer_metadata.json file")
+            logger.info(f"No {DANSWER_METADATA_FILENAME} file")
 
         for file_info in zip_file.infolist():
             with zip_file.open(file_info.filename, "r") as file:
                 if ignore_dirs and file_info.is_dir():
                     continue
 
-                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
-                    file_info.filename
-                ):
+                if (
+                    ignore_macos_resource_fork_files
+                    and is_macos_resource_fork_file(file_info.filename)
+                ) or file_info.filename == DANSWER_METADATA_FILENAME:
                     continue
                 yield file_info, file, zip_metadata.get(file_info.filename, {})
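
As a quick check of the behavior this hunk encodes, here is a self-contained sketch: the override file is parsed into a dict keyed by "filename" and is itself skipped during iteration. Only the metadata handling is reproduced, not the full load_files_from_zip function, and the sample zip contents are invented.

# Sketch of the metadata-file handling from the diff above; not connector code.
import io
import json
import zipfile

DANSWER_METADATA_FILENAME = ".danswer_metadata.json"

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr(
        DANSWER_METADATA_FILENAME,
        json.dumps([{"filename": "a.txt", "file_display_name": "Doc A"}]),
    )
    zf.writestr("a.txt", "hello")

with zipfile.ZipFile(buf, "r") as zf:
    raw = json.load(zf.open(zf.getinfo(DANSWER_METADATA_FILENAME)))
    zip_metadata = {d["filename"]: d for d in raw}  # list of dicts -> dict of dicts
    for info in zf.infolist():
        if info.filename == DANSWER_METADATA_FILENAME:
            continue  # the override file itself is not yielded as a document
        print(info.filename, zip_metadata.get(info.filename, {}))
# -> a.txt {'filename': 'a.txt', 'file_display_name': 'Doc A'}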

View File

@@ -1,23 +1,40 @@
 #!/bin/bash
 
+# Usage of the script with optional volume arguments
+# ./restart_containers.sh [vespa_volume] [postgres_volume]
+
+VESPA_VOLUME=${1:-""}      # Default is empty if not provided
+POSTGRES_VOLUME=${2:-""}   # Default is empty if not provided
+
 # Stop and remove the existing containers
 echo "Stopping and removing existing containers..."
 docker stop danswer_postgres danswer_vespa
 docker rm danswer_postgres danswer_vespa
 
-# Start the PostgreSQL container
+# Start the PostgreSQL container with optional volume
 echo "Starting PostgreSQL container..."
-docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres
+if [[ -n "$POSTGRES_VOLUME" ]]; then
+    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres
+else
+    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres
+fi
 
-# Start the Vespa container
+# Start the Vespa container with optional volume
 echo "Starting Vespa container..."
-docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
+if [[ -n "$VESPA_VOLUME" ]]; then
+    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v $VESPA_VOLUME:/opt/vespa/var vespaengine/vespa:8
+else
+    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
+fi
 
 # Ensure alembic runs in the correct directory
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 PARENT_DIR="$(dirname "$SCRIPT_DIR")"
 cd "$PARENT_DIR"
 
+# Give Postgres a second to start
+sleep 1
+
 # Run Alembic upgrade
 echo "Running Alembic migration..."
 alembic upgrade head
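
With the optional volume arguments, an invocation such as ./restart_containers.sh danswer_vespa_data danswer_pg_data (volume names invented here for illustration) mounts named Docker volumes at /opt/vespa/var and /var/lib/postgresql/data; Docker creates named volumes on first use, so the index and database now survive container re-creation. Run with no arguments, the script behaves as before and starts ephemeral containers. The added sleep gives Postgres a moment to accept connections before the Alembic migration runs.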