Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-08-08 14:02:09 +02:00)
Code for ease of eval (#1656)
@@ -52,6 +52,8 @@ SECTION_SEPARATOR = "\n\n"
 # For combining attributes, doesn't have to be unique/perfect to work
 INDEX_SEPARATOR = "==="
 
+# For File Connector Metadata override file
+DANSWER_METADATA_FILENAME = ".danswer_metadata.json"
 
 # Messages
 DISABLED_GEN_AI_MSG = (
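The new DANSWER_METADATA_FILENAME constant names the JSON file the File connector reads for per-file metadata overrides. Below is a minimal, hypothetical sketch of such a file, expressed in Python; the keys are the ones this commit's connector code reads, while the file names, values, and link are made up.

    import json

    # Hypothetical contents for a .danswer_metadata.json (DANSWER_METADATA_FILENAME)
    # included in an uploaded zip; one entry per file, keyed by "filename".
    entries = [
        {
            "filename": "guides/setup.md",
            "file_display_name": "Setup Guide",
            "title": "Danswer Setup",
            "link": "https://example.com/setup",
            "time_updated": "2024-06-01T00:00:00+00:00",
        }
    ]
    print(json.dumps(entries, indent=2))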
@@ -86,7 +86,12 @@ def _process_file(
     all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
 
     # If this is set, we will show this in the UI as the "name" of the file
-    file_display_name_override = all_metadata.get("file_display_name")
+    file_display_name = all_metadata.get("file_display_name") or os.path.basename(
+        file_name
+    )
+    title = (
+        all_metadata["title"] or "" if "title" in all_metadata else file_display_name
+    )
 
     time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc))
     if isinstance(time_updated, str):
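A small runnable sketch of the fallback behavior introduced above (the two assignments mirror the diff; the file name and metadata are made up): when metadata provides neither key, the display name falls back to the file's basename and the title falls back to the display name.

    import os

    file_name = "uploads/guides/setup.md"  # hypothetical
    all_metadata = {}  # no "file_display_name" or "title" overrides

    file_display_name = all_metadata.get("file_display_name") or os.path.basename(file_name)
    title = (
        all_metadata["title"] or "" if "title" in all_metadata else file_display_name
    )
    print(file_display_name, title)  # setup.md setup.md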
@@ -108,6 +113,7 @@ def _process_file(
             "secondary_owners",
             "filename",
             "file_display_name",
+            "title",
         ]
     }
 
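For illustration, the effect of adding "title" to the excluded keys: keys consumed by the connector are stripped from the leftover metadata attached to the document, and "title" now joins them. The sketch below is assumed, not verbatim from the source; only the four keys visible in the hunk are certain.

    reserved_keys = ["secondary_owners", "filename", "file_display_name", "title"]
    all_metadata = {"title": "Q2 Report", "department": "finance"}  # hypothetical
    leftover = {k: v for k, v in all_metadata.items() if k not in reserved_keys}
    print(leftover)  # {'department': 'finance'}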
@@ -131,8 +137,8 @@ def _process_file(
                 Section(link=all_metadata.get("link"), text=file_content_raw.strip())
             ],
             source=DocumentSource.FILE,
-            semantic_identifier=file_display_name_override
-            or os.path.basename(file_name),
+            semantic_identifier=file_display_name,
+            title=title,
             doc_updated_at=final_time_updated,
             primary_owners=p_owners,
             secondary_owners=s_owners,
@@ -16,6 +16,7 @@ import pptx  # type: ignore
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
 
+from danswer.configs.constants import DANSWER_METADATA_FILENAME
 from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.utils.logger import setup_logger
 
@@ -88,7 +89,7 @@ def load_files_from_zip(
     with zipfile.ZipFile(zip_file_io, "r") as zip_file:
         zip_metadata = {}
         try:
-            metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
+            metadata_file_info = zip_file.getinfo(DANSWER_METADATA_FILENAME)
             with zip_file.open(metadata_file_info, "r") as metadata_file:
                 try:
                     zip_metadata = json.load(metadata_file)
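The surrounding try/except relies on zipfile.ZipFile.getinfo raising KeyError when the archive has no metadata member. A self-contained illustration, with made-up archive contents:

    import io
    import zipfile

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("a.txt", "hello")  # a zip with no metadata file

    with zipfile.ZipFile(buf, "r") as zf:
        try:
            zf.getinfo(".danswer_metadata.json")  # i.e. DANSWER_METADATA_FILENAME
        except KeyError:
            print("no metadata file in this zip")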
@@ -96,18 +97,19 @@ def load_files_from_zip(
                         # convert list of dicts to dict of dicts
                         zip_metadata = {d["filename"]: d for d in zip_metadata}
                 except json.JSONDecodeError:
-                    logger.warn("Unable to load .danswer_metadata.json")
+                    logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
         except KeyError:
-            logger.info("No .danswer_metadata.json file")
+            logger.info(f"No {DANSWER_METADATA_FILENAME} file")
 
         for file_info in zip_file.infolist():
             with zip_file.open(file_info.filename, "r") as file:
                 if ignore_dirs and file_info.is_dir():
                     continue
 
-                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
-                    file_info.filename
-                ):
+                if (
+                    ignore_macos_resource_fork_files
+                    and is_macos_resource_fork_file(file_info.filename)
+                ) or file_info.filename == DANSWER_METADATA_FILENAME:
                     continue
                 yield file_info, file, zip_metadata.get(file_info.filename, {})
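A minimal sketch of the metadata lookup performed by the loop above, assuming the list-of-dicts layout noted in the comment (entries are hypothetical): the list is converted to a dict keyed by "filename", and files without an entry get an empty dict.

    zip_metadata_list = [
        {"filename": "a.txt", "file_display_name": "Doc A"},
        {"filename": "b.txt", "title": "Doc B"},
    ]
    zip_metadata = {d["filename"]: d for d in zip_metadata_list}
    print(zip_metadata.get("a.txt", {}))       # {'filename': 'a.txt', 'file_display_name': 'Doc A'}
    print(zip_metadata.get("missing.txt", {})) # {}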
@@ -1,23 +1,40 @@
 #!/bin/bash
 
+# Usage of the script with optional volume arguments
+# ./restart_containers.sh [vespa_volume] [postgres_volume]
+
+VESPA_VOLUME=${1:-""}  # Default is empty if not provided
+POSTGRES_VOLUME=${2:-""}  # Default is empty if not provided
+
 # Stop and remove the existing containers
 echo "Stopping and removing existing containers..."
 docker stop danswer_postgres danswer_vespa
 docker rm danswer_postgres danswer_vespa
 
-# Start the PostgreSQL container
+# Start the PostgreSQL container with optional volume
 echo "Starting PostgreSQL container..."
-docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres
+if [[ -n "$POSTGRES_VOLUME" ]]; then
+    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres
+else
+    docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres
+fi
 
-# Start the Vespa container
+# Start the Vespa container with optional volume
 echo "Starting Vespa container..."
-docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
+if [[ -n "$VESPA_VOLUME" ]]; then
+    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v $VESPA_VOLUME:/opt/vespa/var vespaengine/vespa:8
+else
+    docker run --detach --name danswer_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
+fi
 
 # Ensure alembic runs in the correct directory
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 PARENT_DIR="$(dirname "$SCRIPT_DIR")"
 cd "$PARENT_DIR"
 
+# Give Postgres a second to start
+sleep 1
+
 # Run Alembic upgrade
 echo "Running Alembic migration..."
 alembic upgrade head
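With the optional arguments, the script can be invoked as, for example, ./restart_containers.sh my_vespa_volume my_postgres_volume (the volume names here are hypothetical) so that Vespa and Postgres data persist in named Docker volumes across restarts; with no arguments it behaves as before and the containers start without mounted volumes.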