diff --git a/backend/tests/regression/answer_quality/README.md b/backend/tests/regression/answer_quality/README.md
index 32a8a14705e4..4610a9abc2ef 100644
--- a/backend/tests/regression/answer_quality/README.md
+++ b/backend/tests/regression/answer_quality/README.md
@@ -15,8 +15,9 @@ This Python script automates the process of running search quality tests for a b
 ## Usage
 
 1. Ensure you have the required dependencies installed.
-2. Configure the `search_test_config.yaml` file with your settings.
-3. Navigate to the answer_quality folder:
+2. Configure the `search_test_config.yaml` file based on the `search_test_config.yaml.template` file.
+3. Configure the `.env_eval` file with the correct environment variables.
+4. Navigate to the answer_quality folder:
 ```
 cd danswer/backend/tests/regression/answer_quality
 ```
@@ -63,6 +64,11 @@ Edit `search_test_config.yaml` to set:
 
 - llm
         Fill this out according to the normal LLM seeding
+
+To restart the evaluation using a particular index, set the suffix and turn off clean_up_docker_containers.
+This will also skip running the evaluation questions; in that case, the relari.py script can be run manually.
+
+
 
 Docker daemon must be running for this to work.
 Each script is able to be individually run to upload additional docs or run additional tests
\ No newline at end of file
diff --git a/backend/tests/regression/answer_quality/cli_utils.py b/backend/tests/regression/answer_quality/cli_utils.py
index c66cfc1df3cb..a39309efd58f 100644
--- a/backend/tests/regression/answer_quality/cli_utils.py
+++ b/backend/tests/regression/answer_quality/cli_utils.py
@@ -1,18 +1,51 @@
 import json
 import os
 import subprocess
+import sys
+from threading import Thread
+from typing import IO
 
 from retry import retry
 
 
-def _run_command(command: str) -> tuple[str, str]:
+def _run_command(command: str, stream_output: bool = False) -> tuple[str, str]:
     process = subprocess.Popen(
-        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,
     )
-    stdout, stderr = process.communicate()
+
+    stdout_lines: list[str] = []
+    stderr_lines: list[str] = []
+
+    def process_stream(stream: IO[str], lines: list[str]) -> None:
+        for line in stream:
+            lines.append(line)
+            if stream_output:
+                print(
+                    line,
+                    end="",
+                    file=sys.stdout if stream == process.stdout else sys.stderr,
+                )
+
+    stdout_thread = Thread(target=process_stream, args=(process.stdout, stdout_lines))
+    stderr_thread = Thread(target=process_stream, args=(process.stderr, stderr_lines))
+
+    stdout_thread.start()
+    stderr_thread.start()
+
+    stdout_thread.join()
+    stderr_thread.join()
+
+    process.wait()
+
     if process.returncode != 0:
-        raise RuntimeError(f"Command failed with error: {stderr.decode()}")
-    return stdout.decode(), stderr.decode()
+        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
+
+    return "".join(stdout_lines), "".join(stderr_lines)
 
 
 def get_current_commit_sha() -> str:
@@ -92,8 +125,8 @@ def start_docker_compose(
 
     print("Docker Command:\n", command)
 
-    _run_command(command)
-    print("The Docker has been Composed :)")
+    _run_command(command, stream_output=True)
+    print("Containers have been launched")
 
 
 def cleanup_docker(run_suffix: str) -> None:
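The `_run_command` rewrite above replaces `process.communicate()` with a reader thread per pipe, so `stdout` and `stderr` can be captured and, when `stream_output=True`, echoed live without risking a deadlock once one pipe's OS buffer fills. A minimal, self-contained sketch of the same pattern, independent of the test harness (the function name and the demo command here are illustrative only):

```python
import subprocess
import sys
from threading import Thread
from typing import IO


def run_streamed(command: str, stream_output: bool = False) -> tuple[str, str]:
    # Line-buffered text pipes so lines arrive as soon as the child flushes them.
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    captured: dict[str, list[str]] = {"stdout": [], "stderr": []}

    def pump(stream: IO[str], name: str) -> None:
        # Each pipe gets its own reader thread; reading the pipes sequentially
        # can deadlock once the unread pipe's OS buffer fills up.
        for line in stream:
            captured[name].append(line)
            if stream_output:
                print(line, end="", file=sys.stdout if name == "stdout" else sys.stderr)

    threads = [
        Thread(target=pump, args=(process.stdout, "stdout")),
        Thread(target=pump, args=(process.stderr, "stderr")),
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    process.wait()

    if process.returncode != 0:
        raise RuntimeError(f"Command failed with error: {''.join(captured['stderr'])}")
    return "".join(captured["stdout"]), "".join(captured["stderr"])


if __name__ == "__main__":
    out, _ = run_streamed("echo hello && echo oops >&2", stream_output=True)
```

As in the change above, both reader threads are joined before the return code is checked, so the captured output is complete by the time the command is reported as finished or failed.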
diff --git a/backend/tests/regression/answer_quality/relari.py b/backend/tests/regression/answer_quality/relari.py
index 21e9d3838f25..c669d55c2530 100644
--- a/backend/tests/regression/answer_quality/relari.py
+++ b/backend/tests/regression/answer_quality/relari.py
@@ -10,51 +10,40 @@ from tests.regression.answer_quality.api_utils import get_answer_from_query
 from tests.regression.answer_quality.cli_utils import get_current_commit_sha
 
 
-def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+def _get_and_write_relari_outputs(
+    samples: list[dict], run_suffix: str, output_file_path: str
+) -> None:
     while not check_if_query_ready(run_suffix):
         time.sleep(5)
 
-    relari_outputs = []
-    for sample in samples:
-        retrieved_context, answer = get_answer_from_query(
-            query=sample["question"],
-            run_suffix=run_suffix,
-        )
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for sample in samples:
+            retrieved_context, answer = get_answer_from_query(
+                query=sample["question"],
+                run_suffix=run_suffix,
+            )
 
-        relari_outputs.append(
-            {
+            if not answer:
+                print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
+                continue
+
+            output = {
                 "label": sample["uid"],
                 "question": sample["question"],
                 "answer": answer,
                 "retrieved_context": retrieved_context,
             }
-        )
 
-    return relari_outputs
+            file.write(json.dumps(output) + "\n")
+            file.flush()
 
 
-def _write_output_file(
-    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
-) -> None:
+def _write_metadata_file(run_suffix: str, metadata_file_path: str) -> None:
     metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}
 
-    counter = 1
-    output_file_path = os.path.join(output_folder_path, "results.txt")
-    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
-    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
-        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
-        metadata_file_path = os.path.join(
-            output_folder_path, f"run_metadata_{counter}.txt"
-        )
-        counter += 1
-
-    print("saving question results to:", output_file_path)
     print("saving metadata to:", metadata_file_path)
     with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
-    with open(output_file_path, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-            file.flush()
 
 
 def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
@@ -72,14 +61,32 @@ def answer_relari_questions(
     run_suffix: str,
     limit: int | None = None,
 ) -> None:
+    results_file = "run_results.jsonl"
+    metadata_file = "run_metadata.yaml"
     samples = _read_questions_jsonl(questions_file_path)
 
     if limit is not None:
         samples = samples[:limit]
 
-    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+    counter = 1
+    output_file_path = os.path.join(results_folder_path, results_file)
+    metadata_file_path = os.path.join(results_folder_path, metadata_file)
+    while os.path.exists(output_file_path):
+        output_file_path = os.path.join(
+            results_folder_path,
+            results_file.replace("run_results", f"run_results_{counter}"),
+        )
+        metadata_file_path = os.path.join(
+            results_folder_path,
+            metadata_file.replace("run_metadata", f"run_metadata_{counter}"),
+        )
+        counter += 1
 
-    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+    print("saving question results to:", output_file_path)
+    _write_metadata_file(run_suffix, metadata_file_path)
+    _get_and_write_relari_outputs(
+        samples=samples, run_suffix=run_suffix, output_file_path=output_file_path
+    )
 
 
 def main() -> None:
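The relari.py changes above stop collecting every answer in memory and instead write each result to the output file as soon as it is available, flushing after every line so an interrupted run still leaves usable partial results; questions that get no answer are skipped with a printed notice. A small standalone sketch of that JSON-lines write/read round trip (the file name and sample values below are made up; the record keys mirror the ones visible in the diff):

```python
import json
from typing import Iterator


def append_jsonl_record(path: str, record: dict) -> None:
    # One JSON object per line; flushing after each write means results written
    # so far survive even if the run is interrupted part-way through.
    with open(path, "a", encoding="utf-8") as file:
        file.write(json.dumps(record) + "\n")
        file.flush()


def read_jsonl(path: str) -> Iterator[dict]:
    # Counterpart reader: every non-empty line parses as an independent document.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                yield json.loads(line)


if __name__ == "__main__":
    append_jsonl_record(
        "run_results_demo.jsonl",
        {
            "label": "sample-uid-1",
            "question": "What does the test harness index?",
            "answer": "An example answer string.",
            "retrieved_context": ["example chunk one", "example chunk two"],
        },
    )
    for record in read_jsonl("run_results_demo.jsonl"):
        print(record["label"], "->", record["answer"])
```

The counter loop added to `answer_relari_questions` serves the same goal across runs: if `run_results.jsonl` already exists in the results folder, the next run writes `run_results_1.jsonl` (and `run_metadata_1.yaml`), then `run_results_2.jsonl`, and so on, rather than overwriting earlier output.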
diff --git a/backend/tests/regression/answer_quality/search_test_config.yaml.template b/backend/tests/regression/answer_quality/search_test_config.yaml.template
index 2e8e7ca2728d..47310a3c373c 100644
--- a/backend/tests/regression/answer_quality/search_test_config.yaml.template
+++ b/backend/tests/regression/answer_quality/search_test_config.yaml.template
@@ -37,7 +37,7 @@ limit: null
 # LLM configuration
 llm:
   # Name of the LLM
-  name: "llm_name"
+  name: "default_test_llm"
 
   # Provider of the LLM (e.g., OpenAI)
   provider: "openai"
diff --git a/deployment/docker_compose/docker-compose.search-testing.yml b/deployment/docker_compose/docker-compose.search-testing.yml
index 186c47ebc3de..41eb50eaf8cf 100644
--- a/deployment/docker_compose/docker-compose.search-testing.yml
+++ b/deployment/docker_compose/docker-compose.search-testing.yml
@@ -23,6 +23,7 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"
@@ -52,6 +53,7 @@ services:
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
      - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"
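Both compose services now receive `ENV_SEED_CONFIGURATION` with an empty-string default (`${ENV_SEED_CONFIGURATION:-}`), so the variable is always defined inside the containers even when it is unset on the host. Consuming code typically treats an empty value the same as an unset one; a sketch of that convention (the JSON parsing below is an assumption about the seed payload, not something shown in this diff):

```python
import json
import os


def load_seed_configuration() -> dict | None:
    # `${ENV_SEED_CONFIGURATION:-}` guarantees the variable exists but may be "".
    raw = os.environ.get("ENV_SEED_CONFIGURATION", "")
    if not raw.strip():
        return None  # unset or defaulted to empty string: no seeding requested
    # Assumption: the seed value is a JSON document; adjust if the real format differs.
    return json.loads(raw)


if __name__ == "__main__":
    print(load_seed_configuration())
```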