Eval Script Incremental Write (#1784)
@@ -15,8 +15,9 @@ This Python script automates the process of running search quality tests for a b
 ## Usage

 1. Ensure you have the required dependencies installed.
-2. Configure the `search_test_config.yaml` file with your settings.
-3. Navigate to the answer_quality folder:
+2. Configure the `search_test_config.yaml` file based on the `search_test_config.yaml.template` file.
+3. Configure the `.env_eval` file with the correct environment variables.
+4. Navigate to the answer_quality folder:
 ```
 cd danswer/backend/tests/regression/answer_quality
 ```
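
A minimal sketch of step 2, assuming you are already inside the `answer_quality` folder: copy the template, then edit the copy. The one-liner below is illustrative, not a script from the repository.

```
import shutil

# Start search_test_config.yaml from the checked-in template, then edit it by hand.
shutil.copy("search_test_config.yaml.template", "search_test_config.yaml")
```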
@@ -63,6 +64,11 @@ Edit `search_test_config.yaml` to set:
 - llm
 Fill this out according to the normal LLM seeding

+
+To restart the evaluation using a particular index, set the suffix and turn off clean_up_docker_containers.
+This will also skip running the evaluation questions; in that case, the relari.py script can be run manually.
+
+
 Docker daemon must be running for this to work.

 Each script is able to be individually run to upload additional docs or run additional tests
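
A hedged sketch of the restart flow described above. Only `clean_up_docker_containers` is named in the README; the suffix key and the surrounding logic are assumptions added purely for illustration.

```
# Illustrative only: "existing_test_suffix" is an assumed key name, not from the repo.
config = {
    "existing_test_suffix": "_previous_run",  # reuse the index from an earlier run
    "clean_up_docker_containers": False,      # keep that stack alive between runs
}

if config["existing_test_suffix"] and not config["clean_up_docker_containers"]:
    # The orchestration would skip asking the evaluation questions here;
    # relari.py can then be run manually against the existing containers.
    print("Reusing run:", config["existing_test_suffix"])
```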
@@ -1,18 +1,51 @@
 import json
 import os
 import subprocess
+import sys
+from threading import Thread
+from typing import IO

 from retry import retry


-def _run_command(command: str) -> tuple[str, str]:
+def _run_command(command: str, stream_output: bool = False) -> tuple[str, str]:
     process = subprocess.Popen(
-        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,
     )
-    stdout, stderr = process.communicate()
+
+    stdout_lines: list[str] = []
+    stderr_lines: list[str] = []
+
+    def process_stream(stream: IO[str], lines: list[str]) -> None:
+        for line in stream:
+            lines.append(line)
+            if stream_output:
+                print(
+                    line,
+                    end="",
+                    file=sys.stdout if stream == process.stdout else sys.stderr,
+                )
+
+    stdout_thread = Thread(target=process_stream, args=(process.stdout, stdout_lines))
+    stderr_thread = Thread(target=process_stream, args=(process.stderr, stderr_lines))
+
+    stdout_thread.start()
+    stderr_thread.start()
+
+    stdout_thread.join()
+    stderr_thread.join()
+
+    process.wait()
+
     if process.returncode != 0:
-        raise RuntimeError(f"Command failed with error: {stderr.decode()}")
-    return stdout.decode(), stderr.decode()
+        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
+
+    return "".join(stdout_lines), "".join(stderr_lines)


 def get_current_commit_sha() -> str:
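
The rewrite above replaces the blocking `communicate()` call with one reader thread per pipe, so `_run_command` can stream output to the console as it arrives while still returning the captured text. A minimal, self-contained sketch of the same tee-while-capturing pattern (the function name and command below are illustrative, not from the repository):

```
import subprocess
import sys
from threading import Thread
from typing import IO


def run_and_tee(command: str) -> tuple[str, str]:
    # Line-buffered text mode so the pipes yield complete lines as they are produced.
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    out_lines: list[str] = []
    err_lines: list[str] = []

    def pump(stream: IO[str], sink: list[str], dest: IO[str]) -> None:
        for line in stream:                 # iterating a text pipe yields one line at a time
            sink.append(line)               # keep a copy for the caller
            print(line, end="", file=dest)  # and echo it immediately

    threads = [
        Thread(target=pump, args=(process.stdout, out_lines, sys.stdout)),
        Thread(target=pump, args=(process.stderr, err_lines, sys.stderr)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    process.wait()
    return "".join(out_lines), "".join(err_lines)


if __name__ == "__main__":
    out, _ = run_and_tee("echo hello")
    assert out.strip() == "hello"
```

Using one thread per pipe also avoids the classic deadlock where a single reader drains stdout while the child blocks on a full stderr buffer.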
@@ -92,8 +125,8 @@ def start_docker_compose(

     print("Docker Command:\n", command)

-    _run_command(command)
-    print("The Docker has been Composed :)")
+    _run_command(command, stream_output=True)
+    print("Containers have been launched")


 def cleanup_docker(run_suffix: str) -> None:
@@ -10,51 +10,40 @@ from tests.regression.answer_quality.api_utils import get_answer_from_query
 from tests.regression.answer_quality.cli_utils import get_current_commit_sha


-def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+def _get_and_write_relari_outputs(
+    samples: list[dict], run_suffix: str, output_file_path: str
+) -> None:
     while not check_if_query_ready(run_suffix):
         time.sleep(5)

-    relari_outputs = []
-    for sample in samples:
-        retrieved_context, answer = get_answer_from_query(
-            query=sample["question"],
-            run_suffix=run_suffix,
-        )
-
-        relari_outputs.append(
-            {
-                "label": sample["uid"],
-                "question": sample["question"],
-                "answer": answer,
-                "retrieved_context": retrieved_context,
-            }
-        )
-
-    return relari_outputs
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for sample in samples:
+            retrieved_context, answer = get_answer_from_query(
+                query=sample["question"],
+                run_suffix=run_suffix,
+            )
+
+            if not answer:
+                print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
+                continue
+
+            output = {
+                "label": sample["uid"],
+                "question": sample["question"],
+                "answer": answer,
+                "retrieved_context": retrieved_context,
+            }
+
+            file.write(json.dumps(output) + "\n")
+            file.flush()


-def _write_output_file(
-    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
-) -> None:
+def _write_metadata_file(run_suffix: str, metadata_file_path: str) -> None:
     metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}

-    counter = 1
-    output_file_path = os.path.join(output_folder_path, "results.txt")
-    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
-    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
-        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
-        metadata_file_path = os.path.join(
-            output_folder_path, f"run_metadata_{counter}.txt"
-        )
-        counter += 1
-    print("saving question results to:", output_file_path)
     print("saving metadata to:", metadata_file_path)
     with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
-    with open(output_file_path, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-            file.flush()


 def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
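
This hunk is the "incremental write" in the commit title: instead of accumulating every answer in memory and writing a results file at the end, `_get_and_write_relari_outputs` serializes each answer as one JSON line and flushes it as soon as it is available, so an interrupted run still leaves the completed answers on disk. A small standalone sketch of the pattern (helper names are illustrative):

```
import json


def write_results_incrementally(records: list[dict], output_file_path: str) -> None:
    with open(output_file_path, "w", encoding="utf-8") as file:
        for record in records:
            file.write(json.dumps(record) + "\n")
            file.flush()  # persist this line before working on the next record


def read_results(output_file_path: str) -> list[dict]:
    # JSONL reads back one record per line, so partially completed runs still load.
    with open(output_file_path, encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]
```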
@@ -72,14 +61,32 @@ def answer_relari_questions(
     run_suffix: str,
     limit: int | None = None,
 ) -> None:
+    results_file = "run_results.jsonl"
+    metadata_file = "run_metadata.yaml"
     samples = _read_questions_jsonl(questions_file_path)

     if limit is not None:
         samples = samples[:limit]

-    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+    counter = 1
+    output_file_path = os.path.join(results_folder_path, results_file)
+    metadata_file_path = os.path.join(results_folder_path, metadata_file)
+    while os.path.exists(output_file_path):
+        output_file_path = os.path.join(
+            results_folder_path,
+            results_file.replace("run_results", f"run_results_{counter}"),
+        )
+        metadata_file_path = os.path.join(
+            results_folder_path,
+            metadata_file.replace("run_metadata", f"run_metadata_{counter}"),
+        )
+        counter += 1

-    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+    print("saving question results to:", output_file_path)
+    _write_metadata_file(run_suffix, metadata_file_path)
+    _get_and_write_relari_outputs(
+        samples=samples, run_suffix=run_suffix, output_file_path=output_file_path
+    )


 def main() -> None:
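
The counter loop above keeps earlier runs from being overwritten: if `run_results.jsonl` already exists in the results folder, the new run writes `run_results_1.jsonl`, then `run_results_2.jsonl`, and so on. A compact sketch of the same naming scheme (the helper name is illustrative):

```
import os


def next_free_results_path(folder: str, base_name: str = "run_results.jsonl") -> str:
    candidate = os.path.join(folder, base_name)
    counter = 1
    while os.path.exists(candidate):
        # run_results.jsonl -> run_results_1.jsonl -> run_results_2.jsonl -> ...
        candidate = os.path.join(
            folder, base_name.replace("run_results", f"run_results_{counter}")
        )
        counter += 1
    return candidate
```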
@@ -37,7 +37,7 @@ limit: null
 # LLM configuration
 llm:
   # Name of the LLM
-  name: "llm_name"
+  name: "default_test_llm"

   # Provider of the LLM (e.g., OpenAI)
   provider: "openai"
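
The test scripts read these values from `search_test_config.yaml`; below is a minimal sketch of consuming the `llm` block with PyYAML. Only the keys visible in this hunk are used; how the repository actually loads the file is an assumption.

```
import yaml

with open("search_test_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

llm_name = config["llm"]["name"]          # e.g. "default_test_llm"
llm_provider = config["llm"]["provider"]  # e.g. "openai"
limit = config.get("limit")               # "limit: null" in the template -> None here
print(f"Using {llm_provider}/{llm_name}, question limit: {limit}")
```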
@@ -23,6 +23,7 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"
@@ -52,6 +53,7 @@ services:
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
       - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"