Eval Script Incremental Write (#1784)

Author: Yuhong Sun
Date: 2024-07-06 15:43:40 -07:00
Committed by: GitHub
Parent: d35d7ee833
Commit: f0888f2f61

5 changed files with 88 additions and 40 deletions


@@ -15,8 +15,9 @@ This Python script automates the process of running search quality tests for a b
 ## Usage
 1. Ensure you have the required dependencies installed.
-2. Configure the `search_test_config.yaml` file with your settings.
-3. Navigate to the answer_quality folder:
+2. Configure the `search_test_config.yaml` file based on the `search_test_config.yaml.template` file.
+3. Configure the `.env_eval` file with the correct environment variables.
+4. Navigate to the answer_quality folder:
 ```
 cd danswer/backend/tests/regression/answer_quality
 ```
@@ -63,6 +64,11 @@ Edit `search_test_config.yaml` to set:
 - llm
   Fill this out according to the normal LLM seeding
+
+To restart the evaluation using a particular existing index, set the suffix and turn off clean_up_docker_containers.
+This will also skip running the evaluation questions; in that case, the relari.py script can be run manually.
+The Docker daemon must be running for this to work.
+
+Each script can also be run individually to upload additional docs or run additional tests.
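For orientation, here is a rough sketch of the restart flow described above. The `clean_up_docker_containers` key is named in this README; the suffix key name used here (`existing_test_suffix`) is an assumption and may differ in the actual `search_test_config.yaml.template`.

```python
import yaml  # requires PyYAML

# Hypothetical helper: reuse an existing index by pointing the config at its
# suffix and keeping its Docker containers alive between runs.
# "existing_test_suffix" is an assumed key name; "clean_up_docker_containers"
# comes from this README.
CONFIG_PATH = "search_test_config.yaml"

with open(CONFIG_PATH, encoding="utf-8") as f:
    config = yaml.safe_load(f)

config["existing_test_suffix"] = "_1234"      # suffix of the run/index to reuse
config["clean_up_docker_containers"] = False  # keep containers so the index survives

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    yaml.dump(config, f)
```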


@@ -1,18 +1,51 @@
 import json
 import os
 import subprocess
+import sys
+from threading import Thread
+from typing import IO

 from retry import retry


-def _run_command(command: str) -> tuple[str, str]:
+def _run_command(command: str, stream_output: bool = False) -> tuple[str, str]:
     process = subprocess.Popen(
-        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,
     )
-    stdout, stderr = process.communicate()
+
+    stdout_lines: list[str] = []
+    stderr_lines: list[str] = []
+
+    def process_stream(stream: IO[str], lines: list[str]) -> None:
+        for line in stream:
+            lines.append(line)
+            if stream_output:
+                print(
+                    line,
+                    end="",
+                    file=sys.stdout if stream == process.stdout else sys.stderr,
+                )
+
+    stdout_thread = Thread(target=process_stream, args=(process.stdout, stdout_lines))
+    stderr_thread = Thread(target=process_stream, args=(process.stderr, stderr_lines))
+    stdout_thread.start()
+    stderr_thread.start()
+    stdout_thread.join()
+    stderr_thread.join()
+
+    process.wait()
+
     if process.returncode != 0:
-        raise RuntimeError(f"Command failed with error: {stderr.decode()}")
-    return stdout.decode(), stderr.decode()
+        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
+
+    return "".join(stdout_lines), "".join(stderr_lines)


 def get_current_commit_sha() -> str:
@@ -92,8 +125,8 @@ def start_docker_compose(
print("Docker Command:\n", command)
_run_command(command)
print("The Docker has been Composed :)")
_run_command(command, stream_output=True)
print("Containers have been launched")
def cleanup_docker(run_suffix: str) -> None:
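For context, below is a self-contained sketch of the pattern the new `_run_command` uses: one reader thread per pipe, so stdout and stderr are captured fully and can be echoed live without deadlocking on a full pipe buffer. The helper name and the demo command are illustrative only and are not part of this commit.

```python
import subprocess
import sys
from threading import Thread
from typing import IO


def run_streaming(command: str, stream_output: bool = True) -> tuple[str, str]:
    # Line-buffered text pipes so output can be relayed as it arrives.
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    stdout_lines: list[str] = []
    stderr_lines: list[str] = []

    def pump(stream: IO[str], lines: list[str], sink: IO[str]) -> None:
        # Drain one pipe on its own thread so the other pipe never blocks.
        for line in stream:
            lines.append(line)
            if stream_output:
                print(line, end="", file=sink)

    threads = [
        Thread(target=pump, args=(process.stdout, stdout_lines, sys.stdout)),
        Thread(target=pump, args=(process.stderr, stderr_lines, sys.stderr)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    process.wait()

    if process.returncode != 0:
        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
    return "".join(stdout_lines), "".join(stderr_lines)


if __name__ == "__main__":
    # Demo command chosen for portability; a real caller would pass e.g. a
    # docker compose invocation.
    out, _ = run_streaming(f'"{sys.executable}" -c "print(42)"')
    print("captured:", out.strip())
```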


@@ -10,51 +10,40 @@ from tests.regression.answer_quality.api_utils import get_answer_from_query
 from tests.regression.answer_quality.cli_utils import get_current_commit_sha


-def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+def _get_and_write_relari_outputs(
+    samples: list[dict], run_suffix: str, output_file_path: str
+) -> None:
     while not check_if_query_ready(run_suffix):
         time.sleep(5)

-    relari_outputs = []
-    for sample in samples:
-        retrieved_context, answer = get_answer_from_query(
-            query=sample["question"],
-            run_suffix=run_suffix,
-        )
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for sample in samples:
+            retrieved_context, answer = get_answer_from_query(
+                query=sample["question"],
+                run_suffix=run_suffix,
+            )

-        relari_outputs.append(
-            {
+            if not answer:
+                print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
+                continue
+
+            output = {
                 "label": sample["uid"],
                 "question": sample["question"],
                 "answer": answer,
                 "retrieved_context": retrieved_context,
             }
-        )
-
-    return relari_outputs
+            file.write(json.dumps(output) + "\n")
+            file.flush()


-def _write_output_file(
-    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
-) -> None:
+def _write_metadata_file(run_suffix: str, metadata_file_path: str) -> None:
     metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}
-
-    counter = 1
-    output_file_path = os.path.join(output_folder_path, "results.txt")
-    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
-    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
-        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
-        metadata_file_path = os.path.join(
-            output_folder_path, f"run_metadata_{counter}.txt"
-        )
-        counter += 1
-
-    print("saving question results to:", output_file_path)
     print("saving metadata to:", metadata_file_path)
     with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
-
-    with open(output_file_path, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-            file.flush()


 def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
@@ -72,14 +61,32 @@ def answer_relari_questions(
     run_suffix: str,
     limit: int | None = None,
 ) -> None:
+    results_file = "run_results.jsonl"
+    metadata_file = "run_metadata.yaml"
     samples = _read_questions_jsonl(questions_file_path)

     if limit is not None:
         samples = samples[:limit]

-    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+    counter = 1
+    output_file_path = os.path.join(results_folder_path, results_file)
+    metadata_file_path = os.path.join(results_folder_path, metadata_file)
+    while os.path.exists(output_file_path):
+        output_file_path = os.path.join(
+            results_folder_path,
+            results_file.replace("run_results", f"run_results_{counter}"),
+        )
+        metadata_file_path = os.path.join(
+            results_folder_path,
+            metadata_file.replace("run_metadata", f"run_metadata_{counter}"),
+        )
+        counter += 1

-    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+    print("saving question results to:", output_file_path)
+    _write_metadata_file(run_suffix, metadata_file_path)
+    _get_and_write_relari_outputs(
+        samples=samples, run_suffix=run_suffix, output_file_path=output_file_path
+    )


 def main() -> None:
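As a rough illustration of the incremental-write behavior this commit introduces: first pick a results path that does not clobber earlier runs, then write one JSON line per answered question and flush immediately so partial results survive an interrupted run. The helper name, folder, and sample data below are made up for illustration; the file-name scheme and flush-per-line behavior mirror the diff.

```python
import json
import os


def pick_fresh_path(folder: str, base: str = "run_results.jsonl") -> str:
    # Mirrors the diff's scheme: run_results.jsonl, run_results_1.jsonl, ...
    path = os.path.join(folder, base)
    counter = 1
    while os.path.exists(path):
        path = os.path.join(
            folder, base.replace("run_results", f"run_results_{counter}")
        )
        counter += 1
    return path


# Illustrative samples; the real script pulls answers from the running stack.
samples = [
    {"uid": "q1", "question": "What does the eval script do?", "answer": "Runs QA tests."},
    {"uid": "q2", "question": "Unanswerable question", "answer": ""},
]

output_path = pick_fresh_path(".")
with open(output_path, "w", encoding="utf-8") as f:
    for sample in samples:
        if not sample["answer"]:
            print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
            continue
        f.write(json.dumps(sample) + "\n")
        f.flush()  # flush per line so an interrupted run still leaves results on disk
print("saved incremental results to:", output_path)
```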


@@ -37,7 +37,7 @@ limit: null
 # LLM configuration
 llm:
   # Name of the LLM
-  name: "llm_name"
+  name: "default_test_llm"
   # Provider of the LLM (e.g., OpenAI)
   provider: "openai"


@@ -23,6 +23,7 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"
@@ -52,6 +53,7 @@ services:
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
       - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"