Use SHA instead of branch and save more data (#1850)

hagen-danswer 2024-07-17 14:56:24 -07:00 committed by GitHub
parent 1c77395503
commit e93de602c3
6 changed files with 63 additions and 31 deletions

View File

@@ -53,8 +53,10 @@ Edit `search_test_config.yaml` to set:
   - The path to the zip file containing the files you'd like to test against
 - questions_file
   - The path to the yaml containing the questions you'd like to test with
-- branch
-  - Set the branch to null if you want it to just use the code as is
+- commit_sha
+  - Set this to the SHA of the commit you want to run the test against
+  - You must clear all local changes if you want to use this option
+  - Set this to null if you want it to just use the code as is
 - clean_up_docker_containers
   - Set this to true to automatically delete all docker containers, networks and volumes after the test
 - launch_web_ui
@@ -71,7 +73,7 @@ Edit `search_test_config.yaml` to set:
 - existing_test_suffix
   - Use this if you would like to relaunch a previous test instance
   - Input the suffix of the test you'd like to re-launch
-  - (E.g. to use the data from folder "test_1234_5678" put "_1234_5678")
+  - (E.g. to use the data from folder "test-1234-5678" put "-1234-5678")
   - No new files will automatically be uploaded
   - Leave empty to run a new test
 - limit
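The "clear all local changes" caveat exists because `git checkout <sha>` can fail, or drag modified files along, when the working tree is dirty. A minimal sketch of a pre-flight check a caller could run first (hypothetical helper, not part of this commit, which leaves the cleanup to the user):

import subprocess

def working_tree_is_clean() -> bool:
    # `git status --porcelain` prints one line per modified or untracked file,
    # so empty output means a checkout of commit_sha can proceed safely.
    result = subprocess.run(
        ["git", "status", "--porcelain"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip() == ""

if not working_tree_is_clean():
    raise RuntimeError("Commit or stash local changes before setting commit_sha")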

View File

@@ -39,7 +39,7 @@ def _create_new_chat_session(run_suffix: str) -> int:
     raise RuntimeError(response_json)


-@retry(tries=15, delay=10, jitter=1)
+@retry(tries=10, delay=10)
 def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     filters = IndexFilters(
         source_type=None,
@@ -81,10 +81,16 @@ def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     return simple_search_docs, answer


 @retry(tries=10, delay=10)
 def check_if_query_ready(run_suffix: str) -> bool:
     url = _api_url_builder(run_suffix, "/manage/admin/connector/indexing-status/")
-    indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    try:
+        indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    except Exception as e:
+        print("Failed to check indexing status, API server is likely starting up:")
+        print(f"\t {str(e)}")
+        print("trying again")
+        raise e

     ongoing_index_attempts = False
     doc_count = 0
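The catch-print-re-raise pattern above is what makes the decorator useful here: the prints leave a trace of each failed poll, and re-raising hands the exception back to the decorator so the call is retried after the delay. A minimal self-contained sketch of the same pattern (this looks like the PyPI `retry` package, though that is inferred from the `tries`/`delay` keywords; `delay` is shortened so the demo runs quickly):

from retry import retry  # assumed: the PyPI `retry` package

attempts = {"n": 0}

@retry(tries=10, delay=1)  # the tests use delay=10; 1s keeps the demo fast
def flaky_status_check() -> dict:
    attempts["n"] += 1
    try:
        if attempts["n"] < 3:  # stand-in for the API server still starting up
            raise ConnectionError("connection refused")
        return {"status": "ready"}
    except Exception as e:
        print(f"Attempt {attempts['n']} failed: {e}, trying again")
        raise e  # re-raising is what triggers the next retry

print(flaky_status_check())  # succeeds on the third attempt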

View File

@@ -60,11 +60,10 @@ def get_current_commit_sha() -> str:
     return sha


-def switch_to_branch(branch: str) -> None:
-    print(f"Switching to branch: {branch}...")
-    _run_command(f"git checkout {branch}")
-    _run_command("git pull")
-    print(f"Successfully switched to branch: {branch}")
+def switch_to_commit(commit_sha: str) -> None:
+    print(f"Switching to commit: {commit_sha}...")
+    _run_command(f"git checkout {commit_sha}")
+    print(f"Successfully switched to commit: {commit_sha}")
     print("Repository updated successfully.")
@@ -77,7 +76,7 @@ def get_docker_container_env_vars(suffix: str) -> dict:
     combined_env_vars = {}
     for container_type in ["background", "api_server"]:
         container_name = _run_command(
-            f"docker ps -a --format '{{{{.Names}}}}' | grep '{container_type}' | grep '{suffix}'"
+            f"docker ps -a --format '{{{{.Names}}}}' | awk '/{container_type}/ && /{suffix}/'"
         )[0].strip()
         if not container_name:
             raise RuntimeError(
@@ -93,7 +92,6 @@ def get_docker_container_env_vars(suffix: str) -> dict:
             key, value = env_var.split("=", 1)
            combined_env_vars[key] = value

-    print(f"Combined env variables: {combined_env_vars}")
     return combined_env_vars
@@ -117,8 +115,8 @@ def manage_data_directories(suffix: str, base_path: str, use_cloud_gpu: bool) ->
         os.makedirs(directory, exist_ok=True)
         os.environ[env_var] = directory
         print(f"Set {env_var} to: {directory}")
-    relari_output_path = os.path.join(target_path, "relari_output/")
-    os.makedirs(relari_output_path, exist_ok=True)
+    results_output_path = os.path.join(target_path, "evaluations_output/")
+    os.makedirs(results_output_path, exist_ok=True)


 def set_env_variables(
@@ -287,7 +285,7 @@ def is_vespa_container_healthy(suffix: str) -> bool:
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
@@ -313,7 +311,7 @@ def restart_vespa_container(suffix: str) -> None:
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
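A note on the grep-to-awk swap (the motivation is inferred, not stated in the commit): `grep` exits with status 1 when nothing matches, so a `docker ps | grep a | grep b` pipeline fails outright while containers are still coming up, whereas `awk '/a/ && /b/'` applies both patterns in a single process and exits 0 either way. A runnable comparison, with a hypothetical container listing standing in for the `docker ps` output:

import subprocess

names = "api_server-test-1234\nbackground-test-1234\n"

# grep exits 1 on no match, which a strict command runner treats as failure.
grep = subprocess.run(
    "grep 'vespa' | grep 'test-1234'",
    input=names, shell=True, text=True, capture_output=True,
)
# awk reports "no matching lines" by printing nothing, still exiting 0.
awk = subprocess.run(
    "awk '/vespa/ && /test-1234/'",
    input=names, shell=True, text=True, capture_output=True,
)
print(grep.returncode, repr(grep.stdout))  # 1 ''
print(awk.returncode, repr(awk.stdout))    # 0 ''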

View File

@@ -8,7 +8,7 @@ from tests.regression.answer_quality.cli_utils import cleanup_docker
 from tests.regression.answer_quality.cli_utils import manage_data_directories
 from tests.regression.answer_quality.cli_utils import set_env_variables
 from tests.regression.answer_quality.cli_utils import start_docker_compose
-from tests.regression.answer_quality.cli_utils import switch_to_branch
+from tests.regression.answer_quality.cli_utils import switch_to_commit
 from tests.regression.answer_quality.file_uploader import upload_test_files
 from tests.regression.answer_quality.run_qa import run_qa_test_and_save_results
@@ -36,8 +36,8 @@ def main() -> None:
         config.llm,
     )
     manage_data_directories(run_suffix, config.output_folder, config.use_cloud_gpu)
-    if config.branch:
-        switch_to_branch(config.branch)
+    if config.commit_sha:
+        switch_to_commit(config.commit_sha)
     start_docker_compose(run_suffix, config.launch_web_ui, config.use_cloud_gpu)

View File

@@ -1,6 +1,7 @@
 import json
 import multiprocessing
 import os
+import shutil
 import time

 import yaml
@@ -22,12 +23,12 @@ def _populate_results_file(output_folder_path: str, all_qa_output: list[dict]) -
             file.flush()


-def _update_metadata_file(test_output_folder: str, count: int) -> None:
+def _update_metadata_file(test_output_folder: str, invalid_answer_count: int) -> None:
     metadata_path = os.path.join(test_output_folder, METADATA_FILENAME)
     with open(metadata_path, "r", encoding="utf-8") as file:
         metadata = yaml.safe_load(file)

-    metadata["number_of_questions_asked"] = count
+    metadata["number_of_failed_questions"] = invalid_answer_count

     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
@@ -45,7 +46,7 @@ def _get_test_output_folder(config: dict) -> str:
     base_output_folder = os.path.expanduser(config["output_folder"])
     if config["run_suffix"]:
         base_output_folder = os.path.join(
-            base_output_folder, ("test" + config["run_suffix"]), "relari_output"
+            base_output_folder, ("test" + config["run_suffix"]), "evaluations_output"
         )
     else:
         base_output_folder = os.path.join(base_output_folder, "no_defined_suffix")
@@ -69,7 +70,9 @@ def _get_test_output_folder(config: dict) -> str:
 def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     test_output_folder = _get_test_output_folder(config)
-    questions = _read_questions_jsonl(config["questions_file"])
+    questions_file_path = config["questions_file"]
+    questions = _read_questions_jsonl(questions_file_path)

     metadata = {
         "commit_sha": get_current_commit_sha(),
@@ -91,6 +94,23 @@ def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)

+    copied_questions_file_path = os.path.join(
+        test_output_folder, os.path.basename(questions_file_path)
+    )
+    shutil.copy2(questions_file_path, copied_questions_file_path)
+
+    zipped_files_path = config["zipped_documents_file"]
+    copied_zipped_documents_path = os.path.join(
+        test_output_folder, os.path.basename(zipped_files_path)
+    )
+    shutil.copy2(zipped_files_path, copied_zipped_documents_path)
+
+    zipped_files_folder = os.path.dirname(zipped_files_path)
+    jsonl_file_path = os.path.join(zipped_files_folder, "target_docs.jsonl")
+    if os.path.exists(jsonl_file_path):
+        copied_jsonl_path = os.path.join(test_output_folder, "target_docs.jsonl")
+        shutil.copy2(jsonl_file_path, copied_jsonl_path)
+
     return test_output_folder, questions
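For context on the new copy calls (this is the "save more data" half of the commit): `shutil.copy2` is `copyfile` plus `copystat`, so the archived inputs keep their original modification times and permission bits. A small standalone illustration, with made-up file names:

import os
import shutil
import tempfile

with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:
    question_file = os.path.join(src, "sample_questions.yaml")
    with open(question_file, "w", encoding="utf-8") as f:
        f.write("questions: []\n")

    # Contents and metadata (mtime, mode) both survive the copy.
    copied = shutil.copy2(question_file, os.path.join(dst, "sample_questions.yaml"))
    assert os.path.getmtime(copied) == os.path.getmtime(question_file)
    print("archived", copied)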
@@ -138,18 +158,23 @@ def _process_and_write_query_results(config: dict) -> None:
     _populate_results_file(test_output_folder, results)

-    valid_answer_count = 0
+    invalid_answer_count = 0
     for result in results:
-        if result.get("answer"):
-            valid_answer_count += 1
+        if not result.get("answer"):
+            invalid_answer_count += 1

-    _update_metadata_file(test_output_folder, valid_answer_count)
+    _update_metadata_file(test_output_folder, invalid_answer_count)
+
+    if invalid_answer_count:
+        print(f"Warning: {invalid_answer_count} questions failed!")
+        print("Suggest restarting the vespa container and rerunning")

     time_to_finish = time.time() - start_time
     minutes, seconds = divmod(int(time_to_finish), 60)
     print(
         f"Took {minutes:02d}:{seconds:02d} to ask and answer {len(results)} questions"
     )
     print("saved test results to folder:", test_output_folder)


 def run_qa_test_and_save_results(run_suffix: str = "") -> None:
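The mm:ss formatting in the hunk above comes from `divmod` plus zero-padded f-strings; a tiny worked example with an arbitrary elapsed time:

elapsed = 383.7  # seconds, e.g. time.time() - start_time
minutes, seconds = divmod(int(elapsed), 60)  # 383 // 60 = 6, 383 % 60 = 23
print(f"{minutes:02d}:{seconds:02d}")  # -> 06:23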

View File

@@ -10,8 +10,8 @@ zipped_documents_file: "~/sampledocs.zip"
 # Path to the YAML file containing sample questions
 questions_file: "~/sample_questions.yaml"

-# Git branch to use (null means use current branch as is)
-branch: null
+# Git commit SHA to use (null means use current code as is)
+commit_sha: null

 # Whether to remove Docker containers after the test
 clean_up_docker_containers: true
@@ -28,7 +28,8 @@ model_server_ip: "PUT_PUBLIC_CLOUD_IP_HERE"
 # Port of the model server (placeholder)
 model_server_port: "PUT_PUBLIC_CLOUD_PORT_HERE"

-# Suffix for existing test results (empty string means no suffix)
+# Suffix for existing test results (E.g. -1234-5678)
+# empty string means no suffix
 existing_test_suffix: ""

 # Limit on number of tests to run (null means no limit)
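A minimal sketch of how a harness like this one might consume the renamed key (the loader shown here is hypothetical; only the `commit_sha` key and its null-means-use-as-is semantics come from the config above):

import subprocess
import yaml

with open("search_test_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

commit_sha = config.get("commit_sha")  # YAML null loads as Python None
if commit_sha:
    # Mirrors switch_to_commit: pin the working tree to the requested commit.
    subprocess.run(["git", "checkout", commit_sha], check=True)
else:
    print("commit_sha is null; testing the code as it currently is")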