From e93de602c383cdcdd5cd9fc27b10ad3d983f7972 Mon Sep 17 00:00:00 2001
From: hagen-danswer
Date: Wed, 17 Jul 2024 14:56:24 -0700
Subject: [PATCH] Use SHA instead of branch and save more data (#1850)

---
 .../tests/regression/answer_quality/README.md |  8 ++--
 .../regression/answer_quality/api_utils.py    | 12 ++++--
 .../regression/answer_quality/cli_utils.py    | 20 ++++-----
 .../answer_quality/run_eval_pipeline.py       |  6 +--
 .../tests/regression/answer_quality/run_qa.py | 41 +++++++++++++++----
 .../search_test_config.yaml.template          |  7 ++--
 6 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/backend/tests/regression/answer_quality/README.md b/backend/tests/regression/answer_quality/README.md
index fa1c2ba44..d7f5b9ae0 100644
--- a/backend/tests/regression/answer_quality/README.md
+++ b/backend/tests/regression/answer_quality/README.md
@@ -53,8 +53,10 @@ Edit `search_test_config.yaml` to set:
   - The path to the zip file containing the files you'd like to test against
 - questions_file
   - The path to the yaml containing the questions you'd like to test with
-- branch
-  - Set the branch to null if you want it to just use the code as is
+- commit_sha
+  - Set this to the SHA of the commit you want to run the test against
+  - You must clear all local changes if you want to use this option
+  - Set this to null if you want it to just use the code as is
 - clean_up_docker_containers
   - Set this to true to automatically delete all docker containers, networks and volumes after the test
 - launch_web_ui
@@ -71,7 +73,7 @@ Edit `search_test_config.yaml` to set:
 - existing_test_suffix
   - Use this if you would like to relaunch a previous test instance
   - Input the suffix of the test you'd like to re-launch
-  - (E.g. to use the data from folder "test_1234_5678" put "_1234_5678")
+  - (E.g. to use the data from folder "test-1234-5678" put "-1234-5678")
   - No new files will automatically be uploaded
   - Leave empty to run a new test
 - limit
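A note on the commit_sha option documented above: `git checkout <sha>` refuses to switch when tracked files have local modifications, which is why the working tree must be clean. A minimal pre-flight check along these lines could surface that failure early (a hypothetical helper, not part of this patch):

    import subprocess

    def assert_clean_working_tree() -> None:
        # Hypothetical guard: `git status --porcelain` prints one line per
        # modified or untracked file; empty output means the checkout is safe.
        result = subprocess.run(
            ["git", "status", "--porcelain"],
            capture_output=True, text=True, check=True,
        )
        if result.stdout.strip():
            raise RuntimeError(
                "Local changes detected; commit, stash, or discard them "
                "before setting commit_sha in search_test_config.yaml."
            )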
diff --git a/backend/tests/regression/answer_quality/api_utils.py b/backend/tests/regression/answer_quality/api_utils.py
index 56529e9fe..0f06844cc 100644
--- a/backend/tests/regression/answer_quality/api_utils.py
+++ b/backend/tests/regression/answer_quality/api_utils.py
@@ -39,7 +39,7 @@ def _create_new_chat_session(run_suffix: str) -> int:
     raise RuntimeError(response_json)
 
 
-@retry(tries=15, delay=10, jitter=1)
+@retry(tries=10, delay=10)
 def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     filters = IndexFilters(
         source_type=None,
@@ -81,10 +81,16 @@ def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     return simple_search_docs, answer
 
 
+@retry(tries=10, delay=10)
 def check_if_query_ready(run_suffix: str) -> bool:
     url = _api_url_builder(run_suffix, "/manage/admin/connector/indexing-status/")
-
-    indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    try:
+        indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    except Exception as e:
+        print("Failed to check indexing status, API server is likely starting up:")
+        print(f"\t {str(e)}")
+        print("Trying again...")
+        raise e
 
     ongoing_index_attempts = False
     doc_count = 0
diff --git a/backend/tests/regression/answer_quality/cli_utils.py b/backend/tests/regression/answer_quality/cli_utils.py
index 47efaf76e..29564694a 100644
--- a/backend/tests/regression/answer_quality/cli_utils.py
+++ b/backend/tests/regression/answer_quality/cli_utils.py
@@ -60,11 +60,10 @@ def get_current_commit_sha() -> str:
     return sha
 
 
-def switch_to_branch(branch: str) -> None:
-    print(f"Switching to branch: {branch}...")
-    _run_command(f"git checkout {branch}")
-    _run_command("git pull")
-    print(f"Successfully switched to branch: {branch}")
+def switch_to_commit(commit_sha: str) -> None:
+    print(f"Switching to commit: {commit_sha}...")
+    _run_command(f"git checkout {commit_sha}")
+    print(f"Successfully switched to commit: {commit_sha}")
     print("Repository updated successfully.")
 
 
@@ -77,7 +76,7 @@ def get_docker_container_env_vars(suffix: str) -> dict:
     combined_env_vars = {}
     for container_type in ["background", "api_server"]:
         container_name = _run_command(
-            f"docker ps -a --format '{{{{.Names}}}}' | grep '{container_type}' | grep '{suffix}'"
+            f"docker ps -a --format '{{{{.Names}}}}' | awk '/{container_type}/ && /{suffix}/'"
         )[0].strip()
         if not container_name:
             raise RuntimeError(
@@ -93,7 +92,6 @@ def get_docker_container_env_vars(suffix: str) -> dict:
             key, value = env_var.split("=", 1)
             combined_env_vars[key] = value
 
-    print(f"Combined env variables: {combined_env_vars}")
     return combined_env_vars
 
 
@@ -117,8 +115,8 @@ def manage_data_directories(suffix: str, base_path: str, use_cloud_gpu: bool) ->
         os.makedirs(directory, exist_ok=True)
         os.environ[env_var] = directory
         print(f"Set {env_var} to: {directory}")
-    relari_output_path = os.path.join(target_path, "relari_output/")
-    os.makedirs(relari_output_path, exist_ok=True)
+    results_output_path = os.path.join(target_path, "evaluations_output/")
+    os.makedirs(results_output_path, exist_ok=True)
 
 
 def set_env_variables(
@@ -287,7 +285,7 @@ def is_vespa_container_healthy(suffix: str) -> bool:
 
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
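For context on the consolidation above: `grep vespa | grep {suffix}` and `awk '/vespa/ && /{suffix}/'` select the same lines (those matching both patterns), but awk does it in a single process. The single quotes around the awk program matter; without them the shell splits the pipeline at `&&` and tries to execute `/{suffix}/` as a command. A rough sketch of the quoting, with a made-up suffix:

    import subprocess

    suffix = "-1234-5678"  # hypothetical run suffix
    # The awk program is quoted so the shell passes '/vespa/ && /-1234-5678/'
    # to awk as one argument instead of splitting the pipeline at `&&`.
    cmd = f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
    stdout = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout
    print(stdout.strip())  # matching container names, one per line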
@@ -313,7 +311,7 @@ def restart_vespa_container(suffix: str) -> None:
 
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
diff --git a/backend/tests/regression/answer_quality/run_eval_pipeline.py b/backend/tests/regression/answer_quality/run_eval_pipeline.py
index b9c2cadbd..602aedbe6 100644
--- a/backend/tests/regression/answer_quality/run_eval_pipeline.py
+++ b/backend/tests/regression/answer_quality/run_eval_pipeline.py
@@ -8,7 +8,7 @@ from tests.regression.answer_quality.cli_utils import cleanup_docker
 from tests.regression.answer_quality.cli_utils import manage_data_directories
 from tests.regression.answer_quality.cli_utils import set_env_variables
 from tests.regression.answer_quality.cli_utils import start_docker_compose
-from tests.regression.answer_quality.cli_utils import switch_to_branch
+from tests.regression.answer_quality.cli_utils import switch_to_commit
 from tests.regression.answer_quality.file_uploader import upload_test_files
 from tests.regression.answer_quality.run_qa import run_qa_test_and_save_results
 
@@ -36,8 +36,8 @@ def main() -> None:
         config.llm,
     )
     manage_data_directories(run_suffix, config.output_folder, config.use_cloud_gpu)
-    if config.branch:
-        switch_to_branch(config.branch)
+    if config.commit_sha:
+        switch_to_commit(config.commit_sha)
 
     start_docker_compose(run_suffix, config.launch_web_ui, config.use_cloud_gpu)
 
diff --git a/backend/tests/regression/answer_quality/run_qa.py b/backend/tests/regression/answer_quality/run_qa.py
index 39d4d0b95..b8a4d559a 100644
--- a/backend/tests/regression/answer_quality/run_qa.py
+++ b/backend/tests/regression/answer_quality/run_qa.py
@@ -1,6 +1,7 @@
 import json
 import multiprocessing
 import os
+import shutil
 import time
 
 import yaml
@@ -22,12 +23,12 @@ def _populate_results_file(output_folder_path: str, all_qa_output: list[dict]) -
         file.flush()
 
 
-def _update_metadata_file(test_output_folder: str, count: int) -> None:
+def _update_metadata_file(test_output_folder: str, invalid_answer_count: int) -> None:
     metadata_path = os.path.join(test_output_folder, METADATA_FILENAME)
     with open(metadata_path, "r", encoding="utf-8") as file:
         metadata = yaml.safe_load(file)
 
-    metadata["number_of_questions_asked"] = count
+    metadata["number_of_failed_questions"] = invalid_answer_count
 
     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
@@ -45,7 +46,7 @@ def _get_test_output_folder(config: dict) -> str:
     base_output_folder = os.path.expanduser(config["output_folder"])
     if config["run_suffix"]:
         base_output_folder = os.path.join(
-            base_output_folder, ("test" + config["run_suffix"]), "relari_output"
+            base_output_folder, ("test" + config["run_suffix"]), "evaluations_output"
         )
     else:
         base_output_folder = os.path.join(base_output_folder, "no_defined_suffix")
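With the rename above, per-run artifacts land under `<output_folder>/test<run_suffix>/evaluations_output/` instead of the old `relari_output` name. A sketch of the path logic with hypothetical config values (this mirrors the joined-path portion of `_get_test_output_folder`, without the numbered-run probing that follows it):

    import os

    # Hypothetical values; the real ones come from search_test_config.yaml.
    config = {"output_folder": "~/test_output", "run_suffix": "-1234-5678"}

    base = os.path.expanduser(config["output_folder"])
    if config["run_suffix"]:
        base = os.path.join(base, "test" + config["run_suffix"], "evaluations_output")
    else:
        base = os.path.join(base, "no_defined_suffix")
    print(base)  # e.g. /home/user/test_output/test-1234-5678/evaluations_output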
@@ -69,7 +70,9 @@ def _get_test_output_folder(config: dict) -> str:
 
 def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     test_output_folder = _get_test_output_folder(config)
-    questions = _read_questions_jsonl(config["questions_file"])
+    questions_file_path = config["questions_file"]
+
+    questions = _read_questions_jsonl(questions_file_path)
 
     metadata = {
         "commit_sha": get_current_commit_sha(),
@@ -91,6 +94,23 @@ def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
 
+    copied_questions_file_path = os.path.join(
+        test_output_folder, os.path.basename(questions_file_path)
+    )
+    shutil.copy2(questions_file_path, copied_questions_file_path)
+
+    zipped_files_path = config["zipped_documents_file"]
+    copied_zipped_documents_path = os.path.join(
+        test_output_folder, os.path.basename(zipped_files_path)
+    )
+    shutil.copy2(zipped_files_path, copied_zipped_documents_path)
+
+    zipped_files_folder = os.path.dirname(zipped_files_path)
+    jsonl_file_path = os.path.join(zipped_files_folder, "target_docs.jsonl")
+    if os.path.exists(jsonl_file_path):
+        copied_jsonl_path = os.path.join(test_output_folder, "target_docs.jsonl")
+        shutil.copy2(jsonl_file_path, copied_jsonl_path)
+
     return test_output_folder, questions
 
 
@@ -138,18 +158,23 @@ def _process_and_write_query_results(config: dict) -> None:
 
     _populate_results_file(test_output_folder, results)
 
-    valid_answer_count = 0
+    invalid_answer_count = 0
     for result in results:
-        if result.get("answer"):
-            valid_answer_count += 1
+        if not result.get("answer"):
+            invalid_answer_count += 1
 
-    _update_metadata_file(test_output_folder, valid_answer_count)
+    _update_metadata_file(test_output_folder, invalid_answer_count)
+
+    if invalid_answer_count:
+        print(f"Warning: {invalid_answer_count} questions failed!")
+        print("Consider restarting the Vespa container and rerunning the test")
 
     time_to_finish = time.time() - start_time
     minutes, seconds = divmod(int(time_to_finish), 60)
     print(
         f"Took {minutes:02d}:{seconds:02d} to ask and answer {len(results)} questions"
     )
+    print(f"Saved test results to folder: {test_output_folder}")
 
 
 def run_qa_test_and_save_results(run_suffix: str = "") -> None:
diff --git a/backend/tests/regression/answer_quality/search_test_config.yaml.template b/backend/tests/regression/answer_quality/search_test_config.yaml.template
index 47310a3c3..553b37f5e 100644
--- a/backend/tests/regression/answer_quality/search_test_config.yaml.template
+++ b/backend/tests/regression/answer_quality/search_test_config.yaml.template
@@ -10,8 +10,8 @@ zipped_documents_file: "~/sampledocs.zip"
 # Path to the YAML file containing sample questions
 questions_file: "~/sample_questions.yaml"
 
-# Git branch to use (null means use current branch as is)
-branch: null
+# Git commit SHA to use (null means use current code as is)
+commit_sha: null
 
 # Whether to remove Docker containers after the test
 clean_up_docker_containers: true
@@ -28,7 +28,8 @@ model_server_ip: "PUT_PUBLIC_CLOUD_IP_HERE"
 # Port of the model server (placeholder)
 model_server_port: "PUT_PUBLIC_CLOUD_PORT_HERE"
 
-# Suffix for existing test results (empty string means no suffix)
+# Suffix for existing test results (e.g. "-1234-5678")
+# An empty string means no suffix
 existing_test_suffix: ""
 
 # Limit on number of tests to run (null means no limit)
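One behavioral note on the api_utils.py changes at the top of this patch: the `retry` decorator re-invokes the wrapped function whenever it raises, so the `raise e` at the end of the except block is what actually schedules the next attempt; with tries=10 and delay=10, the API server gets roughly 100 seconds to come up before `check_if_query_ready` fails for good. A self-contained sketch of the same pattern, with a stand-in for the indexing-status request:

    import time

    from retry import retry  # the same `retry` package used in api_utils.py

    _start = time.monotonic()

    @retry(tries=10, delay=10)
    def check_ready() -> bool:
        # Stand-in for the indexing-status request: fail for the first ~30
        # seconds, letting the decorator absorb the early failures.
        if time.monotonic() - _start < 30:
            raise ConnectionError("API server is likely starting up")
        return True

    print(check_ready())  # succeeds on roughly the fourth attempt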