From e93de602c383cdcdd5cd9fc27b10ad3d983f7972 Mon Sep 17 00:00:00 2001
From: hagen-danswer
Date: Wed, 17 Jul 2024 14:56:24 -0700
Subject: [PATCH] Use SHA instead of branch and save more data (#1850)

---
 .../tests/regression/answer_quality/README.md |  8 ++--
 .../regression/answer_quality/api_utils.py    | 12 ++++--
 .../regression/answer_quality/cli_utils.py    | 20 ++++-----
 .../answer_quality/run_eval_pipeline.py       |  6 +--
 .../tests/regression/answer_quality/run_qa.py | 41 +++++++++++++++----
 .../search_test_config.yaml.template          |  7 ++--
 6 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/backend/tests/regression/answer_quality/README.md b/backend/tests/regression/answer_quality/README.md
index fa1c2ba44..d7f5b9ae0 100644
--- a/backend/tests/regression/answer_quality/README.md
+++ b/backend/tests/regression/answer_quality/README.md
@@ -53,8 +53,10 @@ Edit `search_test_config.yaml` to set:
   - The path to the zip file containing the files you'd like to test against
 - questions_file
   - The path to the yaml containing the questions you'd like to test with
-- branch
-  - Set the branch to null if you want it to just use the code as is
+- commit_sha
+  - Set this to the SHA of the commit you want to run the test against
+  - You must clear all local changes if you want to use this option
+  - Set this to null if you want it to just use the code as is
 - clean_up_docker_containers
   - Set this to true to automatically delete all docker containers, networks and volumes after the test
 - launch_web_ui
@@ -71,7 +73,7 @@ Edit `search_test_config.yaml` to set:
 - existing_test_suffix
   - Use this if you would like to relaunch a previous test instance
   - Input the suffix of the test you'd like to re-launch
-  - (E.g. to use the data from folder "test_1234_5678" put "_1234_5678")
+  - (E.g. to use the data from folder "test-1234-5678" put "-1234-5678")
   - No new files will automatically be uploaded
   - Leave empty to run a new test
 - limit
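A note on the commit_sha option documented above: `git checkout <sha>` refuses to switch when tracked files have local modifications, which is why the working tree must be clean. A minimal pre-flight check along these lines could surface that failure early (a hypothetical helper, not part of this patch):

    import subprocess

    def assert_clean_working_tree() -> None:
        # Hypothetical guard: `git status --porcelain` prints one line per
        # modified or untracked file; empty output means the checkout is safe.
        result = subprocess.run(
            ["git", "status", "--porcelain"],
            capture_output=True, text=True, check=True,
        )
        if result.stdout.strip():
            raise RuntimeError(
                "Local changes detected; commit, stash, or discard them "
                "before setting commit_sha in search_test_config.yaml."
            )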
diff --git a/backend/tests/regression/answer_quality/api_utils.py b/backend/tests/regression/answer_quality/api_utils.py
index 56529e9fe..0f06844cc 100644
--- a/backend/tests/regression/answer_quality/api_utils.py
+++ b/backend/tests/regression/answer_quality/api_utils.py
@@ -39,7 +39,7 @@ def _create_new_chat_session(run_suffix: str) -> int:
     raise RuntimeError(response_json)
 
 
-@retry(tries=15, delay=10, jitter=1)
+@retry(tries=10, delay=10)
 def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     filters = IndexFilters(
         source_type=None,
@@ -81,10 +81,16 @@ def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
     return simple_search_docs, answer
 
 
+@retry(tries=10, delay=10)
 def check_if_query_ready(run_suffix: str) -> bool:
     url = _api_url_builder(run_suffix, "/manage/admin/connector/indexing-status/")
-
-    indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    try:
+        indexing_status_dict = requests.get(url, headers=GENERAL_HEADERS).json()
+    except Exception as e:
+        print("Failed to check indexing status, API server is likely starting up:")
+        print(f"\t {str(e)}")
+        print("Trying again...")
+        raise e
 
     ongoing_index_attempts = False
     doc_count = 0
diff --git a/backend/tests/regression/answer_quality/cli_utils.py b/backend/tests/regression/answer_quality/cli_utils.py
index 47efaf76e..29564694a 100644
--- a/backend/tests/regression/answer_quality/cli_utils.py
+++ b/backend/tests/regression/answer_quality/cli_utils.py
@@ -60,11 +60,10 @@ def get_current_commit_sha() -> str:
     return sha
 
 
-def switch_to_branch(branch: str) -> None:
-    print(f"Switching to branch: {branch}...")
-    _run_command(f"git checkout {branch}")
-    _run_command("git pull")
-    print(f"Successfully switched to branch: {branch}")
+def switch_to_commit(commit_sha: str) -> None:
+    print(f"Switching to commit: {commit_sha}...")
+    _run_command(f"git checkout {commit_sha}")
+    print(f"Successfully switched to commit: {commit_sha}")
     print("Repository updated successfully.")
 
 
@@ -77,7 +76,7 @@ def get_docker_container_env_vars(suffix: str) -> dict:
     combined_env_vars = {}
     for container_type in ["background", "api_server"]:
         container_name = _run_command(
-            f"docker ps -a --format '{{{{.Names}}}}' | grep '{container_type}' | grep '{suffix}'"
+            f"docker ps -a --format '{{{{.Names}}}}' | awk '/{container_type}/ && /{suffix}/'"
         )[0].strip()
         if not container_name:
             raise RuntimeError(
@@ -93,7 +92,6 @@ def get_docker_container_env_vars(suffix: str) -> dict:
             key, value = env_var.split("=", 1)
             combined_env_vars[key] = value
 
-    print(f"Combined env variables: {combined_env_vars}")
     return combined_env_vars
 
 
@@ -117,8 +115,8 @@ def manage_data_directories(suffix: str, base_path: str, use_cloud_gpu: bool) ->
         os.makedirs(directory, exist_ok=True)
         os.environ[env_var] = directory
         print(f"Set {env_var} to: {directory}")
-    relari_output_path = os.path.join(target_path, "relari_output/")
-    os.makedirs(relari_output_path, exist_ok=True)
+    results_output_path = os.path.join(target_path, "evaluations_output/")
+    os.makedirs(results_output_path, exist_ok=True)
 
 
 def set_env_variables(
@@ -287,7 +285,7 @@ def is_vespa_container_healthy(suffix: str) -> bool:
 
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
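For context on the consolidation above: `grep vespa | grep {suffix}` and `awk '/vespa/ && /{suffix}/'` select the same lines (those matching both patterns), but awk does it in a single process. The single quotes around the awk program matter; without them the shell splits the pipeline at `&&` and tries to execute `/{suffix}/` as a command. A rough sketch of the quoting, with a made-up suffix:

    import subprocess

    suffix = "-1234-5678"  # hypothetical run suffix
    # The awk program is quoted so the shell passes '/vespa/ && /-1234-5678/'
    # to awk as one argument instead of splitting the pipeline at `&&`.
    cmd = f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
    stdout = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout
    print(stdout.strip())  # matching container names, one per line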
@@ -313,7 +311,7 @@ def restart_vespa_container(suffix: str) -> None:
 
     # Find the Vespa container
     stdout, _ = _run_command(
-        f"docker ps -a --format '{{{{.Names}}}}' | grep vespa | grep {suffix}"
+        f"docker ps -a --format '{{{{.Names}}}}' | awk '/vespa/ && /{suffix}/'"
     )
     container_name = stdout.strip()
diff --git a/backend/tests/regression/answer_quality/run_eval_pipeline.py b/backend/tests/regression/answer_quality/run_eval_pipeline.py
index b9c2cadbd..602aedbe6 100644
--- a/backend/tests/regression/answer_quality/run_eval_pipeline.py
+++ b/backend/tests/regression/answer_quality/run_eval_pipeline.py
@@ -8,7 +8,7 @@ from tests.regression.answer_quality.cli_utils import cleanup_docker
 from tests.regression.answer_quality.cli_utils import manage_data_directories
 from tests.regression.answer_quality.cli_utils import set_env_variables
 from tests.regression.answer_quality.cli_utils import start_docker_compose
-from tests.regression.answer_quality.cli_utils import switch_to_branch
+from tests.regression.answer_quality.cli_utils import switch_to_commit
 from tests.regression.answer_quality.file_uploader import upload_test_files
 from tests.regression.answer_quality.run_qa import run_qa_test_and_save_results
 
@@ -36,8 +36,8 @@ def main() -> None:
         config.llm,
     )
     manage_data_directories(run_suffix, config.output_folder, config.use_cloud_gpu)
-    if config.branch:
-        switch_to_branch(config.branch)
+    if config.commit_sha:
+        switch_to_commit(config.commit_sha)
 
     start_docker_compose(run_suffix, config.launch_web_ui, config.use_cloud_gpu)
 
diff --git a/backend/tests/regression/answer_quality/run_qa.py b/backend/tests/regression/answer_quality/run_qa.py
index 39d4d0b95..b8a4d559a 100644
--- a/backend/tests/regression/answer_quality/run_qa.py
+++ b/backend/tests/regression/answer_quality/run_qa.py
@@ -1,6 +1,7 @@
 import json
 import multiprocessing
 import os
+import shutil
 import time
 
 import yaml
@@ -22,12 +23,12 @@ def _populate_results_file(output_folder_path: str, all_qa_output: list[dict]) -
         file.flush()
 
 
-def _update_metadata_file(test_output_folder: str, count: int) -> None:
+def _update_metadata_file(test_output_folder: str, invalid_answer_count: int) -> None:
     metadata_path = os.path.join(test_output_folder, METADATA_FILENAME)
     with open(metadata_path, "r", encoding="utf-8") as file:
         metadata = yaml.safe_load(file)
 
-    metadata["number_of_questions_asked"] = count
+    metadata["number_of_failed_questions"] = invalid_answer_count
 
     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
@@ -45,7 +46,7 @@ def _get_test_output_folder(config: dict) -> str:
     base_output_folder = os.path.expanduser(config["output_folder"])
     if config["run_suffix"]:
         base_output_folder = os.path.join(
-            base_output_folder, ("test" + config["run_suffix"]), "relari_output"
+            base_output_folder, ("test" + config["run_suffix"]), "evaluations_output"
         )
     else:
         base_output_folder = os.path.join(base_output_folder, "no_defined_suffix")
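With the rename above, per-run artifacts land under `<output_folder>/test<run_suffix>/evaluations_output/` instead of the old `relari_output` name. A sketch of the path logic with hypothetical config values (this mirrors the joined-path portion of `_get_test_output_folder`, without the numbered-run probing that follows it):

    import os

    # Hypothetical values; the real ones come from search_test_config.yaml.
    config = {"output_folder": "~/test_output", "run_suffix": "-1234-5678"}

    base = os.path.expanduser(config["output_folder"])
    if config["run_suffix"]:
        base = os.path.join(base, "test" + config["run_suffix"], "evaluations_output")
    else:
        base = os.path.join(base, "no_defined_suffix")
    print(base)  # e.g. /home/user/test_output/test-1234-5678/evaluations_output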
@@ -69,7 +70,9 @@ def _get_test_output_folder(config: dict) -> str:
 
 def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     test_output_folder = _get_test_output_folder(config)
-    questions = _read_questions_jsonl(config["questions_file"])
+    questions_file_path = config["questions_file"]
+
+    questions = _read_questions_jsonl(questions_file_path)
 
     metadata = {
         "commit_sha": get_current_commit_sha(),
@@ -91,6 +94,23 @@ def _initialize_files(config: dict) -> tuple[str, list[dict]]:
     with open(metadata_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
 
+    copied_questions_file_path = os.path.join(
+        test_output_folder, os.path.basename(questions_file_path)
+    )
+    shutil.copy2(questions_file_path, copied_questions_file_path)
+
+    zipped_files_path = config["zipped_documents_file"]
+    copied_zipped_documents_path = os.path.join(
+        test_output_folder, os.path.basename(zipped_files_path)
+    )
+    shutil.copy2(zipped_files_path, copied_zipped_documents_path)
+
+    zipped_files_folder = os.path.dirname(zipped_files_path)
+    jsonl_file_path = os.path.join(zipped_files_folder, "target_docs.jsonl")
+    if os.path.exists(jsonl_file_path):
+        copied_jsonl_path = os.path.join(test_output_folder, "target_docs.jsonl")
+        shutil.copy2(jsonl_file_path, copied_jsonl_path)
+
     return test_output_folder, questions
 
 
@@ -138,18 +158,23 @@ def _process_and_write_query_results(config: dict) -> None:
 
     _populate_results_file(test_output_folder, results)
 
-    valid_answer_count = 0
+    invalid_answer_count = 0
     for result in results:
-        if result.get("answer"):
-            valid_answer_count += 1
+        if not result.get("answer"):
+            invalid_answer_count += 1
 
-    _update_metadata_file(test_output_folder, valid_answer_count)
+    _update_metadata_file(test_output_folder, invalid_answer_count)
+
+    if invalid_answer_count:
+        print(f"Warning: {invalid_answer_count} questions failed!")
+        print("Consider restarting the Vespa container and rerunning the test")
 
     time_to_finish = time.time() - start_time
     minutes, seconds = divmod(int(time_to_finish), 60)
     print(
         f"Took {minutes:02d}:{seconds:02d} to ask and answer {len(results)} questions"
     )
+    print(f"Saved test results to folder: {test_output_folder}")
 
 
 def run_qa_test_and_save_results(run_suffix: str = "") -> None:
diff --git a/backend/tests/regression/answer_quality/search_test_config.yaml.template b/backend/tests/regression/answer_quality/search_test_config.yaml.template
index 47310a3c3..553b37f5e 100644
--- a/backend/tests/regression/answer_quality/search_test_config.yaml.template
+++ b/backend/tests/regression/answer_quality/search_test_config.yaml.template
@@ -10,8 +10,8 @@ zipped_documents_file: "~/sampledocs.zip"
 # Path to the YAML file containing sample questions
 questions_file: "~/sample_questions.yaml"
 
-# Git branch to use (null means use current branch as is)
-branch: null
+# Git commit SHA to use (null means use current code as is)
+commit_sha: null
 
 # Whether to remove Docker containers after the test
 clean_up_docker_containers: true
@@ -28,7 +28,8 @@ model_server_ip: "PUT_PUBLIC_CLOUD_IP_HERE"
 # Port of the model server (placeholder)
 model_server_port: "PUT_PUBLIC_CLOUD_PORT_HERE"
 
-# Suffix for existing test results (empty string means no suffix)
+# Suffix for existing test results (e.g. "-1234-5678")
+# An empty string means no suffix
 existing_test_suffix: ""
 
 # Limit on number of tests to run (null means no limit)
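One behavioral note on the api_utils.py changes at the top of this patch: the `retry` decorator re-invokes the wrapped function whenever it raises, so the `raise e` at the end of the except block is what actually schedules the next attempt; with tries=10 and delay=10, the API server gets roughly 100 seconds to come up before `check_if_query_ready` fails for good. A self-contained sketch of the same pattern, with a stand-in for the indexing-status request:

    import time

    from retry import retry  # the same `retry` package used in api_utils.py

    _start = time.monotonic()

    @retry(tries=10, delay=10)
    def check_ready() -> bool:
        # Stand-in for the indexing-status request: fail for the first ~30
        # seconds, letting the decorator absorb the early failures.
        if time.monotonic() - _start < 30:
            raise ConnectionError("API server is likely starting up")
        return True

    print(check_ready())  # succeeds on roughly the fourth attempt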