Added search quality testing pipeline (#1774)

hagen-danswer
2024-07-06 11:51:50 -07:00
committed by GitHub
parent de4d8e9a65
commit ac14369716
11 changed files with 1123 additions and 115 deletions

View File

@@ -0,0 +1,68 @@
# Search Quality Test Script
This Python script automates the process of running search quality tests for a backend system.
## Features
- Loads configuration from a YAML file
- Sets up Docker environment
- Manages environment variables
- Switches to specified Git branch
- Uploads test documents
- Runs search quality tests using Relari
- Cleans up Docker containers (optional)
## Usage
1. Ensure you have the required dependencies installed.
2. Configure the `search_test_config.yaml` file with your settings.
3. Navigate to the answer_quality folder:
```
cd danswer/backend/tests/regression/answer_quality
```
4. Run the script:
```
python search_quality_test.py
```
## Configuration
Edit `search_test_config.yaml` to set:
- `output_folder`
  - The folder where a subfolder for each test run will be created
  - Each test subfolder contains the Postgres/Vespa data as well as the results for that test
- `zipped_documents_file`
  - The path to the zip file containing the documents you'd like to test against
- `questions_file`
  - The path to the file containing the questions you'd like to test with (see the example line after this list)
- `branch`
  - The Git branch to check out before testing
  - Set to null to use the code as is
- `clean_up_docker_containers`
  - Set to true to automatically delete all Docker containers, networks, and volumes after the test
- `launch_web_ui`
  - Set to true if you want to use the UI during/after the testing process
- `use_cloud_gpu`
  - Set to true to use a remote GPU model server instead of launching the model servers locally
- `model_server_ip`
  - The IP of the remote model server
  - Only needs to be set if `use_cloud_gpu` is true
- `model_server_port`
  - The port of the remote model server
  - Only needs to be set if `use_cloud_gpu` is true
- `existing_test_suffix`
  - Use this if you would like to relaunch a previous test instance
  - Input the suffix of the test you'd like to relaunch
    (e.g. to reuse the data from folder "test_1234_5678", put "_1234_5678")
  - No new files will be uploaded automatically
  - Leave empty to run a new test
- `limit`
  - The maximum number of questions to ask against the dataset
  - Set to null for no limit
- `llm`
  - Fill this out according to the normal LLM seeding
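The questions file is parsed by `relari.py` as JSON Lines: one JSON object per line, with at least a `uid` and a `question` field. A minimal sketch of a single line, with made-up values:
```
{"uid": "q_001", "question": "How do I add a new connector?"}
```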
The Docker daemon must be running for any of this to work.
Each script can also be run individually, for example to upload additional documents or to re-run the questions against an existing test instance (see the sketch below).
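A plausible sequence (assuming `existing_test_suffix` in `search_test_config.yaml` points at an already-running test) for uploading more documents and re-running the questions is:
```
cd danswer/backend/tests/regression/answer_quality
python file_uploader.py
python relari.py
```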

View File

@@ -0,0 +1,220 @@
import requests
from retry import retry
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MessageType
from danswer.connectors.models import InputType
from danswer.db.enums import IndexingStatus
from danswer.one_shot_answer.models import DirectQARequest
from danswer.one_shot_answer.models import ThreadMessage
from danswer.search.models import IndexFilters
from danswer.search.models import OptionalSearchSetting
from danswer.search.models import RetrievalDetails
from danswer.server.documents.models import ConnectorBase
from tests.regression.answer_quality.cli_utils import (
get_api_server_host_port,
)
def _api_url_builder(run_suffix: str, api_path: str) -> str:
return f"http://localhost:{get_api_server_host_port(run_suffix)}" + api_path
@retry(tries=5, delay=2, backoff=2)
def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
filters = IndexFilters(
source_type=None,
document_set=None,
time_cutoff=None,
tags=None,
access_control_list=None,
)
messages = [ThreadMessage(message=query, sender=None, role=MessageType.USER)]
new_message_request = DirectQARequest(
messages=messages,
prompt_id=0,
persona_id=0,
retrieval_options=RetrievalDetails(
run_search=OptionalSearchSetting.ALWAYS,
real_time=True,
filters=filters,
enable_auto_detect_filters=False,
),
chain_of_thought=False,
return_contexts=True,
)
url = _api_url_builder(run_suffix, "/query/answer-with-quote/")
headers = {
"Content-Type": "application/json",
}
body = new_message_request.dict()
body["user"] = None
try:
response_json = requests.post(url, headers=headers, json=body).json()
content_list = [
context.get("content", "")
for context in response_json.get("contexts", {}).get("contexts", [])
]
answer = response_json.get("answer")
except Exception as e:
print("Failed to answer the questions, trying again")
print(f"error: {str(e)}")
raise e
print("\nquery: ", query)
print("answer: ", answer)
print("content_list: ", content_list)
return content_list, answer
def check_if_query_ready(run_suffix: str) -> bool:
url = _api_url_builder(run_suffix, "/manage/admin/connector/indexing-status/")
headers = {
"Content-Type": "application/json",
}
indexing_status_dict = requests.get(url, headers=headers).json()
ongoing_index_attempts = False
doc_count = 0
for index_attempt in indexing_status_dict:
status = index_attempt["last_status"]
if status == IndexingStatus.IN_PROGRESS or status == IndexingStatus.NOT_STARTED:
ongoing_index_attempts = True
doc_count += index_attempt["docs_indexed"]
if not doc_count:
print("No docs indexed, waiting for indexing to start")
elif ongoing_index_attempts:
print(
f"{doc_count} docs indexed but waiting for ongoing indexing jobs to finish..."
)
return doc_count > 0 and not ongoing_index_attempts
def run_cc_once(run_suffix: str, connector_id: int, credential_id: int) -> None:
url = _api_url_builder(run_suffix, "/manage/admin/connector/run-once/")
headers = {
"Content-Type": "application/json",
}
body = {
"connector_id": connector_id,
"credential_ids": [credential_id],
"from_beginning": True,
}
print("body:", body)
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
else:
print("Failed status_code:", response.status_code)
print("Failed text:", response.text)
def create_cc_pair(run_suffix: str, connector_id: int, credential_id: int) -> None:
url = _api_url_builder(
run_suffix, f"/manage/connector/{connector_id}/credential/{credential_id}"
)
headers = {
"Content-Type": "application/json",
}
body = {"name": "zip_folder_contents", "is_public": True}
print("body:", body)
response = requests.put(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
else:
print("Failed status_code:", response.status_code)
print("Failed text:", response.text)
def _get_existing_connector_names(run_suffix: str) -> list[str]:
url = _api_url_builder(run_suffix, "/manage/connector")
headers = {
"Content-Type": "application/json",
}
body = {
"credential_json": {},
"admin_public": True,
}
response = requests.get(url, headers=headers, json=body)
if response.status_code == 200:
connectors = response.json()
return [connector["name"] for connector in connectors]
else:
raise RuntimeError(response.__dict__)
def create_connector(run_suffix: str, file_paths: list[str]) -> int:
url = _api_url_builder(run_suffix, "/manage/admin/connector")
headers = {
"Content-Type": "application/json",
}
connector_name = base_connector_name = "search_eval_connector"
existing_connector_names = _get_existing_connector_names(run_suffix)
count = 1
while connector_name in existing_connector_names:
connector_name = base_connector_name + "_" + str(count)
count += 1
connector = ConnectorBase(
name=connector_name,
source=DocumentSource.FILE,
input_type=InputType.LOAD_STATE,
connector_specific_config={"file_locations": file_paths},
refresh_freq=None,
prune_freq=None,
disabled=False,
)
body = connector.dict()
print("body:", body)
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
return response.json()["id"]
else:
raise RuntimeError(response.__dict__)
def create_credential(run_suffix: str) -> int:
url = _api_url_builder(run_suffix, "/manage/credential")
headers = {
"Content-Type": "application/json",
}
body = {
"credential_json": {},
"admin_public": True,
}
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("credential created successfully:", response.json())
return response.json()["id"]
else:
raise RuntimeError(response.__dict__)
@retry(tries=10, delay=2, backoff=2)
def upload_file(run_suffix: str, zip_file_path: str) -> list[str]:
    url = _api_url_builder(run_suffix, "/manage/admin/connector/file/upload")
    try:
        # Use a context manager so the zip file handle is closed even if the request fails
        with open(zip_file_path, "rb") as zip_file:
            response = requests.post(url, files=[("files", zip_file)])
        response.raise_for_status()  # Raises an HTTPError for bad responses
        print("file uploaded successfully:", response.json())
        return response.json()["file_paths"]
    except Exception as e:
        print("File upload failed, waiting for API server to come up and trying again")
        raise e

View File

@@ -0,0 +1,203 @@
import json
import os
import subprocess
from retry import retry
def _run_command(command: str) -> tuple[str, str]:
process = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
if process.returncode != 0:
raise RuntimeError(f"Command failed with error: {stderr.decode()}")
return stdout.decode(), stderr.decode()
def get_current_commit_sha() -> str:
print("Getting current commit SHA...")
stdout, _ = _run_command("git rev-parse HEAD")
sha = stdout.strip()
print(f"Current commit SHA: {sha}")
return sha
def switch_to_branch(branch: str) -> None:
print(f"Switching to branch: {branch}...")
_run_command(f"git checkout {branch}")
_run_command("git pull")
print(f"Successfully switched to branch: {branch}")
print("Repository updated successfully.")
def manage_data_directories(suffix: str, base_path: str, use_cloud_gpu: bool) -> str:
    # Expand a leading "~" in the configured base path
target_path = os.path.join(os.path.expanduser(base_path), f"test{suffix}")
directories = {
"DANSWER_POSTGRES_DATA_DIR": os.path.join(target_path, "postgres/"),
"DANSWER_VESPA_DATA_DIR": os.path.join(target_path, "vespa/"),
}
if not use_cloud_gpu:
directories["DANSWER_INDEX_MODEL_CACHE_DIR"] = os.path.join(
target_path, "index_model_cache/"
)
directories["DANSWER_INFERENCE_MODEL_CACHE_DIR"] = os.path.join(
target_path, "inference_model_cache/"
)
# Create directories if they don't exist
for env_var, directory in directories.items():
os.makedirs(directory, exist_ok=True)
os.environ[env_var] = directory
print(f"Set {env_var} to: {directory}")
relari_output_path = os.path.join(target_path, "relari_output/")
os.makedirs(relari_output_path, exist_ok=True)
return relari_output_path
def set_env_variables(
remote_server_ip: str,
remote_server_port: str,
use_cloud_gpu: bool,
llm_config: dict,
) -> None:
env_vars: dict = {}
env_vars["ENV_SEED_CONFIGURATION"] = json.dumps({"llms": [llm_config]})
env_vars["ENABLE_PAID_ENTERPRISE_EDITION_FEATURES"] = "true"
if use_cloud_gpu:
env_vars["MODEL_SERVER_HOST"] = remote_server_ip
env_vars["MODEL_SERVER_PORT"] = remote_server_port
for env_var_name, env_var in env_vars.items():
os.environ[env_var_name] = env_var
print(f"Set {env_var_name} to: {env_var}")
def start_docker_compose(
run_suffix: str, launch_web_ui: bool, use_cloud_gpu: bool
) -> None:
print("Starting Docker Compose...")
os.chdir(os.path.expanduser("~/danswer/deployment/docker_compose"))
command = f"docker compose -f docker-compose.search-testing.yml -p danswer-stack{run_suffix} up -d"
command += " --build"
command += " --pull always"
command += " --force-recreate"
if not launch_web_ui:
command += " --scale web_server=0"
command += " --scale nginx=0"
if use_cloud_gpu:
command += " --scale indexing_model_server=0"
command += " --scale inference_model_server=0"
print("Docker Command:\n", command)
_run_command(command)
print("The Docker has been Composed :)")
def cleanup_docker(run_suffix: str) -> None:
print(
f"Deleting Docker containers, volumes, and networks for project suffix: {run_suffix}"
)
stdout, _ = _run_command("docker ps -a --format '{{json .}}'")
containers = [json.loads(line) for line in stdout.splitlines()]
project_name = f"danswer-stack{run_suffix}"
containers_to_delete = [
c for c in containers if c["Names"].startswith(project_name)
]
if not containers_to_delete:
print(f"No containers found for project: {project_name}")
else:
container_ids = " ".join([c["ID"] for c in containers_to_delete])
_run_command(f"docker rm -f {container_ids}")
print(
f"Successfully deleted {len(containers_to_delete)} containers for project: {project_name}"
)
stdout, _ = _run_command("docker volume ls --format '{{.Name}}'")
volumes = stdout.splitlines()
volumes_to_delete = [v for v in volumes if v.startswith(project_name)]
if not volumes_to_delete:
print(f"No volumes found for project: {project_name}")
return
# Delete filtered volumes
volume_names = " ".join(volumes_to_delete)
_run_command(f"docker volume rm {volume_names}")
print(
f"Successfully deleted {len(volumes_to_delete)} volumes for project: {project_name}"
)
stdout, _ = _run_command("docker network ls --format '{{.Name}}'")
networks = stdout.splitlines()
networks_to_delete = [n for n in networks if run_suffix in n]
if not networks_to_delete:
print(f"No networks found containing suffix: {run_suffix}")
else:
network_names = " ".join(networks_to_delete)
_run_command(f"docker network rm {network_names}")
print(
f"Successfully deleted {len(networks_to_delete)} networks containing suffix: {run_suffix}"
)
@retry(tries=5, delay=5, backoff=2)
def get_api_server_host_port(suffix: str) -> str:
"""
This pulls all containers with the provided suffix
It then grabs the JSON specific container with a name containing "api_server"
It then grabs the port info from the JSON and strips out the relevent data
"""
container_name = "api_server"
stdout, _ = _run_command("docker ps -a --format '{{json .}}'")
containers = [json.loads(line) for line in stdout.splitlines()]
server_jsons = []
for container in containers:
if container_name in container["Names"] and suffix in container["Names"]:
server_jsons.append(container)
if not server_jsons:
raise RuntimeError(
f"No container found containing: {container_name} and {suffix}"
)
elif len(server_jsons) > 1:
raise RuntimeError(
f"Too many containers matching {container_name} found, please indicate a suffix"
)
server_json = server_jsons[0]
# This is in case the api_server has multiple ports
client_port = "8080"
ports = server_json.get("Ports", "")
port_infos = ports.split(",") if ports else []
port_dict = {}
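    # Illustrative example (not taken from the source): a "Ports" value such as
    # "0.0.0.0:32768->8080/tcp" is split on ":" and "->" below, yielding
    # port_dict == {"8080/tcp": "32768"}, i.e. container port -> published host port.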
for port_info in port_infos:
port_arr = port_info.split(":")[-1].split("->") if port_info else []
if len(port_arr) == 2:
port_dict[port_arr[1]] = port_arr[0]
# Find the host port where client_port is in the key
matching_ports = [value for key, value in port_dict.items() if client_port in key]
if len(matching_ports) > 1:
raise RuntimeError(f"Too many ports matching {client_port} found")
if not matching_ports:
raise RuntimeError(
f"No port found containing: {client_port} for container: {container_name} and suffix: {suffix}"
)
return matching_ports[0]

View File

@@ -0,0 +1,31 @@
import os
from types import SimpleNamespace
import yaml
from tests.regression.answer_quality.api_utils import create_cc_pair
from tests.regression.answer_quality.api_utils import create_connector
from tests.regression.answer_quality.api_utils import create_credential
from tests.regression.answer_quality.api_utils import run_cc_once
from tests.regression.answer_quality.api_utils import upload_file
def upload_test_files(zip_file_path: str, run_suffix: str) -> None:
print("zip:", zip_file_path)
file_paths = upload_file(run_suffix, zip_file_path)
conn_id = create_connector(run_suffix, file_paths)
cred_id = create_credential(run_suffix)
create_cc_pair(run_suffix, conn_id, cred_id)
run_cc_once(run_suffix, conn_id, cred_id)
if __name__ == "__main__":
current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_dir, "search_test_config.yaml")
with open(config_path, "r") as file:
config = SimpleNamespace(**yaml.safe_load(file))
file_location = config.zipped_documents_file
run_suffix = config.existing_test_suffix
upload_test_files(file_location, run_suffix)

View File

@@ -1,138 +1,108 @@
-import argparse
-import json
-
-from sqlalchemy.orm import Session
-
-from danswer.configs.constants import MessageType
-from danswer.db.engine import get_sqlalchemy_engine
-from danswer.one_shot_answer.answer_question import get_search_answer
-from danswer.one_shot_answer.models import DirectQARequest
-from danswer.one_shot_answer.models import OneShotQAResponse
-from danswer.one_shot_answer.models import ThreadMessage
-from danswer.search.models import IndexFilters
-from danswer.search.models import OptionalSearchSetting
-from danswer.search.models import RetrievalDetails
-
-
-def get_answer_for_question(query: str, db_session: Session) -> OneShotQAResponse:
-    filters = IndexFilters(
-        source_type=None,
-        document_set=None,
-        time_cutoff=None,
-        tags=None,
-        access_control_list=None,
-    )
-
-    messages = [ThreadMessage(message=query, sender=None, role=MessageType.USER)]
-
-    new_message_request = DirectQARequest(
-        messages=messages,
-        prompt_id=0,
-        persona_id=0,
-        retrieval_options=RetrievalDetails(
-            run_search=OptionalSearchSetting.ALWAYS,
-            real_time=True,
-            filters=filters,
-            enable_auto_detect_filters=False,
-        ),
-        chain_of_thought=False,
-        return_contexts=True,
-    )
-
-    answer = get_search_answer(
-        query_req=new_message_request,
-        user=None,
-        max_document_tokens=None,
-        max_history_tokens=None,
-        db_session=db_session,
-        answer_generation_timeout=100,
-        enable_reflexion=False,
-        bypass_acl=True,
-    )
-
-    return answer
-
-
-def read_questions(questions_file_path: str) -> list[dict]:
-    samples = []
-    with open(questions_file_path, "r", encoding="utf-8") as file:
-        for line in file:
-            sample = json.loads(line.strip())
-            samples.append(sample)
-    return samples
-
-
-def get_relari_outputs(samples: list[dict]) -> list[dict]:
-    relari_outputs = []
-    with Session(get_sqlalchemy_engine(), expire_on_commit=False) as db_session:
-        for sample in samples:
-            answer = get_answer_for_question(
-                query=sample["question"], db_session=db_session
-            )
-            assert answer.contexts
-
-            relari_outputs.append(
-                {
-                    "label": sample["uid"],
-                    "question": sample["question"],
-                    "answer": answer.answer,
-                    "retrieved_context": [
-                        context.content for context in answer.contexts.contexts
-                    ],
-                }
-            )
-
-    return relari_outputs
-
-
-def write_output_file(relari_outputs: list[dict], output_file: str) -> None:
-    with open(output_file, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-
-
-def main(questions_file: str, output_file: str, limit: int | None = None) -> None:
-    samples = read_questions(questions_file)
-
-    if limit is not None:
-        samples = samples[:limit]
-
-    # Use to be in this format but has since changed
-    # response_dict = {
-    #     "question": sample["question"],
-    #     "retrieved_contexts": [
-    #         context.content for context in answer.contexts.contexts
-    #     ],
-    #     "ground_truth_contexts": sample["ground_truth_contexts"],
-    #     "answer": answer.answer,
-    #     "ground_truths": sample["ground_truths"],
-    # }
-    relari_outputs = get_relari_outputs(samples=samples)
-
-    write_output_file(relari_outputs, output_file)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--questions_file",
-        type=str,
-        help="Path to the Relari questions file.",
-        default="./tests/regression/answer_quality/combined_golden_dataset.jsonl",
-    )
-    parser.add_argument(
-        "--output_file",
-        type=str,
-        help="Path to the output results file.",
-        default="./tests/regression/answer_quality/relari_results.txt",
-    )
-    parser.add_argument(
-        "--limit",
-        type=int,
-        default=None,
-        help="Limit the number of examples to process.",
-    )
-    args = parser.parse_args()
-    main(args.questions_file, args.output_file, args.limit)
+import json
+import os
+import time
+from types import SimpleNamespace
+
+import yaml
+
+from tests.regression.answer_quality.api_utils import check_if_query_ready
+from tests.regression.answer_quality.api_utils import get_answer_from_query
+from tests.regression.answer_quality.cli_utils import get_current_commit_sha
+
+
+def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+    relari_outputs = []
+    while not check_if_query_ready(run_suffix):
+        time.sleep(5)
+
+    for sample in samples:
+        retrieved_context, answer = get_answer_from_query(
+            query=sample["question"],
+            run_suffix=run_suffix,
+        )
+
+        relari_outputs.append(
+            {
+                "label": sample["uid"],
+                "question": sample["question"],
+                "answer": answer,
+                "retrieved_context": retrieved_context,
+            }
+        )
+
+    return relari_outputs
+
+
+def _write_output_file(
+    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
+) -> None:
+    metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}
+
+    counter = 1
+    output_file_path = os.path.join(output_folder_path, "results.txt")
+    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
+    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
+        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
+        metadata_file_path = os.path.join(
+            output_folder_path, f"run_metadata_{counter}.txt"
+        )
+        counter += 1
+    print("saving question results to:", output_file_path)
+    print("saving metadata to:", metadata_file_path)
+
+    with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
+        yaml.dump(metadata, yaml_file)
+
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for output in relari_outputs:
+            file.write(json.dumps(output) + "\n")
+            file.flush()
+
+
+def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
+    questions = []
+    with open(questions_file_path, "r") as file:
+        for line in file:
+            json_obj = json.loads(line)
+            questions.append(json_obj)
+    return questions
+
+
+def answer_relari_questions(
+    questions_file_path: str,
+    results_folder_path: str,
+    run_suffix: str,
+    limit: int | None = None,
+) -> None:
+    samples = _read_questions_jsonl(questions_file_path)
+
+    if limit is not None:
+        samples = samples[:limit]
+
+    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+
+    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+
+
+def main() -> None:
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    config_path = os.path.join(current_dir, "search_test_config.yaml")
+    with open(config_path, "r") as file:
+        config = SimpleNamespace(**yaml.safe_load(file))
+
+    current_output_folder = os.path.expanduser(config.output_folder)
+    if config.existing_test_suffix:
+        current_output_folder = os.path.join(
+            current_output_folder, "test" + config.existing_test_suffix, "relari_output"
+        )
+    else:
+        current_output_folder = os.path.join(current_output_folder, "no_defined_suffix")
+
+    answer_relari_questions(
+        config.questions_file,
+        current_output_folder,
+        config.existing_test_suffix,
+        config.limit,
+    )
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -0,0 +1,58 @@
import os
from datetime import datetime
from types import SimpleNamespace
import yaml
from tests.regression.answer_quality.cli_utils import cleanup_docker
from tests.regression.answer_quality.cli_utils import manage_data_directories
from tests.regression.answer_quality.cli_utils import set_env_variables
from tests.regression.answer_quality.cli_utils import start_docker_compose
from tests.regression.answer_quality.cli_utils import switch_to_branch
from tests.regression.answer_quality.file_uploader import upload_test_files
from tests.regression.answer_quality.relari import answer_relari_questions
def load_config(config_filename: str) -> SimpleNamespace:
current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_dir, config_filename)
with open(config_path, "r") as file:
return SimpleNamespace(**yaml.safe_load(file))
def main() -> None:
config = load_config("search_test_config.yaml")
if config.existing_test_suffix:
run_suffix = config.existing_test_suffix
print("launching danswer with existing data suffix:", run_suffix)
else:
run_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
print("run_suffix:", run_suffix)
set_env_variables(
config.model_server_ip,
config.model_server_port,
config.use_cloud_gpu,
config.llm,
)
relari_output_folder_path = manage_data_directories(
run_suffix, config.output_folder, config.use_cloud_gpu
)
if config.branch:
switch_to_branch(config.branch)
start_docker_compose(run_suffix, config.launch_web_ui, config.use_cloud_gpu)
if not config.existing_test_suffix:
upload_test_files(config.zipped_documents_file, run_suffix)
answer_relari_questions(
config.questions_file, relari_output_folder_path, run_suffix, config.limit
)
if config.clean_up_docker_containers:
cleanup_docker(run_suffix)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# Directory where test results will be saved
output_folder: "~/danswer_test_results"
# Path to the zip file containing sample documents
zipped_documents_file: "~/sampledocs.zip"
# Path to the file containing sample questions
questions_file: "~/sample_questions.yaml"
# Git branch to use (null means use current branch as is)
branch: null
# Whether to remove Docker containers after the test
clean_up_docker_containers: true
# Whether to launch a web UI for the test
launch_web_ui: false
# Whether to use a cloud GPU for processing
use_cloud_gpu: false
# IP address of the model server (placeholder)
model_server_ip: "PUT_PUBLIC_CLOUD_IP_HERE"
# Port of the model server (placeholder)
model_server_port: "PUT_PUBLIC_CLOUD_PORT_HERE"
# Suffix of an existing test run to reuse (empty string means run a new test)
existing_test_suffix: ""
# Limit on the number of questions to ask (null means no limit)
limit: null
# LLM configuration
llm:
# Name of the LLM
name: "llm_name"
# Provider of the LLM (e.g., OpenAI)
provider: "openai"
# API key
api_key: "PUT_API_KEY_HERE"
# Default model name to use
default_model_name: "gpt-4o"
# List of model names to use for testing
model_names: ["gpt-4o"]