Added search quality testing pipeline (#1774)

hagen-danswer
2024-07-06 11:51:50 -07:00
committed by GitHub
parent de4d8e9a65
commit ac14369716
11 changed files with 1123 additions and 115 deletions

View File

@@ -0,0 +1,68 @@
# Search Quality Test Script
This Python script automates the process of running search quality tests for a backend system.
## Features
- Loads configuration from a YAML file
- Sets up Docker environment
- Manages environment variables
- Switches to specified Git branch
- Uploads test documents
- Runs search quality tests using Relari
- Cleans up Docker containers (optional)
## Usage
1. Ensure you have the required dependencies installed.
2. Configure the `search_test_config.yaml` file with your settings.
3. Navigate to the answer_quality folder:
```
cd danswer/backend/tests/regression/answer_quality
```
4. Run the script:
```
python search_quality_test.py
```
## Configuration
Edit `search_test_config.yaml` to set:
- `output_folder`
  - The folder where a subfolder for each test run will be created
  - Each test subfolder contains the Postgres/Vespa data as well as the results for that test
- `zipped_documents_file`
  - The path to the zip file containing the documents you'd like to test against
- `questions_file`
  - The path to the file containing the questions you'd like to test with (see the example line after this list)
- `branch`
  - The Git branch to check out before testing
  - Set to null to use the code as is
- `clean_up_docker_containers`
  - Set to true to automatically delete all Docker containers, networks, and volumes after the test
- `launch_web_ui`
  - Set to true if you want to use the UI during/after the testing process
- `use_cloud_gpu`
  - Set to true to use a remote GPU model server instead of launching the model servers locally
- `model_server_ip`
  - The IP of the remote model server
  - Only needs to be set if `use_cloud_gpu` is true
- `model_server_port`
  - The port of the remote model server
  - Only needs to be set if `use_cloud_gpu` is true
- `existing_test_suffix`
  - Use this if you would like to relaunch a previous test instance
  - Input the suffix of the test you'd like to relaunch
    (e.g. to reuse the data from folder "test_1234_5678", put "_1234_5678")
  - No new files will be uploaded automatically
  - Leave empty to run a new test
- `limit`
  - The maximum number of questions to ask against the dataset
  - Set to null for no limit
- `llm`
  - Fill this out according to the normal LLM seeding
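The questions file is parsed by `relari.py` as JSON Lines: one JSON object per line, with at least a `uid` and a `question` field. A minimal sketch of a single line, with made-up values:
```
{"uid": "q_001", "question": "How do I add a new connector?"}
```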
The Docker daemon must be running for any of this to work.
Each script can also be run individually, for example to upload additional documents or to re-run the questions against an existing test instance (see the sketch below).
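A plausible sequence (assuming `existing_test_suffix` in `search_test_config.yaml` points at an already-running test) for uploading more documents and re-running the questions is:
```
cd danswer/backend/tests/regression/answer_quality
python file_uploader.py
python relari.py
```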

View File

@@ -0,0 +1,220 @@
import requests
from retry import retry
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MessageType
from danswer.connectors.models import InputType
from danswer.db.enums import IndexingStatus
from danswer.one_shot_answer.models import DirectQARequest
from danswer.one_shot_answer.models import ThreadMessage
from danswer.search.models import IndexFilters
from danswer.search.models import OptionalSearchSetting
from danswer.search.models import RetrievalDetails
from danswer.server.documents.models import ConnectorBase
from tests.regression.answer_quality.cli_utils import (
get_api_server_host_port,
)
def _api_url_builder(run_suffix: str, api_path: str) -> str:
return f"http://localhost:{get_api_server_host_port(run_suffix)}" + api_path
@retry(tries=5, delay=2, backoff=2)
def get_answer_from_query(query: str, run_suffix: str) -> tuple[list[str], str]:
filters = IndexFilters(
source_type=None,
document_set=None,
time_cutoff=None,
tags=None,
access_control_list=None,
)
messages = [ThreadMessage(message=query, sender=None, role=MessageType.USER)]
new_message_request = DirectQARequest(
messages=messages,
prompt_id=0,
persona_id=0,
retrieval_options=RetrievalDetails(
run_search=OptionalSearchSetting.ALWAYS,
real_time=True,
filters=filters,
enable_auto_detect_filters=False,
),
chain_of_thought=False,
return_contexts=True,
)
url = _api_url_builder(run_suffix, "/query/answer-with-quote/")
headers = {
"Content-Type": "application/json",
}
body = new_message_request.dict()
body["user"] = None
try:
response_json = requests.post(url, headers=headers, json=body).json()
content_list = [
context.get("content", "")
for context in response_json.get("contexts", {}).get("contexts", [])
]
answer = response_json.get("answer")
except Exception as e:
print("Failed to answer the questions, trying again")
print(f"error: {str(e)}")
raise e
print("\nquery: ", query)
print("answer: ", answer)
print("content_list: ", content_list)
return content_list, answer
def check_if_query_ready(run_suffix: str) -> bool:
url = _api_url_builder(run_suffix, "/manage/admin/connector/indexing-status/")
headers = {
"Content-Type": "application/json",
}
indexing_status_dict = requests.get(url, headers=headers).json()
ongoing_index_attempts = False
doc_count = 0
for index_attempt in indexing_status_dict:
status = index_attempt["last_status"]
if status == IndexingStatus.IN_PROGRESS or status == IndexingStatus.NOT_STARTED:
ongoing_index_attempts = True
doc_count += index_attempt["docs_indexed"]
if not doc_count:
print("No docs indexed, waiting for indexing to start")
elif ongoing_index_attempts:
print(
f"{doc_count} docs indexed but waiting for ongoing indexing jobs to finish..."
)
return doc_count > 0 and not ongoing_index_attempts
def run_cc_once(run_suffix: str, connector_id: int, credential_id: int) -> None:
url = _api_url_builder(run_suffix, "/manage/admin/connector/run-once/")
headers = {
"Content-Type": "application/json",
}
body = {
"connector_id": connector_id,
"credential_ids": [credential_id],
"from_beginning": True,
}
print("body:", body)
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
else:
print("Failed status_code:", response.status_code)
print("Failed text:", response.text)
def create_cc_pair(run_suffix: str, connector_id: int, credential_id: int) -> None:
url = _api_url_builder(
run_suffix, f"/manage/connector/{connector_id}/credential/{credential_id}"
)
headers = {
"Content-Type": "application/json",
}
body = {"name": "zip_folder_contents", "is_public": True}
print("body:", body)
response = requests.put(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
else:
print("Failed status_code:", response.status_code)
print("Failed text:", response.text)
def _get_existing_connector_names(run_suffix: str) -> list[str]:
url = _api_url_builder(run_suffix, "/manage/connector")
headers = {
"Content-Type": "application/json",
}
body = {
"credential_json": {},
"admin_public": True,
}
response = requests.get(url, headers=headers, json=body)
if response.status_code == 200:
connectors = response.json()
return [connector["name"] for connector in connectors]
else:
raise RuntimeError(response.__dict__)
def create_connector(run_suffix: str, file_paths: list[str]) -> int:
url = _api_url_builder(run_suffix, "/manage/admin/connector")
headers = {
"Content-Type": "application/json",
}
connector_name = base_connector_name = "search_eval_connector"
existing_connector_names = _get_existing_connector_names(run_suffix)
count = 1
while connector_name in existing_connector_names:
connector_name = base_connector_name + "_" + str(count)
count += 1
connector = ConnectorBase(
name=connector_name,
source=DocumentSource.FILE,
input_type=InputType.LOAD_STATE,
connector_specific_config={"file_locations": file_paths},
refresh_freq=None,
prune_freq=None,
disabled=False,
)
body = connector.dict()
print("body:", body)
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("Connector created successfully:", response.json())
return response.json()["id"]
else:
raise RuntimeError(response.__dict__)
def create_credential(run_suffix: str) -> int:
url = _api_url_builder(run_suffix, "/manage/credential")
headers = {
"Content-Type": "application/json",
}
body = {
"credential_json": {},
"admin_public": True,
}
response = requests.post(url, headers=headers, json=body)
if response.status_code == 200:
print("credential created successfully:", response.json())
return response.json()["id"]
else:
raise RuntimeError(response.__dict__)
@retry(tries=10, delay=2, backoff=2)
def upload_file(run_suffix: str, zip_file_path: str) -> list[str]:
    url = _api_url_builder(run_suffix, "/manage/admin/connector/file/upload")
    try:
        # Use a context manager so the zip file handle is closed even if the request fails
        with open(zip_file_path, "rb") as zip_file:
            response = requests.post(url, files=[("files", zip_file)])
        response.raise_for_status()  # Raises an HTTPError for bad responses
        print("file uploaded successfully:", response.json())
        return response.json()["file_paths"]
    except Exception as e:
        print("File upload failed, waiting for API server to come up and trying again")
        raise e

View File

@@ -0,0 +1,203 @@
import json
import os
import subprocess
from retry import retry
def _run_command(command: str) -> tuple[str, str]:
process = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
if process.returncode != 0:
raise RuntimeError(f"Command failed with error: {stderr.decode()}")
return stdout.decode(), stderr.decode()
def get_current_commit_sha() -> str:
print("Getting current commit SHA...")
stdout, _ = _run_command("git rev-parse HEAD")
sha = stdout.strip()
print(f"Current commit SHA: {sha}")
return sha
def switch_to_branch(branch: str) -> None:
print(f"Switching to branch: {branch}...")
_run_command(f"git checkout {branch}")
_run_command("git pull")
print(f"Successfully switched to branch: {branch}")
print("Repository updated successfully.")
def manage_data_directories(suffix: str, base_path: str, use_cloud_gpu: bool) -> str:
    # Expand a leading "~" in the configured base path
target_path = os.path.join(os.path.expanduser(base_path), f"test{suffix}")
directories = {
"DANSWER_POSTGRES_DATA_DIR": os.path.join(target_path, "postgres/"),
"DANSWER_VESPA_DATA_DIR": os.path.join(target_path, "vespa/"),
}
if not use_cloud_gpu:
directories["DANSWER_INDEX_MODEL_CACHE_DIR"] = os.path.join(
target_path, "index_model_cache/"
)
directories["DANSWER_INFERENCE_MODEL_CACHE_DIR"] = os.path.join(
target_path, "inference_model_cache/"
)
# Create directories if they don't exist
for env_var, directory in directories.items():
os.makedirs(directory, exist_ok=True)
os.environ[env_var] = directory
print(f"Set {env_var} to: {directory}")
relari_output_path = os.path.join(target_path, "relari_output/")
os.makedirs(relari_output_path, exist_ok=True)
return relari_output_path
def set_env_variables(
remote_server_ip: str,
remote_server_port: str,
use_cloud_gpu: bool,
llm_config: dict,
) -> None:
env_vars: dict = {}
env_vars["ENV_SEED_CONFIGURATION"] = json.dumps({"llms": [llm_config]})
env_vars["ENABLE_PAID_ENTERPRISE_EDITION_FEATURES"] = "true"
if use_cloud_gpu:
env_vars["MODEL_SERVER_HOST"] = remote_server_ip
env_vars["MODEL_SERVER_PORT"] = remote_server_port
for env_var_name, env_var in env_vars.items():
os.environ[env_var_name] = env_var
print(f"Set {env_var_name} to: {env_var}")
def start_docker_compose(
run_suffix: str, launch_web_ui: bool, use_cloud_gpu: bool
) -> None:
print("Starting Docker Compose...")
os.chdir(os.path.expanduser("~/danswer/deployment/docker_compose"))
command = f"docker compose -f docker-compose.search-testing.yml -p danswer-stack{run_suffix} up -d"
command += " --build"
command += " --pull always"
command += " --force-recreate"
if not launch_web_ui:
command += " --scale web_server=0"
command += " --scale nginx=0"
if use_cloud_gpu:
command += " --scale indexing_model_server=0"
command += " --scale inference_model_server=0"
print("Docker Command:\n", command)
_run_command(command)
print("The Docker has been Composed :)")
def cleanup_docker(run_suffix: str) -> None:
print(
f"Deleting Docker containers, volumes, and networks for project suffix: {run_suffix}"
)
stdout, _ = _run_command("docker ps -a --format '{{json .}}'")
containers = [json.loads(line) for line in stdout.splitlines()]
project_name = f"danswer-stack{run_suffix}"
containers_to_delete = [
c for c in containers if c["Names"].startswith(project_name)
]
if not containers_to_delete:
print(f"No containers found for project: {project_name}")
else:
container_ids = " ".join([c["ID"] for c in containers_to_delete])
_run_command(f"docker rm -f {container_ids}")
print(
f"Successfully deleted {len(containers_to_delete)} containers for project: {project_name}"
)
stdout, _ = _run_command("docker volume ls --format '{{.Name}}'")
volumes = stdout.splitlines()
volumes_to_delete = [v for v in volumes if v.startswith(project_name)]
if not volumes_to_delete:
print(f"No volumes found for project: {project_name}")
return
# Delete filtered volumes
volume_names = " ".join(volumes_to_delete)
_run_command(f"docker volume rm {volume_names}")
print(
f"Successfully deleted {len(volumes_to_delete)} volumes for project: {project_name}"
)
stdout, _ = _run_command("docker network ls --format '{{.Name}}'")
networks = stdout.splitlines()
networks_to_delete = [n for n in networks if run_suffix in n]
if not networks_to_delete:
print(f"No networks found containing suffix: {run_suffix}")
else:
network_names = " ".join(networks_to_delete)
_run_command(f"docker network rm {network_names}")
print(
f"Successfully deleted {len(networks_to_delete)} networks containing suffix: {run_suffix}"
)
@retry(tries=5, delay=5, backoff=2)
def get_api_server_host_port(suffix: str) -> str:
"""
This pulls all containers with the provided suffix
It then grabs the JSON specific container with a name containing "api_server"
It then grabs the port info from the JSON and strips out the relevent data
"""
container_name = "api_server"
stdout, _ = _run_command("docker ps -a --format '{{json .}}'")
containers = [json.loads(line) for line in stdout.splitlines()]
server_jsons = []
for container in containers:
if container_name in container["Names"] and suffix in container["Names"]:
server_jsons.append(container)
if not server_jsons:
raise RuntimeError(
f"No container found containing: {container_name} and {suffix}"
)
elif len(server_jsons) > 1:
raise RuntimeError(
f"Too many containers matching {container_name} found, please indicate a suffix"
)
server_json = server_jsons[0]
# This is in case the api_server has multiple ports
client_port = "8080"
ports = server_json.get("Ports", "")
port_infos = ports.split(",") if ports else []
port_dict = {}
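    # Illustrative example (not taken from the source): a "Ports" value such as
    # "0.0.0.0:32768->8080/tcp" is split on ":" and "->" below, yielding
    # port_dict == {"8080/tcp": "32768"}, i.e. container port -> published host port.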
for port_info in port_infos:
port_arr = port_info.split(":")[-1].split("->") if port_info else []
if len(port_arr) == 2:
port_dict[port_arr[1]] = port_arr[0]
# Find the host port where client_port is in the key
matching_ports = [value for key, value in port_dict.items() if client_port in key]
if len(matching_ports) > 1:
raise RuntimeError(f"Too many ports matching {client_port} found")
if not matching_ports:
raise RuntimeError(
f"No port found containing: {client_port} for container: {container_name} and suffix: {suffix}"
)
return matching_ports[0]

View File

@@ -0,0 +1,31 @@
import os
from types import SimpleNamespace
import yaml
from tests.regression.answer_quality.api_utils import create_cc_pair
from tests.regression.answer_quality.api_utils import create_connector
from tests.regression.answer_quality.api_utils import create_credential
from tests.regression.answer_quality.api_utils import run_cc_once
from tests.regression.answer_quality.api_utils import upload_file
def upload_test_files(zip_file_path: str, run_suffix: str) -> None:
print("zip:", zip_file_path)
file_paths = upload_file(run_suffix, zip_file_path)
conn_id = create_connector(run_suffix, file_paths)
cred_id = create_credential(run_suffix)
create_cc_pair(run_suffix, conn_id, cred_id)
run_cc_once(run_suffix, conn_id, cred_id)
if __name__ == "__main__":
current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_dir, "search_test_config.yaml")
with open(config_path, "r") as file:
config = SimpleNamespace(**yaml.safe_load(file))
file_location = config.zipped_documents_file
run_suffix = config.existing_test_suffix
upload_test_files(file_location, run_suffix)

View File

@@ -1,138 +1,108 @@
-import argparse
-import json
-
-from sqlalchemy.orm import Session
-
-from danswer.configs.constants import MessageType
-from danswer.db.engine import get_sqlalchemy_engine
-from danswer.one_shot_answer.answer_question import get_search_answer
-from danswer.one_shot_answer.models import DirectQARequest
-from danswer.one_shot_answer.models import OneShotQAResponse
-from danswer.one_shot_answer.models import ThreadMessage
-from danswer.search.models import IndexFilters
-from danswer.search.models import OptionalSearchSetting
-from danswer.search.models import RetrievalDetails
-
-
-def get_answer_for_question(query: str, db_session: Session) -> OneShotQAResponse:
-    filters = IndexFilters(
-        source_type=None,
-        document_set=None,
-        time_cutoff=None,
-        tags=None,
-        access_control_list=None,
-    )
-
-    messages = [ThreadMessage(message=query, sender=None, role=MessageType.USER)]
-
-    new_message_request = DirectQARequest(
-        messages=messages,
-        prompt_id=0,
-        persona_id=0,
-        retrieval_options=RetrievalDetails(
-            run_search=OptionalSearchSetting.ALWAYS,
-            real_time=True,
-            filters=filters,
-            enable_auto_detect_filters=False,
-        ),
-        chain_of_thought=False,
-        return_contexts=True,
-    )
-
-    answer = get_search_answer(
-        query_req=new_message_request,
-        user=None,
-        max_document_tokens=None,
-        max_history_tokens=None,
-        db_session=db_session,
-        answer_generation_timeout=100,
-        enable_reflexion=False,
-        bypass_acl=True,
-    )
-
-    return answer
-
-
-def read_questions(questions_file_path: str) -> list[dict]:
-    samples = []
-    with open(questions_file_path, "r", encoding="utf-8") as file:
-        for line in file:
-            sample = json.loads(line.strip())
-            samples.append(sample)
-    return samples
-
-
-def get_relari_outputs(samples: list[dict]) -> list[dict]:
-    relari_outputs = []
-    with Session(get_sqlalchemy_engine(), expire_on_commit=False) as db_session:
-        for sample in samples:
-            answer = get_answer_for_question(
-                query=sample["question"], db_session=db_session
-            )
-            assert answer.contexts
-
-            relari_outputs.append(
-                {
-                    "label": sample["uid"],
-                    "question": sample["question"],
-                    "answer": answer.answer,
-                    "retrieved_context": [
-                        context.content for context in answer.contexts.contexts
-                    ],
-                }
-            )
-
-    return relari_outputs
-
-
-def write_output_file(relari_outputs: list[dict], output_file: str) -> None:
-    with open(output_file, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-
-
-def main(questions_file: str, output_file: str, limit: int | None = None) -> None:
-    samples = read_questions(questions_file)
-
-    if limit is not None:
-        samples = samples[:limit]
-
-    # Use to be in this format but has since changed
-    # response_dict = {
-    #     "question": sample["question"],
-    #     "retrieved_contexts": [
-    #         context.content for context in answer.contexts.contexts
-    #     ],
-    #     "ground_truth_contexts": sample["ground_truth_contexts"],
-    #     "answer": answer.answer,
-    #     "ground_truths": sample["ground_truths"],
-    # }
-    relari_outputs = get_relari_outputs(samples=samples)
-
-    write_output_file(relari_outputs, output_file)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--questions_file",
-        type=str,
-        help="Path to the Relari questions file.",
-        default="./tests/regression/answer_quality/combined_golden_dataset.jsonl",
-    )
-    parser.add_argument(
-        "--output_file",
-        type=str,
-        help="Path to the output results file.",
-        default="./tests/regression/answer_quality/relari_results.txt",
-    )
-    parser.add_argument(
-        "--limit",
-        type=int,
-        default=None,
-        help="Limit the number of examples to process.",
-    )
-    args = parser.parse_args()
-    main(args.questions_file, args.output_file, args.limit)
+import json
+import os
+import time
+from types import SimpleNamespace
+
+import yaml
+
+from tests.regression.answer_quality.api_utils import check_if_query_ready
+from tests.regression.answer_quality.api_utils import get_answer_from_query
+from tests.regression.answer_quality.cli_utils import get_current_commit_sha
+
+
+def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+    relari_outputs = []
+    while not check_if_query_ready(run_suffix):
+        time.sleep(5)
+
+    for sample in samples:
+        retrieved_context, answer = get_answer_from_query(
+            query=sample["question"],
+            run_suffix=run_suffix,
+        )
+
+        relari_outputs.append(
+            {
+                "label": sample["uid"],
+                "question": sample["question"],
+                "answer": answer,
+                "retrieved_context": retrieved_context,
+            }
+        )
+
+    return relari_outputs
+
+
+def _write_output_file(
+    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
+) -> None:
+    metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}
+
+    counter = 1
+    output_file_path = os.path.join(output_folder_path, "results.txt")
+    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
+    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
+        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
+        metadata_file_path = os.path.join(
+            output_folder_path, f"run_metadata_{counter}.txt"
+        )
+        counter += 1
+    print("saving question results to:", output_file_path)
+    print("saving metadata to:", metadata_file_path)
+
+    with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
+        yaml.dump(metadata, yaml_file)
+
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for output in relari_outputs:
+            file.write(json.dumps(output) + "\n")
+            file.flush()
+
+
+def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
+    questions = []
+    with open(questions_file_path, "r") as file:
+        for line in file:
+            json_obj = json.loads(line)
+            questions.append(json_obj)
+    return questions
+
+
+def answer_relari_questions(
+    questions_file_path: str,
+    results_folder_path: str,
+    run_suffix: str,
+    limit: int | None = None,
+) -> None:
+    samples = _read_questions_jsonl(questions_file_path)
+
+    if limit is not None:
+        samples = samples[:limit]
+
+    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+
+    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+
+
+def main() -> None:
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    config_path = os.path.join(current_dir, "search_test_config.yaml")
+    with open(config_path, "r") as file:
+        config = SimpleNamespace(**yaml.safe_load(file))
+
+    current_output_folder = os.path.expanduser(config.output_folder)
+    if config.existing_test_suffix:
+        current_output_folder = os.path.join(
+            current_output_folder, "test" + config.existing_test_suffix, "relari_output"
+        )
+    else:
+        current_output_folder = os.path.join(current_output_folder, "no_defined_suffix")
+
+    answer_relari_questions(
+        config.questions_file,
+        current_output_folder,
+        config.existing_test_suffix,
+        config.limit,
+    )
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -0,0 +1,58 @@
import os
from datetime import datetime
from types import SimpleNamespace
import yaml
from tests.regression.answer_quality.cli_utils import cleanup_docker
from tests.regression.answer_quality.cli_utils import manage_data_directories
from tests.regression.answer_quality.cli_utils import set_env_variables
from tests.regression.answer_quality.cli_utils import start_docker_compose
from tests.regression.answer_quality.cli_utils import switch_to_branch
from tests.regression.answer_quality.file_uploader import upload_test_files
from tests.regression.answer_quality.relari import answer_relari_questions
def load_config(config_filename: str) -> SimpleNamespace:
current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_dir, config_filename)
with open(config_path, "r") as file:
return SimpleNamespace(**yaml.safe_load(file))
def main() -> None:
config = load_config("search_test_config.yaml")
if config.existing_test_suffix:
run_suffix = config.existing_test_suffix
print("launching danswer with existing data suffix:", run_suffix)
else:
run_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
print("run_suffix:", run_suffix)
set_env_variables(
config.model_server_ip,
config.model_server_port,
config.use_cloud_gpu,
config.llm,
)
relari_output_folder_path = manage_data_directories(
run_suffix, config.output_folder, config.use_cloud_gpu
)
if config.branch:
switch_to_branch(config.branch)
start_docker_compose(run_suffix, config.launch_web_ui, config.use_cloud_gpu)
if not config.existing_test_suffix:
upload_test_files(config.zipped_documents_file, run_suffix)
answer_relari_questions(
config.questions_file, relari_output_folder_path, run_suffix, config.limit
)
if config.clean_up_docker_containers:
cleanup_docker(run_suffix)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# Directory where test results will be saved
output_folder: "~/danswer_test_results"
# Path to the zip file containing sample documents
zipped_documents_file: "~/sampledocs.zip"
# Path to the file containing sample questions
questions_file: "~/sample_questions.yaml"
# Git branch to use (null means use current branch as is)
branch: null
# Whether to remove Docker containers after the test
clean_up_docker_containers: true
# Whether to launch a web UI for the test
launch_web_ui: false
# Whether to use a cloud GPU for processing
use_cloud_gpu: false
# IP address of the model server (placeholder)
model_server_ip: "PUT_PUBLIC_CLOUD_IP_HERE"
# Port of the model server (placeholder)
model_server_port: "PUT_PUBLIC_CLOUD_PORT_HERE"
# Suffix of an existing test run to reuse (empty string means run a new test)
existing_test_suffix: ""
# Limit on the number of questions to ask (null means no limit)
limit: null
# LLM configuration
llm:
# Name of the LLM
name: "llm_name"
# Provider of the LLM (e.g., OpenAI)
provider: "openai"
# API key
api_key: "PUT_API_KEY_HERE"
# Default model name to use
default_model_name: "gpt-4o"
# List of model names to use for testing
model_names: ["gpt-4o"]