Eval Script Incremental Write (#1784)

Author: Yuhong Sun
Date: 2024-07-06 15:43:40 -07:00
Committed by: GitHub
Parent: d35d7ee833
Commit: f0888f2f61

5 changed files with 88 additions and 40 deletions


@@ -15,8 +15,9 @@ This Python script automates the process of running search quality tests for a b
 ## Usage
 1. Ensure you have the required dependencies installed.
-2. Configure the `search_test_config.yaml` file with your settings.
-3. Navigate to the answer_quality folder:
+2. Configure the `search_test_config.yaml` file based on the `search_test_config.yaml.template` file.
+3. Configure the `.env_eval` file with the correct environment variables.
+4. Navigate to the answer_quality folder:
 ```
 cd danswer/backend/tests/regression/answer_quality
 ```
@@ -63,6 +64,11 @@ Edit `search_test_config.yaml` to set:
 - llm
   Fill this out according to the normal LLM seeding
+
+To restart the evaluation using a particular existing index, set the suffix and turn off clean_up_docker_containers.
+This will also skip running the evaluation questions; in that case, the relari.py script can be run manually.
+The Docker daemon must be running for this to work.
+
+Each script can also be run individually to upload additional docs or run additional tests.
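For orientation, here is a rough sketch of the restart flow described above. The `clean_up_docker_containers` key is named in this README; the suffix key name used here (`existing_test_suffix`) is an assumption and may differ in the actual `search_test_config.yaml.template`.

```python
import yaml  # requires PyYAML

# Hypothetical helper: reuse an existing index by pointing the config at its
# suffix and keeping its Docker containers alive between runs.
# "existing_test_suffix" is an assumed key name; "clean_up_docker_containers"
# comes from this README.
CONFIG_PATH = "search_test_config.yaml"

with open(CONFIG_PATH, encoding="utf-8") as f:
    config = yaml.safe_load(f)

config["existing_test_suffix"] = "_1234"      # suffix of the run/index to reuse
config["clean_up_docker_containers"] = False  # keep containers so the index survives

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    yaml.dump(config, f)
```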


@@ -1,18 +1,51 @@
 import json
 import os
 import subprocess
+import sys
+from threading import Thread
+from typing import IO

 from retry import retry


-def _run_command(command: str) -> tuple[str, str]:
+def _run_command(command: str, stream_output: bool = False) -> tuple[str, str]:
     process = subprocess.Popen(
-        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,
     )
-    stdout, stderr = process.communicate()
+
+    stdout_lines: list[str] = []
+    stderr_lines: list[str] = []
+
+    def process_stream(stream: IO[str], lines: list[str]) -> None:
+        for line in stream:
+            lines.append(line)
+            if stream_output:
+                print(
+                    line,
+                    end="",
+                    file=sys.stdout if stream == process.stdout else sys.stderr,
+                )
+
+    stdout_thread = Thread(target=process_stream, args=(process.stdout, stdout_lines))
+    stderr_thread = Thread(target=process_stream, args=(process.stderr, stderr_lines))
+    stdout_thread.start()
+    stderr_thread.start()
+    stdout_thread.join()
+    stderr_thread.join()
+
+    process.wait()
+
     if process.returncode != 0:
-        raise RuntimeError(f"Command failed with error: {stderr.decode()}")
-    return stdout.decode(), stderr.decode()
+        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
+
+    return "".join(stdout_lines), "".join(stderr_lines)


 def get_current_commit_sha() -> str:
@@ -92,8 +125,8 @@ def start_docker_compose(
print("Docker Command:\n", command)
_run_command(command)
print("The Docker has been Composed :)")
_run_command(command, stream_output=True)
print("Containers have been launched")
def cleanup_docker(run_suffix: str) -> None:
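For context, below is a self-contained sketch of the pattern the new `_run_command` uses: one reader thread per pipe, so stdout and stderr are captured fully and can be echoed live without deadlocking on a full pipe buffer. The helper name and the demo command are illustrative only and are not part of this commit.

```python
import subprocess
import sys
from threading import Thread
from typing import IO


def run_streaming(command: str, stream_output: bool = True) -> tuple[str, str]:
    # Line-buffered text pipes so output can be relayed as it arrives.
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    stdout_lines: list[str] = []
    stderr_lines: list[str] = []

    def pump(stream: IO[str], lines: list[str], sink: IO[str]) -> None:
        # Drain one pipe on its own thread so the other pipe never blocks.
        for line in stream:
            lines.append(line)
            if stream_output:
                print(line, end="", file=sink)

    threads = [
        Thread(target=pump, args=(process.stdout, stdout_lines, sys.stdout)),
        Thread(target=pump, args=(process.stderr, stderr_lines, sys.stderr)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    process.wait()

    if process.returncode != 0:
        raise RuntimeError(f"Command failed with error: {''.join(stderr_lines)}")
    return "".join(stdout_lines), "".join(stderr_lines)


if __name__ == "__main__":
    # Demo command chosen for portability; a real caller would pass e.g. a
    # docker compose invocation.
    out, _ = run_streaming(f'"{sys.executable}" -c "print(42)"')
    print("captured:", out.strip())
```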


@@ -10,51 +10,40 @@ from tests.regression.answer_quality.api_utils import get_answer_from_query
 from tests.regression.answer_quality.cli_utils import get_current_commit_sha


-def _get_relari_outputs(samples: list[dict], run_suffix: str) -> list[dict]:
+def _get_and_write_relari_outputs(
+    samples: list[dict], run_suffix: str, output_file_path: str
+) -> None:
     while not check_if_query_ready(run_suffix):
         time.sleep(5)

-    relari_outputs = []
-    for sample in samples:
-        retrieved_context, answer = get_answer_from_query(
-            query=sample["question"],
-            run_suffix=run_suffix,
-        )
+    with open(output_file_path, "w", encoding="utf-8") as file:
+        for sample in samples:
+            retrieved_context, answer = get_answer_from_query(
+                query=sample["question"],
+                run_suffix=run_suffix,
+            )

-        relari_outputs.append(
-            {
+            if not answer:
+                print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
+                continue
+
+            output = {
                 "label": sample["uid"],
                 "question": sample["question"],
                 "answer": answer,
                 "retrieved_context": retrieved_context,
             }
-        )
-
-    return relari_outputs
+            file.write(json.dumps(output) + "\n")
+            file.flush()


-def _write_output_file(
-    relari_outputs: list[dict], output_folder_path: str, run_suffix: str
-) -> None:
+def _write_metadata_file(run_suffix: str, metadata_file_path: str) -> None:
     metadata = {"commit_sha": get_current_commit_sha(), "run_suffix": run_suffix}
-
-    counter = 1
-    output_file_path = os.path.join(output_folder_path, "results.txt")
-    metadata_file_path = os.path.join(output_folder_path, "run_metadata.yaml")
-    while os.path.exists(output_file_path) or os.path.exists(metadata_file_path):
-        output_file_path = os.path.join(output_folder_path, f"results_{counter}.txt")
-        metadata_file_path = os.path.join(
-            output_folder_path, f"run_metadata_{counter}.txt"
-        )
-        counter += 1
-
-    print("saving question results to:", output_file_path)
     print("saving metadata to:", metadata_file_path)
     with open(metadata_file_path, "w", encoding="utf-8") as yaml_file:
         yaml.dump(metadata, yaml_file)
-
-    with open(output_file_path, "w", encoding="utf-8") as file:
-        for output in relari_outputs:
-            file.write(json.dumps(output) + "\n")
-            file.flush()


 def _read_questions_jsonl(questions_file_path: str) -> list[dict]:
@@ -72,14 +61,32 @@ def answer_relari_questions(
     run_suffix: str,
     limit: int | None = None,
 ) -> None:
+    results_file = "run_results.jsonl"
+    metadata_file = "run_metadata.yaml"
     samples = _read_questions_jsonl(questions_file_path)

     if limit is not None:
         samples = samples[:limit]

-    relari_outputs = _get_relari_outputs(samples=samples, run_suffix=run_suffix)
+    counter = 1
+    output_file_path = os.path.join(results_folder_path, results_file)
+    metadata_file_path = os.path.join(results_folder_path, metadata_file)
+    while os.path.exists(output_file_path):
+        output_file_path = os.path.join(
+            results_folder_path,
+            results_file.replace("run_results", f"run_results_{counter}"),
+        )
+        metadata_file_path = os.path.join(
+            results_folder_path,
+            metadata_file.replace("run_metadata", f"run_metadata_{counter}"),
+        )
+        counter += 1

-    _write_output_file(relari_outputs, results_folder_path, run_suffix)
+    print("saving question results to:", output_file_path)
+    _write_metadata_file(run_suffix, metadata_file_path)
+    _get_and_write_relari_outputs(
+        samples=samples, run_suffix=run_suffix, output_file_path=output_file_path
+    )


 def main() -> None:
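As a rough illustration of the incremental-write behavior this commit introduces: first pick a results path that does not clobber earlier runs, then write one JSON line per answered question and flush immediately so partial results survive an interrupted run. The helper name, folder, and sample data below are made up for illustration; the file-name scheme and flush-per-line behavior mirror the diff.

```python
import json
import os


def pick_fresh_path(folder: str, base: str = "run_results.jsonl") -> str:
    # Mirrors the diff's scheme: run_results.jsonl, run_results_1.jsonl, ...
    path = os.path.join(folder, base)
    counter = 1
    while os.path.exists(path):
        path = os.path.join(
            folder, base.replace("run_results", f"run_results_{counter}")
        )
        counter += 1
    return path


# Illustrative samples; the real script pulls answers from the running stack.
samples = [
    {"uid": "q1", "question": "What does the eval script do?", "answer": "Runs QA tests."},
    {"uid": "q2", "question": "Unanswerable question", "answer": ""},
]

output_path = pick_fresh_path(".")
with open(output_path, "w", encoding="utf-8") as f:
    for sample in samples:
        if not sample["answer"]:
            print("NO ANSWER GIVEN FOR QUESTION:", sample["question"])
            continue
        f.write(json.dumps(sample) + "\n")
        f.flush()  # flush per line so an interrupted run still leaves results on disk
print("saved incremental results to:", output_path)
```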


@@ -37,7 +37,7 @@ limit: null
 # LLM configuration
 llm:
   # Name of the LLM
-  name: "llm_name"
+  name: "default_test_llm"
   # Provider of the LLM (e.g., OpenAI)
   provider: "openai"


@@ -23,6 +23,7 @@ services:
       - VESPA_HOST=index
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"
@@ -52,6 +53,7 @@ services:
       - MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-inference_model_server}
       - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
       - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server}
+      - ENV_SEED_CONFIGURATION=${ENV_SEED_CONFIGURATION:-}
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True
     extra_hosts:
       - "host.docker.internal:host-gateway"