writing data

This commit is contained in:
joachim-danswer 2025-03-16 12:40:09 -07:00
parent 83d5b3b503
commit ab11bf6552


@@ -1,9 +1,12 @@
import csv
import json
import os
import string
from collections.abc import Callable
from collections.abc import Mapping
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import cast
@@ -285,15 +288,61 @@ def parallel_visit_api_retrieval(
return inference_chunks
@retry(tries=3, delay=1, backoff=2)
def _append_ranking_stats_to_csv(
ranking_stats: list[tuple[str, float, str, str, str, float]],
csv_path: str = "/tmp/ranking_stats.csv",
) -> None:
"""
Append ranking statistics to a CSV file.
Args:
ranking_stats: List of tuples containing (query, hit_position, document_id)
csv_path: Path to the CSV file to append to
"""
file_exists = os.path.isfile(csv_path)
# Create directory if it doesn't exist
csv_dir = os.path.dirname(csv_path)
if csv_dir and not os.path.exists(csv_dir):
Path(csv_dir).mkdir(parents=True, exist_ok=True)
with open(csv_path, mode="a", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
# Write header if file is new
if not file_exists:
writer.writerow(
["category", "query_alpha", "query", "hit_position", "document_id", "relevance"]
)
# Write the ranking stats
for cat, query_alpha, query, hit_pos, doc_chunk_id, relevance in ranking_stats:
writer.writerow([cat, query_alpha, query, hit_pos, doc_chunk_id, relevance])
logger.debug(f"Appended {len(ranking_stats)} ranking stats to {csv_path}")
@retry(tries=1, delay=1, backoff=2)
def query_vespa(
query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
ranking_stats: list[tuple[str, float, str, str, str, float]] = []
search_time = 0.0
for query_alpha in [0.4, 0.7, 1.0]:
date_time_start = datetime.now()
# Create a mutable copy of the query_params
mutable_params = dict(query_params)
# Now we can modify it without mypy errors
mutable_params["input.query(alpha)"] = query_alpha
params = dict(
**mutable_params,
**{
"presentation.timing": True,
}
@@ -342,6 +391,37 @@ def query_vespa(
f"fetch this document"
)
for hit_pos, hit in enumerate(hits):
ranking_stats.append(
(
"Retrieval",
query_alpha,
cast(str, mutable_params["query"]),
str(hit_pos),
hit["fields"].get("document_id", "")
+ "__"
+ str(hit["fields"].get("chunk_id", "")),
hit.get("relevance", 0),
)
)
date_time_end = datetime.now()
search_time += (date_time_end - date_time_start).total_seconds()
ranking_stats.append(
(
"Timing",
query_alpha,
cast(str, query_params["query"]).strip(),
"",
"",
search_time,
)
)
if ranking_stats:
_append_ranking_stats_to_csv(ranking_stats)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
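
As a side note, the CSV written by _append_ranking_stats_to_csv can be post-processed to compare the three alpha settings. Below is a minimal sketch, assuming the default /tmp/ranking_stats.csv path and the header columns written above; it is not part of the commit:

# Illustrative sketch: average the logged relevance per query_alpha.
import csv
from collections import defaultdict

totals: dict[str, list[float]] = defaultdict(list)
with open("/tmp/ranking_stats.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        if row["category"] == "Retrieval":
            totals[row["query_alpha"]].append(float(row["relevance"]))

for alpha, scores in sorted(totals.items()):
    print(f"alpha={alpha}: mean relevance {sum(scores) / len(scores):.4f}")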