from typing import Optional

from pydantic import BaseModel
from sqlalchemy.orm import Session

from onyx.context.search.models import InferenceChunk
from onyx.db.models import Document
from onyx.utils.logger import setup_logger
from tests.regression.search_quality.util_retrieve import group_by_documents

logger = setup_logger(__name__)


class Metrics(BaseModel):
    # computed if ground truth is provided
    ground_truth_ratio_topk: Optional[float] = None
    ground_truth_avg_rank_delta: Optional[float] = None

    # computed if reranked results are provided
    soft_truth_ratio_topk: Optional[float] = None
    soft_truth_avg_rank_delta: Optional[float] = None


metric_names = list(Metrics.model_fields.keys())
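
# Note (illustrative, not part of the original file): with pydantic v2,
# model_fields preserves declaration order, so metric_names evaluates to
# ["ground_truth_ratio_topk", "ground_truth_avg_rank_delta",
#  "soft_truth_ratio_topk", "soft_truth_avg_rank_delta"].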


def get_corresponding_document(
    doc_link: str, db_session: Session
) -> Optional[Document]:
    """Get the corresponding document from the database."""
    doc_filter = db_session.query(Document).filter(Document.link == doc_link)
    count = doc_filter.count()
    if count == 0:
        logger.warning(f"Could not find document with link {doc_link}, ignoring")
        return None
    if count > 1:
        logger.warning(f"Found multiple documents with link {doc_link}, using first")
    return doc_filter.first()
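
# Usage sketch (hypothetical link, not from the original file): with an open
# SQLAlchemy session, a ground-truth entry can be resolved via
#     doc = get_corresponding_document("https://example.com/page", db_session)
# which returns None (and logs a warning) when no Document row has that link.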


def evaluate_one_query(
    search_chunks: list[InferenceChunk],
    rerank_chunks: list[InferenceChunk],
    true_documents: list[Document],
    topk: int,
) -> Metrics:
    """Computes metrics for the search results, relative to the ground truth and reranked results."""
    metrics_dict: dict[str, float] = {}

    search_documents = group_by_documents(search_chunks)
    search_ranks = {docid: rank for rank, docid in enumerate(search_documents)}
    search_ranks_topk = {
        docid: rank for rank, docid in enumerate(search_documents[:topk])
    }
    true_ranks = {doc.id: rank for rank, doc in enumerate(true_documents)}

    if true_documents:
        metrics_dict["ground_truth_ratio_topk"] = _compute_ratio(
            search_ranks_topk, true_ranks
        )
        metrics_dict["ground_truth_avg_rank_delta"] = _compute_avg_rank_delta(
            search_ranks, true_ranks
        )

    if rerank_chunks:
        # build soft truth out of ground truth + reranked results, up to topk
        soft_ranks = true_ranks
        for docid in group_by_documents(rerank_chunks):
            if len(soft_ranks) >= topk:
                break
            if docid not in soft_ranks:
                soft_ranks[docid] = len(soft_ranks)

        metrics_dict["soft_truth_ratio_topk"] = _compute_ratio(
            search_ranks_topk, soft_ranks
        )
        metrics_dict["soft_truth_avg_rank_delta"] = _compute_avg_rank_delta(
            search_ranks, soft_ranks
        )

    return Metrics(**metrics_dict)
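
# Soft-truth construction, illustrated (hypothetical values, not from the
# original file): with true_ranks = {"a": 0}, reranked document order
# ["b", "a", "c"], and topk = 3, the loop in evaluate_one_query yields
# soft_ranks = {"a": 0, "b": 1, "c": 2}: "b" is appended at rank 1, "a" is
# skipped because it is already present, and "c" fills the last slot.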


def _compute_ratio(search_ranks: dict[str, int], true_ranks: dict[str, int]) -> float:
    return len(set(search_ranks) & set(true_ranks)) / len(true_ranks)


def _compute_avg_rank_delta(
    search_ranks: dict[str, int], true_ranks: dict[str, int]
) -> float:
    out = len(search_ranks)
    return sum(
        abs(search_ranks.get(docid, out) - rank) for docid, rank in true_ranks.items()
    ) / len(true_ranks)
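

# Worked example (illustrative values, not part of the original file): with
# search_ranks = {"a": 0, "b": 1, "c": 2} and true_ranks = {"a": 0, "c": 1},
# _compute_ratio returns 2 / 2 = 1.0 (both true documents were retrieved),
# while _compute_avg_rank_delta returns (|0 - 0| + |2 - 1|) / 2 = 0.5; a true
# document missing from search_ranks is penalized with the fallback rank
# len(search_ranks).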