import numpy as np

from onyx.agents.agent_search.shared_graph_utils.models import AnswerGenerationDocuments
from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitScoreMetrics
from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitStats
from onyx.agents.agent_search.shared_graph_utils.operators import (
    dedup_inference_section_list,
)
from onyx.chat.models import SectionRelevancePiece
from onyx.context.search.models import InferenceSection
from onyx.utils.logger import setup_logger

logger = setup_logger()


def unique_chunk_id(doc: InferenceSection) -> str:
    return f"{doc.center_chunk.document_id}_{doc.center_chunk.chunk_id}"


def calculate_rank_shift(list1: list, list2: list, top_n: int = 20) -> float:
    """Measure how far the top_n documents of list1 moved in list2.

    Each rank change is discounted by log(1 + rank_first * rank_second), so
    movement near the top of the lists contributes more. Documents missing
    from list2 are treated as if they were ranked at position len(list2).
    """
    shift = 0.0
    for rank_first, doc_id in enumerate(list1[:top_n], 1):
        try:
            rank_second = list2.index(doc_id) + 1
        except ValueError:
            rank_second = len(list2)  # Document not found in second list

        shift += np.abs(rank_first - rank_second) / np.log(1 + rank_first * rank_second)

    return shift / top_n
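# Illustrative walk-through of calculate_rank_shift (hypothetical IDs, values
# rounded):
#
#   list1 = ["a", "b", "c"], list2 = ["b", "a", "c"], top_n = 3
#
#   "a": |1 - 2| / ln(1 + 1 * 2) ≈ 0.910
#   "b": |2 - 1| / ln(1 + 2 * 1) ≈ 0.910
#   "c": |3 - 3| / ln(1 + 3 * 3)  = 0.000
#
#   result = (0.910 + 0.910 + 0.000) / 3 ≈ 0.607
#
# Identical orderings yield 0.0, and swaps near the top of the lists dominate
# the score. Note that the sum is divided by top_n even when list1 contains
# fewer than top_n documents.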
def get_fit_scores(
    pre_reranked_results: list[InferenceSection],
    post_reranked_results: list[InferenceSection] | list[SectionRelevancePiece],
) -> RetrievalFitStats | None:
    """
    Calculate retrieval metrics for search purposes
    """
    if len(pre_reranked_results) == 0 or len(post_reranked_results) == 0:
        return None

    ranked_sections = {
        "initial": pre_reranked_results,
        "reranked": post_reranked_results,
    }

    fit_eval: RetrievalFitStats = RetrievalFitStats(
        fit_score_lift=0,
        rerank_effect=0,
        fit_scores={
            "initial": RetrievalFitScoreMetrics(scores={}, chunk_ids=[]),
            "reranked": RetrievalFitScoreMetrics(scores={}, chunk_ids=[]),
        },
    )

    for rank_type, docs in ranked_sections.items():
        logger.debug(f"rank_type: {rank_type}")

        # Average score of the top 1, 5, and 10 results (entries without
        # scores contribute 0)
        for i in [1, 5, 10]:
            fit_eval.fit_scores[rank_type].scores[str(i)] = (
                sum(
                    [
                        float(doc.center_chunk.score)
                        for doc in docs[:i]
                        if isinstance(doc, InferenceSection)
                        and doc.center_chunk.score is not None
                    ]
                )
                / i
            )

        fit_eval.fit_scores[rank_type].scores["fit_score"] = (
            1
            / 3
            * (
                fit_eval.fit_scores[rank_type].scores["1"]
                + fit_eval.fit_scores[rank_type].scores["5"]
                + fit_eval.fit_scores[rank_type].scores["10"]
            )
        )

        # This overrides the averaged value above; only the top-1 score is
        # used as the reported fit_score
        fit_eval.fit_scores[rank_type].scores["fit_score"] = fit_eval.fit_scores[
            rank_type
        ].scores["1"]

        fit_eval.fit_scores[rank_type].chunk_ids = [
            unique_chunk_id(doc) for doc in docs if isinstance(doc, InferenceSection)
        ]

    fit_eval.fit_score_lift = (
        fit_eval.fit_scores["reranked"].scores["fit_score"]
        / fit_eval.fit_scores["initial"].scores["fit_score"]
    )

    fit_eval.rerank_effect = calculate_rank_shift(
        fit_eval.fit_scores["initial"].chunk_ids,
        fit_eval.fit_scores["reranked"].chunk_ids,
    )

    return fit_eval


def get_answer_generation_documents(
    relevant_docs: list[InferenceSection],
    context_documents: list[InferenceSection],
    original_question_docs: list[InferenceSection],
    max_docs: int,
) -> AnswerGenerationDocuments:
    """
    Create a deduplicated list of documents to stream, prioritizing relevant docs.

    Args:
        relevant_docs: Primary documents to include
        context_documents: Additional context documents to append
        original_question_docs: Original question documents to append
        max_docs: Maximum number of documents to return

    Returns:
        AnswerGenerationDocuments containing the deduplicated streaming documents
        and context documents, each limited to max_docs
    """
    # Get the relevant doc ids
    relevant_doc_ids = [doc.center_chunk.document_id for doc in relevant_docs]

    # Start with the relevant docs
    streaming_documents = relevant_docs.copy()

    # Use a set for O(1) lookups of document IDs
    seen_doc_ids = {doc.center_chunk.document_id for doc in streaming_documents}

    # Append any additional documents whose document_id has not been seen yet
    additional_docs = context_documents + original_question_docs
    for doc in additional_docs:
        doc_id = doc.center_chunk.document_id
        if doc_id not in seen_doc_ids:
            streaming_documents.append(doc)
            seen_doc_ids.add(doc_id)

    streaming_documents = dedup_inference_section_list(streaming_documents)

    relevant_streaming_docs = [
        doc
        for doc in streaming_documents
        if doc.center_chunk.document_id in relevant_doc_ids
    ]
    relevant_streaming_docs = dedup_sort_inference_section_list(relevant_streaming_docs)

    additional_streaming_docs = [
        doc
        for doc in streaming_documents
        if doc.center_chunk.document_id not in relevant_doc_ids
    ]
    additional_streaming_docs = dedup_sort_inference_section_list(
        additional_streaming_docs
    )

    # Penalize the additional docs so they sort below the relevant docs
    for doc in additional_streaming_docs:
        if doc.center_chunk.score is not None:
            doc.center_chunk.score -= 2.0
        else:
            doc.center_chunk.score = -2.0

    sorted_streaming_documents = relevant_streaming_docs + additional_streaming_docs

    return AnswerGenerationDocuments(
        streaming_documents=sorted_streaming_documents[:max_docs],
        context_documents=relevant_streaming_docs[:max_docs],
    )


def dedup_sort_inference_section_list(
    sections: list[InferenceSection],
) -> list[InferenceSection]:
    """Deduplicates InferenceSections by document_id and sorts by score.

    Args:
        sections: List of InferenceSections to deduplicate and sort

    Returns:
        Deduplicated list of InferenceSections sorted by score in descending order
    """
    # Dedupe/merge with the existing framework helper first
    sections = dedup_inference_section_list(sections)

    # Use a dict to deduplicate by document_id, keeping the highest scored version
    unique_sections: dict[str, InferenceSection] = {}
    for section in sections:
        doc_id = section.center_chunk.document_id
        if doc_id not in unique_sections:
            unique_sections[doc_id] = section
            continue

        # Keep the version with the higher score
        existing_score = unique_sections[doc_id].center_chunk.score or 0
        new_score = section.center_chunk.score or 0
        if new_score > existing_score:
            unique_sections[doc_id] = section

    # Sort by score in descending order, handling None scores
    sorted_sections = sorted(
        unique_sections.values(), key=lambda x: x.center_chunk.score or 0, reverse=True
    )

    return sorted_sections
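# Rough smoke-check sketch for the helpers that operate on plain lists; it
# avoids constructing InferenceSection objects, so it only covers
# calculate_rank_shift and the empty-input guard of get_fit_scores.
if __name__ == "__main__":
    ids = ["doc_a__0", "doc_b__0", "doc_c__0"]  # hypothetical chunk ids

    # An unchanged ordering produces no rank shift
    assert calculate_rank_shift(ids, ids, top_n=len(ids)) == 0.0

    # Swapping the top two documents produces a positive shift
    swapped = [ids[1], ids[0], ids[2]]
    assert calculate_rank_shift(ids, swapped, top_n=len(ids)) > 0.0

    # Fit scores are undefined when either result list is empty
    assert get_fit_scores([], []) is None

    logger.info("rank-shift smoke checks passed")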