2024-12-13 09:56:10 -08:00

139 lines
3.8 KiB
Python

from collections.abc import Sequence
from typing import TypeVar
from onyx.chat.models import SectionRelevancePiece
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SavedSearchDocWithContent
from onyx.context.search.models import SearchDoc
from onyx.db.models import SearchDoc as DBSearchDoc
# Any search-result-like model that exposes a document id, either directly
# (chunks/docs) or via its center chunk (sections). Used by dedupe_documents.
T = TypeVar(
"T",
InferenceSection,
InferenceChunk,
SearchDoc,
SavedSearchDoc,
SavedSearchDocWithContent,
)
# Same idea but restricted to section/doc-level models (no bare InferenceChunk);
# used by relevant_sections_to_indices.
TSection = TypeVar(
"TSection",
InferenceSection,
SearchDoc,
SavedSearchDoc,
SavedSearchDocWithContent,
)
def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
    """Drop items whose document was already seen earlier in the list.

    The first occurrence of each document id wins. Returns the deduplicated
    items plus the positions (in the original list) of everything dropped,
    so callers can remap any index-based references.
    """
    seen_doc_ids: set = set()
    kept: list[T] = []
    dropped: list[int] = []
    for position, entry in enumerate(items):
        # Sections carry their id on the center chunk; all other models
        # expose document_id directly.
        doc_id = (
            entry.center_chunk.document_id
            if isinstance(entry, InferenceSection)
            else entry.document_id
        )
        if doc_id in seen_doc_ids:
            dropped.append(position)
        else:
            seen_doc_ids.add(doc_id)
            kept.append(entry)
    return kept, dropped
def relevant_sections_to_indices(
    relevance_sections: list[SectionRelevancePiece] | None, items: list[TSection]
) -> list[int]:
    """Return indices of *items* that were judged relevant.

    Relevance is matched on the (document_id, chunk id) pair: sections are
    keyed by their center chunk, every other model by its own
    document_id/chunk_ind. An empty/None relevance list yields [].
    """
    if not relevance_sections:
        return []
    relevant_keys = {
        (piece.document_id, piece.chunk_id)
        for piece in relevance_sections
        if piece.relevant
    }
    indices: list[int] = []
    for idx, item in enumerate(items):
        if isinstance(item, InferenceSection):
            key = (item.center_chunk.document_id, item.center_chunk.chunk_id)
        else:
            key = (item.document_id, item.chunk_ind)
        if key in relevant_keys:
            indices.append(idx)
    return indices
def drop_llm_indices(
    llm_indices: list[int],
    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
    dropped_indices: list[int],
) -> list[int]:
    """Remap LLM-selected doc indices after some docs were dropped (deduped).

    Given indices into the original *search_docs* list and the positions that
    were removed, return the equivalent indices into the post-drop list.
    Selected docs that were themselves dropped simply disappear from the
    result.
    """
    # Sets give O(1) membership; the original list scans made this O(n^2).
    llm_index_set = set(llm_indices)
    dropped_set = set(dropped_indices)
    remapped: list[int] = []
    new_index = 0
    for old_index in range(len(search_docs)):
        if old_index in dropped_set:
            continue  # doc removed -> no new index, selection (if any) is lost
        if old_index in llm_index_set:
            remapped.append(new_index)
        new_index += 1
    return remapped
def inference_section_from_chunks(
    center_chunk: InferenceChunk,
    chunks: list[InferenceChunk],
) -> InferenceSection | None:
    """Build an InferenceSection from its chunks, or None if *chunks* is empty.

    The section's combined text is the chunk contents joined with newlines,
    in the order given.
    """
    if not chunks:
        return None
    merged_text = "\n".join(chunk.content for chunk in chunks)
    return InferenceSection(
        center_chunk=center_chunk,
        chunks=chunks,
        combined_content=merged_text,
    )
def chunks_or_sections_to_search_docs(
    items: Sequence[InferenceChunk | InferenceSection] | None,
) -> list[SearchDoc]:
    """Convert inference chunks/sections into SearchDoc models.

    Sections are represented by their center chunk. Returns [] for a None or
    empty input.
    """
    if not items:
        return []
    docs: list[SearchDoc] = []
    for item in items:
        # Normalize: every item boils down to a single representative chunk.
        chunk = item.center_chunk if isinstance(item, InferenceSection) else item
        docs.append(
            SearchDoc(
                document_id=chunk.document_id,
                chunk_ind=chunk.chunk_id,
                semantic_identifier=chunk.semantic_identifier or "Unknown",
                link=chunk.source_links[0] if chunk.source_links else None,
                blurb=chunk.blurb,
                source_type=chunk.source_type,
                boost=chunk.boost,
                hidden=chunk.hidden,
                metadata=chunk.metadata,
                score=chunk.score,
                match_highlights=chunk.match_highlights,
                updated_at=chunk.updated_at,
                primary_owners=chunk.primary_owners,
                secondary_owners=chunk.secondary_owners,
                is_internet=False,
            )
        )
    return docs