mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-12 14:12:53 +02:00
welcome to onyx
This commit is contained in:
138
backend/onyx/context/search/utils.py
Normal file
138
backend/onyx/context/search/utils.py
Normal file
@ -0,0 +1,138 @@
|
||||
from collections.abc import Sequence
|
||||
from typing import TypeVar
|
||||
|
||||
from onyx.chat.models import SectionRelevancePiece
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.context.search.models import SavedSearchDocWithContent
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.models import SearchDoc as DBSearchDoc
|
||||
|
||||
|
||||
# Constrained type variable for helpers that accept any of the search result
# representations below and hand back the same kind they were given
# (see dedupe_documents).
T = TypeVar(
    "T",
    InferenceSection,
    InferenceChunk,
    SearchDoc,
    SavedSearchDoc,
    SavedSearchDocWithContent,
)
|
||||
|
||||
# Constrained type variable with the same members as T except InferenceChunk;
# used by relevant_sections_to_indices for its `items` parameter.
TSection = TypeVar(
    "TSection",
    InferenceSection,
    SearchDoc,
    SavedSearchDoc,
    SavedSearchDocWithContent,
)
|
||||
|
||||
|
||||
def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
    """Keep only the first item seen for each document id.

    For an ``InferenceSection`` the document id is read from its
    ``center_chunk``; for every other item it is read from the item itself.

    Returns:
        A pair ``(kept, dropped)`` where ``kept`` holds the surviving items in
        their original order and ``dropped`` holds the positions (in ``items``)
        of the entries that were removed as duplicates.
    """
    kept: list[T] = []
    duplicate_positions: list[int] = []
    known_document_ids = set()

    for position, entry in enumerate(items):
        id_holder = entry.center_chunk if isinstance(entry, InferenceSection) else entry
        doc_id = id_holder.document_id

        if doc_id in known_document_ids:
            # A previous entry already represents this document.
            duplicate_positions.append(position)
            continue

        known_document_ids.add(doc_id)
        kept.append(entry)

    return kept, duplicate_positions
|
||||
|
||||
|
||||
def relevant_sections_to_indices(
    relevance_sections: list[SectionRelevancePiece] | None, items: list[TSection]
) -> list[int]:
    """Return the positions in ``items`` that were marked as relevant.

    A relevance piece counts only when its ``relevant`` flag is truthy. Items
    are matched on a ``(document_id, chunk id)`` pair: an ``InferenceSection``
    is keyed by its ``center_chunk`` (``document_id``, ``chunk_id``), any other
    item by its own ``document_id`` and ``chunk_ind``.

    Returns an empty list when ``relevance_sections`` is ``None`` or empty.
    """
    if not relevance_sections:
        return []

    relevant_keys = set()
    for piece in relevance_sections:
        if piece.relevant:
            relevant_keys.add((piece.document_id, piece.chunk_id))

    matching_positions: list[int] = []
    for position, entry in enumerate(items):
        if isinstance(entry, InferenceSection):
            center = entry.center_chunk
            key = (center.document_id, center.chunk_id)
        else:
            key = (entry.document_id, entry.chunk_ind)

        if key in relevant_keys:
            matching_positions.append(position)

    return matching_positions
|
||||
|
||||
|
||||
def drop_llm_indices(
    llm_indices: list[int],
    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
    dropped_indices: list[int],
) -> list[int]:
    """Re-number selected document indices after some documents were dropped.

    Args:
        llm_indices: Positions in ``search_docs`` that are selected. Values
            outside ``range(len(search_docs))`` are ignored.
        search_docs: The original document sequence; only its length is used.
        dropped_indices: Positions in ``search_docs`` that have been removed.

    Returns:
        The positions of the selected documents within the sequence that
        remains once the dropped positions are taken out, in ascending order.
        Selected documents that were themselves dropped do not appear.
    """
    # Sets give O(1) membership checks instead of scanning the lists once per
    # document.
    selected = set(llm_indices)
    dropped = set(dropped_indices)

    surviving_positions = (
        original for original in range(len(search_docs)) if original not in dropped
    )
    return [
        new_position
        for new_position, original in enumerate(surviving_positions)
        if original in selected
    ]
|
||||
|
||||
|
||||
def inference_section_from_chunks(
    center_chunk: InferenceChunk,
    chunks: list[InferenceChunk],
) -> InferenceSection | None:
    """Build an ``InferenceSection`` from a center chunk and its chunk list.

    The section's ``combined_content`` is the ``content`` of every chunk in
    ``chunks``, in the given order, joined with newlines.

    Returns ``None`` when ``chunks`` is empty.
    """
    if chunks:
        merged_text = "\n".join(piece.content for piece in chunks)
        return InferenceSection(
            center_chunk=center_chunk,
            chunks=chunks,
            combined_content=merged_text,
        )

    return None
|
||||
|
||||
|
||||
def chunks_or_sections_to_search_docs(
    items: Sequence[InferenceChunk | InferenceSection] | None,
) -> list[SearchDoc]:
    """Convert chunks or sections into ``SearchDoc`` objects, one per item.

    For an ``InferenceSection`` the fields are taken from its ``center_chunk``;
    an ``InferenceChunk`` is used directly. A falsy ``semantic_identifier`` is
    replaced by ``"Unknown"``, ``link`` is ``source_links[0]`` when
    ``source_links`` is non-empty and ``None`` otherwise, and ``is_internet``
    is always ``False``.

    Returns an empty list when ``items`` is ``None`` or empty.
    """
    if not items:
        return []

    converted: list[SearchDoc] = []
    for entry in items:
        # Sections are represented by their center chunk.
        chunk = entry.center_chunk if isinstance(entry, InferenceSection) else entry
        converted.append(
            SearchDoc(
                document_id=chunk.document_id,
                chunk_ind=chunk.chunk_id,
                semantic_identifier=chunk.semantic_identifier or "Unknown",
                link=chunk.source_links[0] if chunk.source_links else None,
                blurb=chunk.blurb,
                source_type=chunk.source_type,
                boost=chunk.boost,
                hidden=chunk.hidden,
                metadata=chunk.metadata,
                score=chunk.score,
                match_highlights=chunk.match_highlights,
                updated_at=chunk.updated_at,
                primary_owners=chunk.primary_owners,
                secondary_owners=chunk.secondary_owners,
                is_internet=False,
            )
        )

    return converted
|
Reference in New Issue
Block a user