from collections.abc import Sequence
from typing import TypeVar

from onyx.chat.models import SectionRelevancePiece
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SavedSearchDocWithContent
from onyx.context.search.models import SearchDoc
from onyx.db.models import SearchDoc as DBSearchDoc


# Any retrieval result type that carries a document id, either directly or
# (for InferenceSection) via its center chunk.
T = TypeVar(
    "T",
    InferenceSection,
    InferenceChunk,
    SearchDoc,
    SavedSearchDoc,
    SavedSearchDocWithContent,
)

# Section-level result types only (no bare chunks).
TSection = TypeVar(
    "TSection",
    InferenceSection,
    SearchDoc,
    SavedSearchDoc,
    SavedSearchDocWithContent,
)


def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
    """Deduplicate items by document id, keeping the first occurrence.

    Returns the deduplicated list plus the indices (into the original list)
    of the items that were dropped.
    """
    seen_ids = set()
    deduped_items = []
    dropped_indices = []
    for index, item in enumerate(items):
        if isinstance(item, InferenceSection):
            document_id = item.center_chunk.document_id
        else:
            document_id = item.document_id

        if document_id not in seen_ids:
            seen_ids.add(document_id)
            deduped_items.append(item)
        else:
            dropped_indices.append(index)
    return deduped_items, dropped_indices


def relevant_sections_to_indices(
    relevance_sections: list[SectionRelevancePiece] | None, items: list[TSection]
) -> list[int]:
    """Map relevance results back to indices into `items`.

    Each section is identified by its (document_id, chunk id) pair; an item's
    index is kept if its pair was marked relevant.
    """
    if not relevance_sections:
        return []

    relevant_set = {
        (piece.document_id, piece.chunk_id)
        for piece in relevance_sections
        if piece.relevant
    }

    return [
        index
        for index, item in enumerate(items)
        if (
            (
                isinstance(item, InferenceSection)
                and (item.center_chunk.document_id, item.center_chunk.chunk_id)
                in relevant_set
            )
            or (
                not isinstance(item, InferenceSection)
                and (item.document_id, item.chunk_ind) in relevant_set
            )
        )
    ]


def drop_llm_indices(
    llm_indices: list[int],
    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
    dropped_indices: list[int],
) -> list[int]:
    """Re-map LLM-selected indices after deduplication has removed docs.

    E.g. with 4 docs, llm_indices=[0, 2, 3], and dropped_indices=[1], the
    surviving docs are the original 0, 2, and 3, so the result is [0, 1, 2].
    """
    llm_bools = [i in llm_indices for i in range(len(search_docs))]
    if dropped_indices:
        llm_bools = [
            val for ind, val in enumerate(llm_bools) if ind not in dropped_indices
        ]
    return [i for i, val in enumerate(llm_bools) if val]


def inference_section_from_chunks(
    center_chunk: InferenceChunk,
    chunks: list[InferenceChunk],
) -> InferenceSection | None:
    """Build an InferenceSection whose combined content joins all chunks."""
    if not chunks:
        return None

    combined_content = "\n".join([chunk.content for chunk in chunks])

    return InferenceSection(
        center_chunk=center_chunk,
        chunks=chunks,
        combined_content=combined_content,
    )


def chunks_or_sections_to_search_docs(
    items: Sequence[InferenceChunk | InferenceSection] | None,
) -> list[SearchDoc]:
    """Convert inference chunks/sections into SearchDoc objects.

    For sections, the center chunk supplies all of the fields.
    """
    if not items:
        return []

    search_docs = [
        SearchDoc(
            # The walrus expression binds `chunk` to the item's center chunk
            # (for sections) or to the item itself (for bare chunks); the
            # binding is then reused by every field below.
            document_id=(
                chunk := (
                    item.center_chunk
                    if isinstance(item, InferenceSection)
                    else item
                )
            ).document_id,
            chunk_ind=chunk.chunk_id,
            semantic_identifier=chunk.semantic_identifier or "Unknown",
            link=chunk.source_links[0] if chunk.source_links else None,
            blurb=chunk.blurb,
            source_type=chunk.source_type,
            boost=chunk.boost,
            hidden=chunk.hidden,
            metadata=chunk.metadata,
            score=chunk.score,
            match_highlights=chunk.match_highlights,
            updated_at=chunk.updated_at,
            primary_owners=chunk.primary_owners,
            secondary_owners=chunk.secondary_owners,
            is_internet=False,
        )
        for item in items
    ]

    return search_docs
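

if __name__ == "__main__":
    # Illustrative self-check, not part of the original module: it exercises
    # the pure-bookkeeping helpers above using SimpleNamespace stand-ins in
    # place of real onyx model instances (hence the type: ignore comments).
    # Runnable assuming the onyx package is importable so the module-level
    # imports succeed.
    from types import SimpleNamespace

    # dedupe_documents keeps the first occurrence of each document id.
    chunk_like = [SimpleNamespace(document_id=d) for d in ("a", "b", "a")]
    deduped, dropped = dedupe_documents(chunk_like)  # type: ignore[arg-type]
    assert [c.document_id for c in deduped] == ["a", "b"]
    assert dropped == [2]

    # relevant_sections_to_indices keeps only items whose
    # (document_id, chunk id) pair was marked relevant.
    pieces = [
        SimpleNamespace(document_id="a", chunk_id=0, relevant=True),
        SimpleNamespace(document_id="b", chunk_id=0, relevant=False),
    ]
    items = [
        SimpleNamespace(document_id="a", chunk_ind=0),
        SimpleNamespace(document_id="b", chunk_ind=0),
    ]
    assert relevant_sections_to_indices(pieces, items) == [0]  # type: ignore[arg-type]

    # drop_llm_indices re-maps selections after a dedupe: with doc 1 dropped,
    # selections [0, 2, 3] over 4 docs become [0, 1, 2] over the 3 survivors.
    assert drop_llm_indices(
        llm_indices=[0, 2, 3],
        search_docs=[None] * 4,  # type: ignore[list-item]  # only len() is used
        dropped_indices=[1],
    ) == [0, 1, 2]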