from typing import TYPE_CHECKING from pydantic import BaseModel from pydantic import Field from onyx.access.models import DocumentAccess from onyx.connectors.models import Document from onyx.db.enums import EmbeddingPrecision from onyx.utils.logger import setup_logger from shared_configs.enums import EmbeddingProvider from shared_configs.model_server_models import Embedding if TYPE_CHECKING: from onyx.db.models import SearchSettings logger = setup_logger() class ChunkEmbedding(BaseModel): full_embedding: Embedding mini_chunk_embeddings: list[Embedding] class BaseChunk(BaseModel): chunk_id: int # The first sentence(s) of the first Section of the chunk blurb: str content: str # Holds the link and the offsets into the raw Chunk text source_links: dict[int, str] | None image_file_name: str | None # True if this Chunk's start is not at the start of a Section section_continuation: bool class DocAwareChunk(BaseChunk): # During indexing flow, we have access to a complete "Document" # During inference we only have access to the document id and do not reconstruct the Document source_document: Document # This could be an empty string if the title is too long and taking up too much of the chunk # This does not mean necessarily that the document does not have a title title_prefix: str # During indexing we also (optionally) build a metadata string from the metadata dict # This is also indexed so that we can strip it out after indexing, this way it supports # multiple iterations of metadata representation for backwards compatibility metadata_suffix_semantic: str metadata_suffix_keyword: str mini_chunk_texts: list[str] | None large_chunk_id: int | None large_chunk_reference_ids: list[int] = Field(default_factory=list) def to_short_descriptor(self) -> str: """Used when logging the identity of a chunk""" return f"{self.source_document.to_short_descriptor()} Chunk ID: {self.chunk_id}" def get_link(self) -> str | None: return ( self.source_document.sections[0].link if self.source_document.sections else None ) class IndexChunk(DocAwareChunk): embeddings: ChunkEmbedding title_embedding: Embedding | None # TODO(rkuo): currently, this extra metadata sent during indexing is just for speed, # but full consistency happens on background sync class DocMetadataAwareIndexChunk(IndexChunk): """An `IndexChunk` that contains all necessary metadata to be indexed. This includes the following: access: holds all information about which users should have access to the source document for this chunk. document_sets: all document sets the source document for this chunk is a part of. This is used for filtering / personas. boost: influences the ranking of this chunk at query time. Positive -> ranked higher, negative -> ranked lower. """ tenant_id: str access: "DocumentAccess" document_sets: set[str] boost: int @classmethod def from_index_chunk( cls, index_chunk: IndexChunk, access: "DocumentAccess", document_sets: set[str], boost: int, tenant_id: str, ) -> "DocMetadataAwareIndexChunk": index_chunk_data = index_chunk.model_dump() return cls( **index_chunk_data, access=access, document_sets=document_sets, boost=boost, tenant_id=tenant_id, ) class EmbeddingModelDetail(BaseModel): id: int | None = None model_name: str normalize: bool query_prefix: str | None passage_prefix: str | None api_url: str | None = None provider_type: EmbeddingProvider | None = None api_key: str | None = None # This disables the "model_" protected namespace for pydantic model_config = {"protected_namespaces": ()} @classmethod def from_db_model( cls, search_settings: "SearchSettings", ) -> "EmbeddingModelDetail": return cls( id=search_settings.id, model_name=search_settings.model_name, normalize=search_settings.normalize, query_prefix=search_settings.query_prefix, passage_prefix=search_settings.passage_prefix, provider_type=search_settings.provider_type, api_key=search_settings.api_key, api_url=search_settings.api_url, ) # Additional info needed for indexing time class IndexingSetting(EmbeddingModelDetail): model_dim: int index_name: str | None multipass_indexing: bool embedding_precision: EmbeddingPrecision reduced_dimension: int | None = None background_reindex_enabled: bool = True # This disables the "model_" protected namespace for pydantic model_config = {"protected_namespaces": ()} @property def final_embedding_dim(self) -> int: if self.reduced_dimension: return self.reduced_dimension return self.model_dim @classmethod def from_db_model(cls, search_settings: "SearchSettings") -> "IndexingSetting": return cls( model_name=search_settings.model_name, model_dim=search_settings.model_dim, normalize=search_settings.normalize, query_prefix=search_settings.query_prefix, passage_prefix=search_settings.passage_prefix, provider_type=search_settings.provider_type, index_name=search_settings.index_name, multipass_indexing=search_settings.multipass_indexing, embedding_precision=search_settings.embedding_precision, reduced_dimension=search_settings.reduced_dimension, background_reindex_enabled=search_settings.background_reindex_enabled, ) class MultipassConfig(BaseModel): multipass_indexing: bool enable_large_chunks: bool