import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any

from onyx.access.models import DocumentAccess
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.model_server_models import Embedding


@dataclass(frozen=True)
class DocumentInsertionRecord:
    """Outcome of indexing a single document."""

    # Unique document id as specified by the connector
    document_id: str
    # True if the document was already in the index and was updated in place
    already_existed: bool


@dataclass(frozen=True)
class VespaChunkRequest:
    """Request for a contiguous (possibly open-ended) range of chunks of one document."""

    document_id: str
    # None is treated as 0, i.e. start from the first chunk
    min_chunk_ind: int | None = None
    # None means "through the last chunk of the document" (uncapped)
    max_chunk_ind: int | None = None

    @property
    def is_capped(self) -> bool:
        # If the max chunk index is not None, then the chunk request is capped
        # If the min chunk index is None, we can assume the min is 0
        return self.max_chunk_ind is not None

    @property
    def range(self) -> int | None:
        # Number of chunks requested (both bounds inclusive), or None when the
        # request is uncapped and the count cannot be known up front
        if self.max_chunk_ind is not None:
            return (self.max_chunk_ind - (self.min_chunk_ind or 0)) + 1
        return None


@dataclass
class IndexBatchParams:
    """
    Information necessary for efficiently indexing a batch of documents
    """

    # Chunk count per document from the previous indexing run; None if unknown
    doc_id_to_previous_chunk_cnt: dict[str, int | None]
    # Chunk count per document for the batch currently being indexed
    doc_id_to_new_chunk_cnt: dict[str, int]
    tenant_id: str | None
    large_chunks_enabled: bool


@dataclass
class MinimalDocumentIndexingInfo:
    """
    Minimal information necessary for indexing a document
    """

    doc_id: str
    chunk_start_index: int


@dataclass
class EnrichedDocumentIndexingInfo(MinimalDocumentIndexingInfo):
    """
    Enriched information necessary for indexing a document, including
    version and chunk range.
    """

    # Whether the document was indexed under the old schema version
    old_version: bool
    chunk_end_index: int


@dataclass
class DocumentMetadata:
    """
    Document information that needs to be inserted into Postgres on first time
    encountering this document during indexing across any of the connectors.
    """

    connector_id: int
    credential_id: int
    document_id: str
    semantic_identifier: str
    first_link: str
    doc_updated_at: datetime | None = None
    # Emails, not necessarily attached to users
    # Users may not be in Onyx
    primary_owners: list[str] | None = None
    secondary_owners: list[str] | None = None
    from_ingestion_api: bool = False


@dataclass
class VespaDocumentFields:
    """
    Specifies fields in Vespa for a document. Fields set to None will be ignored.
    Perhaps we should name this in an implementation agnostic fashion, but it's
    more understandable like this for now.
    """

    # all other fields except these 4 will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


@dataclass
class UpdateRequest:
    """
    For all documents identified by the indexing info list, update the given
    fields (access, document sets, boost, hidden) to the new values.
    Does not update any of the None fields.
    """

    minimal_document_indexing_info: list[MinimalDocumentIndexingInfo]
    # all other fields except these 4 will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


class Verifiable(abc.ABC):
    """
    Class must implement document index schema verification. For example, verify
    that all of the necessary attributes for indexing, querying, filtering, and
    fields to return from search are all valid in the schema.

    Parameters:
    - index_name: The name of the primary index currently used for querying
    - secondary_index_name: The name of the secondary index being built in the
        background, if it currently exists. Some functions on the document index
        act on both the primary and secondary index, some act on just one.
    """

    @abc.abstractmethod
    def __init__(
        self,
        index_name: str,
        secondary_index_name: str | None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        # Abstract, but provides the shared attribute assignment; concrete
        # subclasses are expected to call super().__init__()
        super().__init__(*args, **kwargs)
        self.index_name = index_name
        self.secondary_index_name = secondary_index_name

    @abc.abstractmethod
    def ensure_indices_exist(
        self,
        index_embedding_dim: int,
        secondary_index_embedding_dim: int | None,
    ) -> None:
        """
        Verify that the document index exists and is consistent with the
        expectations in the code.

        Parameters:
        - index_embedding_dim: Vector dimensionality for the vector similarity
            part of the search
        - secondary_index_embedding_dim: Vector dimensionality of the secondary
            index being built behind the scenes. The secondary index should only
            be built when switching embedding models therefore this dim should be
            different from the primary index.
        """
        raise NotImplementedError

    @staticmethod
    @abc.abstractmethod
    def register_multitenant_indices(
        indices: list[str],
        embedding_dims: list[int],
    ) -> None:
        """
        Register multitenant indices with the document index.
        """
        raise NotImplementedError


class Indexable(abc.ABC):
    """
    Class must implement the ability to index document chunks
    """

    @abc.abstractmethod
    def index(
        self,
        chunks: list[DocMetadataAwareIndexChunk],
        index_batch_params: IndexBatchParams,
    ) -> set[DocumentInsertionRecord]:
        """
        Takes a list of document chunks and indexes them in the document index

        NOTE: When a document is reindexed/updated here, it must clear all of the
        existing document chunks before reindexing. This is because the document
        may have gotten shorter since the last run. Therefore, upserting the
        first 0 through n chunks may leave some old chunks that have not been
        written over.

        NOTE: The chunks of a document are never separated into separate index()
        calls. So there is no worry of receiving the first 0 through n chunks in
        one index call and the next n through m chunks of a docu in the next
        index call.

        NOTE: Due to some asymmetry between the primary and secondary indexing
        logic, this function only needs to index chunks into the PRIMARY index.
        Do not update the secondary index here, it is done automatically outside
        of this code.

        Parameters:
        - chunks: Document chunks with all of the information needed for indexing
            to the document index.
        - index_batch_params: Batch-level indexing information: previous/new chunk
            counts per document, the tenant id of the user whose chunks are being
            indexed, and whether large chunks are enabled

        Returns:
            List of document ids which map to unique documents and are used for
            deduping chunks when updating, as well as if the document is newly
            indexed or already existed and just updated
        """
        raise NotImplementedError


class Deletable(abc.ABC):
    """
    Class must implement the ability to delete document by a given unique document id.
    """

    @abc.abstractmethod
    def delete_single(
        self,
        doc_id: str,
        *,
        tenant_id: str | None,
        chunk_count: int | None,
    ) -> int:
        """
        Given a single document id, hard delete it from the document index

        Parameters:
        - doc_id: document id as specified by the connector
        - tenant_id: tenant owning the document, if multitenancy is enabled
        - chunk_count: number of chunks the document currently has, if known

        Returns:
            int — presumably the number of chunks deleted; confirm against the
            concrete implementation
        """
        raise NotImplementedError


class Updatable(abc.ABC):
    """
    Class must implement the ability to update certain attributes of a document
    without needing to update all of the fields. Specifically, needs to be able
    to update:
    - Access Control List
    - Document-set membership
    - Boost value (learning from feedback mechanism)
    - Whether the document is hidden or not, hidden documents are not returned
      from search
    """

    @abc.abstractmethod
    def update_single(
        self,
        doc_id: str,
        *,
        tenant_id: str | None,
        chunk_count: int | None,
        fields: VespaDocumentFields,
    ) -> int:
        """
        Updates all chunks for a document with the specified fields.
        None values mean that the field does not need an update.

        The rationale for a single update function is that it allows retries and
        parallelism to happen at a higher / more strategic level, is simpler to
        read, and allows us to individually handle error conditions per document.

        Parameters:
        - fields: the fields to update in the document. Any field set to None
            will not be changed.

        Returns:
            int — NOTE(review): the original docstring said "Return: None" but
            the signature declares int; presumably the number of chunks updated.
            Confirm against the concrete implementation.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def update(
        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
    ) -> None:
        """
        Updates some set of chunks. The document and fields to update are
        specified in the update requests. Each update request in the list applies
        its changes to a list of document ids.
        None values mean that the field does not need an update.

        Parameters:
        - update_requests: for a list of document ids in the update request,
            apply the same updates to all of the documents with those ids. This
            is for bulk handling efficiency. Many updates are done at the
            connector level which have many documents for the connector
        """
        raise NotImplementedError


class IdRetrievalCapable(abc.ABC):
    """
    Class must implement the ability to retrieve either:
    - all of the chunks of a document IN ORDER given a document id.
    - a specific chunk given a document id and a chunk index (0 based)
    """

    @abc.abstractmethod
    def id_based_retrieval(
        self,
        chunk_requests: list[VespaChunkRequest],
        filters: IndexFilters,
        batch_retrieval: bool = False,
    ) -> list[InferenceChunkUncleaned]:
        """
        Fetch chunk(s) based on document id

        NOTE: This is used to reconstruct a full document or an extended
        (multi-chunk) section of a document. Downstream currently assumes that
        the chunking does not introduce overlaps between the chunks. If there
        are overlaps for the chunks, then the reconstructed document or extended
        section will have duplicate segments.

        Parameters:
        - chunk_requests: requests containing the document id and the chunk
            range to retrieve
        - filters: Filters to apply to retrieval
        - batch_retrieval: If True, perform a batch retrieval

        Returns:
            list of chunks for the document id or the specific chunk by the
            specified chunk index and document id
        """
        raise NotImplementedError


class HybridCapable(abc.ABC):
    """
    Class must implement hybrid (keyword + vector) search functionality
    """

    @abc.abstractmethod
    def hybrid_retrieval(
        self,
        query: str,
        query_embedding: Embedding,
        final_keywords: list[str] | None,
        filters: IndexFilters,
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.

        NOTE: the query passed in here is the unprocessed plain text query.
        Preprocessing is expected to be handled by this function as it may depend
        on the index implementation. Things like query expansion, synonym
        injection, stop word removal, lemmatization, etc. are done here.

        Parameters:
        - query: unmodified user query. This is needed for getting the matching
            highlighted keywords
        - query_embedding: vector representation of the query, must be of the
            correct dimensionality for the primary index
        - final_keywords: Final keywords to be used from the query, defaults to
            query if not set
        - filters: standard filter object
        - hybrid_alpha: weighting between the keyword and vector search results.
            It is important that the two scores are normalized to the same range
            so that a meaningful comparison can be made. 1 for 100% weighting on
            vector score, 0 for 100% weighting on keyword score.
        - time_decay_multiplier: how much to decay the document scores as they
            age. Some queries based on the persona settings, will have this be a
            2x or 3x of the default
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like
            pagination)

        Returns:
            best matching chunks based on weighted sum of keyword and
            vector/semantic search scores
        """
        raise NotImplementedError


class AdminCapable(abc.ABC):
    """
    Class must implement a search for the admin "Explorer" page. The assumption
    here is that the admin is not "searching" for knowledge but has some document
    already in mind. They are either looking to positively boost it because they
    know it's a good reference document, looking to negatively boost it as a way
    of "deprecating", or hiding the document.

    Assuming the admin knows the document name, this search has high emphasis on
    the title match.

    Suggested implementation:
    Keyword only, BM25 search with 5x weighting on the title field compared to
    the contents
    """

    @abc.abstractmethod
    def admin_retrieval(
        self,
        query: str,
        filters: IndexFilters,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run the special search for the admin document explorer page

        Parameters:
        - query: unmodified user query. Though in this flow probably unmodified
            is best
        - filters: standard filter object
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like
            pagination)

        Returns:
            list of best matching chunks for the explorer page query
        """
        raise NotImplementedError


class RandomCapable(abc.ABC):
    """Class must implement random document retrieval capability"""

    @abc.abstractmethod
    def random_retrieval(
        self,
        filters: IndexFilters,
        num_to_retrieve: int = 10,
    ) -> list[InferenceChunkUncleaned]:
        """Retrieve random chunks matching the filters"""
        raise NotImplementedError


class BaseIndex(
    Verifiable,
    Indexable,
    Updatable,
    Deletable,
    AdminCapable,
    IdRetrievalCapable,
    RandomCapable,
    abc.ABC,
):
    """
    All basic document index functionalities excluding the actual querying
    approach.

    As a summary, document indices need to be able to
    - Verify the schema definition is valid
    - Index new documents
    - Update specific attributes of existing documents
    - Delete documents
    - Provide a search for the admin document explorer page
    - Retrieve documents based on document id
    """


class DocumentIndex(HybridCapable, BaseIndex, abc.ABC):
    """
    A valid document index that can plug into all Onyx flows must implement all
    of these functionalities, though "technically" it does not need to be
    keyword or vector capable as currently all default search flows use Hybrid
    Search.
    """