import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any

from onyx.access.models import DocumentAccess
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.model_server_models import Embedding


@dataclass(frozen=True)
class DocumentInsertionRecord:
    document_id: str
    already_existed: bool


@dataclass(frozen=True)
class VespaChunkRequest:
    document_id: str
    min_chunk_ind: int | None = None
    max_chunk_ind: int | None = None

    @property
    def is_capped(self) -> bool:
        # If the max chunk index is not None, then the chunk request is capped
        # If the min chunk index is None, we can assume the min is 0
        return self.max_chunk_ind is not None

    @property
    def range(self) -> int | None:
        if self.max_chunk_ind is not None:
            return (self.max_chunk_ind - (self.min_chunk_ind or 0)) + 1
        return None


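# Illustrative sketch (not part of the interface): how VespaChunkRequest's
# is_capped and range properties behave. The document id below is hypothetical.
def _example_chunk_request_usage() -> None:
    # No max index: uncapped, i.e. "fetch all chunks of the document"
    full_doc = VespaChunkRequest(document_id="example-doc")
    assert not full_doc.is_capped
    assert full_doc.range is None

    # Both bounds set: a capped request covering chunks 2 through 5, inclusive
    section = VespaChunkRequest(
        document_id="example-doc", min_chunk_ind=2, max_chunk_ind=5
    )
    assert section.is_capped
    assert section.range == 4

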
@dataclass
class IndexBatchParams:
    """
    Information necessary for efficiently indexing a batch of documents
    """

    doc_id_to_previous_chunk_cnt: dict[str, int | None]
    doc_id_to_new_chunk_cnt: dict[str, int]
    tenant_id: str | None
    large_chunks_enabled: bool


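# Illustrative sketch: batch params for two hypothetical documents. Reading the
# type hints, a previous chunk count of None would indicate the prior count is
# not known to the caller; this is an assumption, not documented behavior.
def _example_index_batch_params() -> IndexBatchParams:
    return IndexBatchParams(
        doc_id_to_previous_chunk_cnt={"doc-a": 12, "doc-b": None},
        doc_id_to_new_chunk_cnt={"doc-a": 9, "doc-b": 4},
        tenant_id=None,
        large_chunks_enabled=False,
    )

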
@dataclass
class MinimalDocumentIndexingInfo:
    """
    Minimal information necessary for indexing a document
    """

    doc_id: str
    chunk_start_index: int


@dataclass
class EnrichedDocumentIndexingInfo(MinimalDocumentIndexingInfo):
    """
    Enriched information necessary for indexing a document, including version and chunk range.
    """

    old_version: bool
    chunk_end_index: int


@dataclass
class DocumentMetadata:
    """
    Document information that needs to be inserted into Postgres on first encountering this
    document during indexing across any of the connectors.
    """

    connector_id: int
    credential_id: int
    document_id: str
    semantic_identifier: str
    first_link: str
    doc_updated_at: datetime | None = None
    # Emails, not necessarily attached to users
    # Users may not be in Onyx
    primary_owners: list[str] | None = None
    secondary_owners: list[str] | None = None
    from_ingestion_api: bool = False


@dataclass
class VespaDocumentFields:
    """
    Specifies fields in Vespa for a document. Fields set to None will be ignored.
    Perhaps we should name this in an implementation-agnostic fashion, but it's more
    understandable like this for now.
    """

    # All fields except these four will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


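# Illustrative sketch: a partial update that only adjusts the boost. Leaving the
# other fields as None means they are left untouched by the update request. The
# boost value here is hypothetical.
def _example_boost_only_update() -> VespaDocumentFields:
    return VespaDocumentFields(boost=2.0)

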
@dataclass
class UpdateRequest:
    """
    For all documents specified by the indexing info, update the given fields to the new values.
    Does not update any of the fields that are set to None.
    """

    minimal_document_indexing_info: list[MinimalDocumentIndexingInfo]
    # All fields except these four will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


class Verifiable(abc.ABC):
    """
    Class must implement document index schema verification. For example, verify that all of the
    necessary attributes for indexing, querying, filtering, and fields to return from search
    are valid in the schema.

    Parameters:
    - index_name: The name of the primary index currently used for querying
    - secondary_index_name: The name of the secondary index being built in the background, if it
            currently exists. Some functions on the document index act on both the primary and
            secondary index, some act on just one.
    """

    @abc.abstractmethod
    def __init__(
        self,
        index_name: str,
        secondary_index_name: str | None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.index_name = index_name
        self.secondary_index_name = secondary_index_name

    @abc.abstractmethod
    def ensure_indices_exist(
        self,
        index_embedding_dim: int,
        secondary_index_embedding_dim: int | None,
    ) -> None:
        """
        Verify that the document index exists and is consistent with the expectations in the code.

        Parameters:
        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
                behind the scenes. The secondary index should only be built when switching
                embedding models; therefore, this dim should differ from the primary index's.
        """
        raise NotImplementedError

    @staticmethod
    @abc.abstractmethod
    def register_multitenant_indices(
        indices: list[str],
        embedding_dims: list[int],
    ) -> None:
        """
        Register multitenant indices with the document index.
        """
        raise NotImplementedError


class Indexable(abc.ABC):
    """
    Class must implement the ability to index document chunks
    """

    @abc.abstractmethod
    def index(
        self,
        chunks: list[DocMetadataAwareIndexChunk],
        index_batch_params: IndexBatchParams,
    ) -> set[DocumentInsertionRecord]:
        """
        Takes a list of document chunks and indexes them in the document index

        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
        chunks before reindexing. This is because the document may have gotten shorter since the
        last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
        have not been written over.

        NOTE: The chunks of a document are never separated into separate index() calls. So there is
        no worry of receiving the first 0 through n chunks in one index call and the next n through
        m chunks of a document in the next index call.

        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
        only needs to index chunks into the PRIMARY index. Do not update the secondary index here;
        it is done automatically outside of this code.

        Parameters:
        - chunks: Document chunks with all of the information needed for indexing to the document
                index.
        - index_batch_params: Batch-level information needed for indexing: the previous and new
                chunk counts per document, the tenant id, and whether large chunks are enabled.

        Returns:
            A set of insertion records, one per unique document, indicating whether each document
            was newly indexed or already existed and was just updated. These are used for deduping
            chunks when updating.
        """
        raise NotImplementedError


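# Illustrative sketch of the stale-chunk concern in the first NOTE above: if a
# document shrank from 12 chunks to 9, upserting chunks 0..8 leaves chunks 9..11
# behind unless they are explicitly cleared. This helper is hypothetical and not
# part of the interface; it just computes which old chunk indices would survive.
def _example_stale_chunk_range(doc_id: str, params: IndexBatchParams) -> range:
    previous_cnt = params.doc_id_to_previous_chunk_cnt.get(doc_id) or 0
    new_cnt = params.doc_id_to_new_chunk_cnt.get(doc_id, 0)
    # Chunk indices that existed before but are not overwritten by this batch
    return range(new_cnt, previous_cnt)

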
class Deletable(abc.ABC):
    """
    Class must implement the ability to delete a document by a given unique document id.
    """

    @abc.abstractmethod
    def delete_single(
        self,
        doc_id: str,
        *,
        tenant_id: str | None,
        chunk_count: int | None,
    ) -> int:
        """
        Given a single document id, hard delete it from the document index

        Parameters:
        - doc_id: document id as specified by the connector
        """
        raise NotImplementedError


class Updatable(abc.ABC):
    """
    Class must implement the ability to update certain attributes of a document without needing to
    update all of the fields. Specifically, needs to be able to update:
    - Access Control List
    - Document-set membership
    - Boost value (learning from feedback mechanism)
    - Whether the document is hidden or not; hidden documents are not returned from search
    """

    @abc.abstractmethod
    def update_single(
        self,
        doc_id: str,
        *,
        tenant_id: str | None,
        chunk_count: int | None,
        fields: VespaDocumentFields,
    ) -> int:
        """
        Updates all chunks for a document with the specified fields.
        None values mean that the field does not need an update.

        The rationale for a single update function is that it allows retries and parallelism
        to happen at a higher / more strategic level, is simpler to read, and allows
        us to individually handle error conditions per document.

        Parameters:
        - fields: the fields to update in the document. Any field set to None will not be changed.

        Returns:
            the number of chunks updated
        """
        raise NotImplementedError

    @abc.abstractmethod
    def update(
        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
    ) -> None:
        """
        Updates some set of chunks. The documents and fields to update are specified in the update
        requests. Each update request in the list applies its changes to a list of document ids.
        None values mean that the field does not need an update.

        Parameters:
        - update_requests: for a list of document ids in the update request, apply the same updates
                to all of the documents with those ids. This is for bulk handling efficiency; many
                updates are done at the connector level, which can cover many documents.
        """
        raise NotImplementedError


class IdRetrievalCapable(abc.ABC):
    """
    Class must implement the ability to retrieve either:
    - all of the chunks of a document IN ORDER given a document id.
    - a specific chunk given a document id and a chunk index (0 based)
    """

    @abc.abstractmethod
    def id_based_retrieval(
        self,
        chunk_requests: list[VespaChunkRequest],
        filters: IndexFilters,
        batch_retrieval: bool = False,
    ) -> list[InferenceChunkUncleaned]:
        """
        Fetch chunk(s) based on document id

        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
        of a document. Downstream currently assumes that the chunking does not introduce overlaps
        between the chunks. If there are overlaps for the chunks, then the reconstructed document
        or extended section will have duplicate segments.

        Parameters:
        - chunk_requests: requests containing the document id and the chunk range to retrieve
        - filters: Filters to apply to retrieval
        - batch_retrieval: If True, perform a batch retrieval

        Returns:
            list of chunks for the document id or the specific chunk by the specified chunk index
            and document id
        """
        raise NotImplementedError


class HybridCapable(abc.ABC):
    """
    Class must implement hybrid (keyword + vector) search functionality
    """

    @abc.abstractmethod
    def hybrid_retrieval(
        self,
        query: str,
        query_embedding: Embedding,
        final_keywords: list[str] | None,
        filters: IndexFilters,
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.

        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
        expected to be handled by this function as it may depend on the index implementation.
        Things like query expansion, synonym injection, stop word removal, lemmatization, etc.
        are done here.

        Parameters:
        - query: unmodified user query. This is needed for getting the matching highlighted
                keywords
        - query_embedding: vector representation of the query, must be of the correct
                dimensionality for the primary index
        - final_keywords: Final keywords to be used from the query, defaults to query if not set
        - filters: standard filter object
        - hybrid_alpha: weighting between the keyword and vector search results. It is important
                that the two scores are normalized to the same range so that a meaningful
                comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
                on keyword score.
        - time_decay_multiplier: how much to decay the document scores as they age. Some queries,
                based on the persona settings, will have this be 2x or 3x the default
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like pagination)

        Returns:
            best matching chunks based on weighted sum of keyword and vector/semantic search scores
        """
        raise NotImplementedError


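# Illustrative sketch of the hybrid_alpha weighting described in the docstring
# above, assuming both scores are already normalized to the same range. This is
# a reading of the docstring, not the actual scoring implementation.
def _example_hybrid_score(
    vector_score: float, keyword_score: float, hybrid_alpha: float
) -> float:
    # hybrid_alpha=1.0 -> pure vector score; hybrid_alpha=0.0 -> pure keyword score
    return hybrid_alpha * vector_score + (1 - hybrid_alpha) * keyword_score

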
class AdminCapable(abc.ABC):
    """
    Class must implement a search for the admin "Explorer" page. The assumption here is that the
    admin is not "searching" for knowledge but has some document already in mind. They are either
    looking to positively boost it because they know it's a good reference document, looking to
    negatively boost it as a way of "deprecating" it, or looking to hide the document.

    Assuming the admin knows the document name, this search has a high emphasis on the title match.

    Suggested implementation:
        Keyword only, BM25 search with 5x weighting on the title field compared to the contents
    """

    @abc.abstractmethod
    def admin_retrieval(
        self,
        query: str,
        filters: IndexFilters,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run the special search for the admin document explorer page

        Parameters:
        - query: unmodified user query; in this flow, leaving it unmodified is probably best
        - filters: standard filter object
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like pagination)

        Returns:
            list of best matching chunks for the explorer page query
        """
        raise NotImplementedError


class RandomCapable(abc.ABC):
    """Class must implement random document retrieval capability"""

    @abc.abstractmethod
    def random_retrieval(
        self,
        filters: IndexFilters,
        num_to_retrieve: int = 10,
    ) -> list[InferenceChunkUncleaned]:
        """Retrieve random chunks matching the filters"""
        raise NotImplementedError


class BaseIndex(
    Verifiable,
    Indexable,
    Updatable,
    Deletable,
    AdminCapable,
    IdRetrievalCapable,
    RandomCapable,
    abc.ABC,
):
    """
    All basic document index functionalities excluding the actual querying approach.

    As a summary, document indices need to be able to:
    - Verify the schema definition is valid
    - Index new documents
    - Update specific attributes of existing documents
    - Delete documents
    - Provide a search for the admin document explorer page
    - Retrieve documents based on document id
    """


class DocumentIndex(HybridCapable, BaseIndex, abc.ABC):
    """
    A valid document index that can plug into all Onyx flows must implement all of these
    functionalities, though "technically" it does not need to be keyword or vector capable, as
    currently all default search flows use Hybrid Search.
    """