diff --git a/backend/danswer/document_index/interfaces.py b/backend/danswer/document_index/interfaces.py
index e528504aaec1..787ee3889ab1 100644
--- a/backend/danswer/document_index/interfaces.py
+++ b/backend/danswer/document_index/interfaces.py
@@ -17,6 +17,11 @@ class DocumentInsertionRecord:
 
 @dataclass
 class DocumentMetadata:
+    """
+    Document information that needs to be inserted into Postgres the first time the document is
+    encountered during indexing, across any of the connectors.
+    """
+
     connector_id: int
     credential_id: int
     document_id: str
@@ -32,11 +37,13 @@ class DocumentMetadata:
 
 @dataclass
 class UpdateRequest:
-    """For all document_ids, update the allowed_users and the boost to the new value
-    ignore if None"""
+    """
+    For all document_ids, update the attached fields to the new values.
+    Fields left as None are not updated.
+    """
 
     document_ids: list[str]
-    # all other fields will be left alone
+    # all fields other than these four will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
@@ -44,6 +51,18 @@ class UpdateRequest:
 
 
 class Verifiable(abc.ABC):
+    """
+    Class must implement document index schema verification. For example, verify that all of the
+    attributes necessary for indexing, querying, filtering, and returning fields from search are
+    valid in the schema.
+
+    Parameters:
+    - index_name: The name of the primary index currently used for querying
+    - secondary_index_name: The name of the secondary index being built in the background, if it
+            currently exists. Some functions on the document index act on both the primary and
+            secondary index, while others act on just one.
+    """
+
     @abc.abstractmethod
     def __init__(
         self,
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
         index_embedding_dim: int,
         secondary_index_embedding_dim: int | None,
     ) -> None:
+        """
+        Verify that the document index exists and is consistent with the expectations in the code.
+
+        Parameters:
+        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
+        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
+                behind the scenes. The secondary index should only be built when switching
+                embedding models, so this dimensionality should differ from the primary index's.
+        """
         raise NotImplementedError
 
 
 class Indexable(abc.ABC):
+    """
+    Class must implement the ability to index document chunks
+    """
+
     @abc.abstractmethod
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
     ) -> set[DocumentInsertionRecord]:
-        """Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
+        """
+        Takes a list of document chunks and indexes them in the document index
+
+        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
+        chunks before reindexing. This is because the document may have gotten shorter since the
+        last run, so upserting chunks 0 through n may leave behind old chunks that are never
+        written over.
+
+        NOTE: The chunks of a document are never split across separate index() calls, so there is
+        no concern about receiving chunks 0 through n of a document in one index call and chunks n
+        through m of the document in the next call.
+
+        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
+        only needs to index chunks into the PRIMARY index. Do not update the secondary index here;
+        that is done automatically outside of this code.
+
+        Parameters:
+        - chunks: Document chunks with all of the information needed for indexing to the document
+                index.
+
+        Returns:
+            A set of insertion records, one per unique document, used for deduping chunks when
+            updating and for reporting whether each document was newly indexed or already existed
+            and was just updated.
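+
+        Example (an illustrative sketch only, not part of the interface contract; assumes
+        DocumentInsertionRecord exposes document_id and already_existed fields):
+            records = document_index.index(chunks=doc_chunks)
+            newly_indexed_ids = {r.document_id for r in records if not r.already_existed}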
+        """
         raise NotImplementedError
 
 
 class Deletable(abc.ABC):
+    """
+    Class must implement the ability to delete documents by their unique document ids.
+    """
+
     @abc.abstractmethod
     def delete(self, doc_ids: list[str]) -> None:
-        """Removes the specified documents from the Index"""
+        """
+        Given a list of document ids, hard delete them from the document index
+
+        Parameters:
+        - doc_ids: list of document ids as specified by the connector
+        """
         raise NotImplementedError
 
 
 class Updatable(abc.ABC):
+    """
+    Class must implement the ability to update certain attributes of a document without needing to
+    update all of the fields. Specifically, it needs to be able to update:
+    - Access Control List
+    - Document-set membership
+    - Boost value (learning from feedback mechanism)
+    - Whether the document is hidden or not (hidden documents are not returned from search)
+    """
+
     @abc.abstractmethod
     def update(self, update_requests: list[UpdateRequest]) -> None:
-        """Updates metadata for the specified documents sets in the Index"""
+        """
+        Updates some set of chunks. The documents and fields to update are specified in the update
+        requests. Each update request in the list applies its changes to a list of document ids.
+        None values mean that the field does not need an update.
+
+        Parameters:
+        - update_requests: for the list of document ids in each update request, apply the same
+                updates to all of the documents with those ids. This batching exists for
+                efficiency, since many updates are done at the connector level and a connector may
+                have a large number of documents.
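+
+        Example (an illustrative sketch only, boosting every document of a connector in one bulk
+        request):
+            request = UpdateRequest(document_ids=connector_doc_ids, boost=1.5)
+            document_index.update(update_requests=[request])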
+        """
         raise NotImplementedError
 
 
 class IdRetrievalCapable(abc.ABC):
+    """
+    Class must implement the ability to retrieve either:
+    - all of the chunks of a document IN ORDER given a document id.
+    - a specific chunk given a document id and a chunk index (0 based)
+    """
+
     @abc.abstractmethod
     def id_based_retrieval(
         self,
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
         chunk_ind: int | None,
         filters: IndexFilters,
     ) -> list[InferenceChunk]:
+        """
+        Fetch chunk(s) based on document id
+
+        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
+        of a document. Downstream code currently assumes that the chunking does not introduce
+        overlaps between the chunks. If the chunks do overlap, the reconstructed document or
+        extended section will have duplicate segments.
+
+        Parameters:
+        - document_id: document id for which to retrieve the chunk(s)
+        - chunk_ind: chunk index to return; if None, return all of the chunks in order
+        - filters: standard filters object; in this case only the access filter is applied as a
+                permission check
+
+        Returns:
+            list of chunks for the document id, or the specific chunk specified by the chunk
+            index and document id
+        """
         raise NotImplementedError
 
 
 class KeywordCapable(abc.ABC):
+    """
+    Class must implement the keyword search functionality
+    """
+
     @abc.abstractmethod
     def keyword_retrieval(
         self,
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
         query: str,
         filters: IndexFilters,
         time_decay_multiplier: float,
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run keyword search and return a list of chunks. Inference chunks are chunks with all of
+        the information required for query-time purposes. For example, some details of the
+        document needed at indexing time are no longer needed past this point, while the matching
+        keywords now need to be highlighted.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function, as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Based on the
+                persona settings, some queries will have this at 2x or 3x the default.
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (used for pagination)
+
+        Returns:
+            best matching chunks based on keyword matching (ideally the BM25 algorithm)
+        """
         raise NotImplementedError
 
 
 class VectorCapable(abc.ABC):
+    """
+    Class must implement the vector/semantic search functionality
+    """
+
     @abc.abstractmethod
     def semantic_retrieval(
         self,
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
         query: str,
         query_embedding: list[float],
         filters: IndexFilters,
         time_decay_multiplier: float,
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run vector/semantic search and return a list of inference chunks.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords.
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Based on the
+                persona settings, some queries will have this at 2x or 3x the default.
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (used for pagination)
+
+        Returns:
+            best matching chunks based on vector similarity
+        """
         raise NotImplementedError
 
 
 class HybridCapable(abc.ABC):
+    """
+    Class must implement hybrid (keyword + vector) search functionality
+    """
+
     @abc.abstractmethod
     def hybrid_retrieval(
         self,
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
         offset: int = 0,
         hybrid_alpha: float | None = None,
     ) -> list[InferenceChunk]:
+        """
+        Run hybrid search and return a list of inference chunks.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function, as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords.
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Based on the
+                persona settings, some queries will have this at 2x or 3x the default.
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (used for pagination)
+        - hybrid_alpha: weighting between the keyword and vector search results. It is important
+                that the two scores are normalized to the same range so that a meaningful
+                comparison can be made: 1 means 100% weighting on the vector score, 0 means 100%
+                weighting on the keyword score.
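+
+        Example (an illustrative sketch only; the exact combination is implementation-specific):
+            # assumes both scores are already normalized to the same range, e.g. [0, 1]
+            final_score = hybrid_alpha * vector_score + (1 - hybrid_alpha) * keyword_score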
+
+        Returns:
+            best matching chunks based on the weighted sum of keyword and vector/semantic search
+            scores
+        """
         raise NotImplementedError
 
 
 class AdminCapable(abc.ABC):
+    """
+    Class must implement a search for the admin "Explorer" page. The assumption here is that the
+    admin is not "searching" for knowledge but already has some document in mind. They are either
+    looking to positively boost it because they know it's a good reference document, looking to
+    negatively boost it as a way of "deprecating" it, or looking to hide the document.
+
+    Assuming the admin knows the document name, this search places high emphasis on the title
+    match.
+
+    Suggested implementation:
+        Keyword only, BM25 search with 5x weighting on the title field compared to the contents
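+
+    As an illustrative sketch only, that suggestion amounts to a ranking expression roughly like:
+        score = 5 * bm25(title) + 1 * bm25(contents)
+    with no vector component.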
+    """
+
     @abc.abstractmethod
     def admin_retrieval(
         self,
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
         query: str,
         filters: IndexFilters,
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run the special search for the admin document explorer page
+
+        Parameters:
+        - query: user query. In this flow, leaving the query unmodified is probably best.
+        - filters: standard filter object
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (used for pagination)
+
+        Returns:
+            list of best matching chunks for the explorer page query
+        """
         raise NotImplementedError
 
 
 class BaseIndex(
     Verifiable,
-    AdminCapable,
-    IdRetrievalCapable,
     Indexable,
     Updatable,
     Deletable,
+    AdminCapable,
+    IdRetrievalCapable,
     abc.ABC,
 ):
-    """All basic functionalities excluding a specific retrieval approach
-    Indices need to be able to
-    - Check that the index exists with a schema definition
-    - Can index documents
-    - Can delete documents
-    - Can update document metadata (such as access permissions and document specific boost)
+    """
+    All basic document index functionalities excluding the actual querying approach.
+
+    As a summary, document indices need to be able to:
+    - Verify that the schema definition is valid
+    - Index new documents
+    - Update specific attributes of existing documents
+    - Delete documents
+    - Provide a search for the admin document explorer page
+    - Retrieve documents based on document id
     """
 
 
-class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
-    pass
-
-
-class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
-    pass
-
-
 class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
-    pass
+    """
+    A valid document index that can plug into all Danswer flows must implement all of these
+    functionalities, though "technically" it does not need to be keyword or vector capable, as
+    currently all default search flows use Hybrid Search.
+    """
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index 4d58bee22ca4..178aadf3eea9 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
 
-        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficient for
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
         # indexing / updates / deletes since we have to make a large volume of requests.
 
         with httpx.Client(http2=True) as http_client:
             index_names = [self.index_name]
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
             for vespa_chunk_id in vespa_chunk_ids
         ]
 
-        logger.debug(
-            "Running LLM usefulness eval in parallel (following logging may be out of order)"
-        )
         inference_chunks = run_functions_tuples_in_parallel(
             functions_with_args, allow_failures=True
         )