mirror of https://github.com/danswer-ai/danswer.git
Better description of the document index interfaces (#1188)
@@ -17,6 +17,11 @@ class DocumentInsertionRecord:
 
 @dataclass
 class DocumentMetadata:
+    """
+    Document information that needs to be inserted into Postgres on first time encountering this
+    document during indexing across any of the connectors.
+    """
+
     connector_id: int
     credential_id: int
     document_id: str
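For orientation, a minimal sketch of constructing this record during a connector run; the import path and the field values are assumptions for illustration, not taken from the diff:

from danswer.document_index.interfaces import DocumentMetadata  # assumed module path

# Hypothetical values: first time this document is seen while indexing
# connector 1 under credential 2; the id is whatever the connector assigns.
doc_metadata = DocumentMetadata(
    connector_id=1,
    credential_id=2,
    document_id="web__https://example.com/docs/intro",
)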
@@ -32,11 +37,13 @@ class DocumentMetadata:
 
 @dataclass
 class UpdateRequest:
-    """For all document_ids, update the allowed_users and the boost to the new value
-    ignore if None"""
+    """
+    For all document_ids, update the allowed_users and the boost to the new values
+    Does not update any of the None fields
+    """
 
     document_ids: list[str]
-    # all other fields will be left alone
+    # all other fields except these 4 will always be left alone by the update request
     access: DocumentAccess | None = None
     document_sets: set[str] | None = None
     boost: float | None = None
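To illustrate the None-means-no-change semantics, a hedged sketch: only boost is updated here, while access and document_sets stay untouched because they remain None (document_index stands in for any Updatable implementation):

request = UpdateRequest(
    document_ids=["doc-a", "doc-b"],
    boost=1.5,
)
document_index.update([request])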
@@ -44,6 +51,18 @@ class UpdateRequest:
 
 
 class Verifiable(abc.ABC):
+    """
+    Class must implement document index schema verification. For example, verify that all of the
+    necessary attributes for indexing, querying, filtering, and fields to return from search are
+    all valid in the schema.
+
+    Parameters:
+    - index_name: The name of the primary index currently used for querying
+    - secondary_index_name: The name of the secondary index being built in the background, if it
+            currently exists. Some functions on the document index act on both the primary and
+            secondary index, some act on just one.
+    """
+
     @abc.abstractmethod
     def __init__(
         self,
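A hedged sketch of the kind of check a Verifiable implementation might run; the schema dict shape and the expected field names are invented for illustration only:

EXPECTED_FIELDS = {"document_id", "chunk_id", "content", "embeddings"}  # hypothetical

def verify_schema(schema: dict, expected_embedding_dim: int) -> None:
    missing = EXPECTED_FIELDS - set(schema.get("fields", []))
    if missing:
        raise RuntimeError(f"Index schema missing fields: {missing}")
    if schema.get("embedding_dim") != expected_embedding_dim:
        raise RuntimeError("Embedding dimensionality does not match the schema")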
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
         index_embedding_dim: int,
         secondary_index_embedding_dim: int | None,
     ) -> None:
+        """
+        Verify that the document index exists and is consistent with the expectations in the code.
+
+        Parameters:
+        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
+        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
+                behind the scenes. The secondary index should only be built when switching
+                embedding models therefore this dim should be different from the primary index.
+        """
         raise NotImplementedError
 
 
 class Indexable(abc.ABC):
+    """
+    Class must implement the ability to index document chunks
+    """
+
     @abc.abstractmethod
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
     ) -> set[DocumentInsertionRecord]:
-        """Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
+        """
+        Takes a list of document chunks and indexes them in the document index
+
+        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
+        chunks before reindexing. This is because the document may have gotten shorter since the
+        last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
+        have not been written over.
+
+        NOTE: The chunks of a document are never separated into separate index() calls. So there is
+        no worry of receiving the first 0 through n chunks in one index call and the next n through
+        m chunks of a doc in the next index call.
+
+        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
+        only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
+        it is done automatically outside of this code.
+
+        Parameters:
+        - chunks: Document chunks with all of the information needed for indexing to the document
+                index.
+
+        Returns:
+        List of document ids which map to unique documents and are used for deduping chunks
+        when updating, as well as if the document is newly indexed or already existed and
+        just updated
+        """
         raise NotImplementedError
 
 
 class Deletable(abc.ABC):
+    """
+    Class must implement the ability to delete documents by their unique document ids.
+    """
+
     @abc.abstractmethod
     def delete(self, doc_ids: list[str]) -> None:
-        """Removes the specified documents from the Index"""
+        """
+        Given a list of document ids, hard delete them from the document index
+
+        Parameters:
+        - doc_ids: list of document ids as specified by the connector
+        """
         raise NotImplementedError
 
 
 class Updatable(abc.ABC):
+    """
+    Class must implement the ability to update certain attributes of a document without needing to
+    update all of the fields. Specifically, needs to be able to update:
+    - Access Control List
+    - Document-set membership
+    - Boost value (learning from feedback mechanism)
+    - Whether the document is hidden or not, hidden documents are not returned from search
+    """
+
     @abc.abstractmethod
     def update(self, update_requests: list[UpdateRequest]) -> None:
-        """Updates metadata for the specified documents sets in the Index"""
+        """
+        Updates some set of chunks. The document and fields to update are specified in the update
+        requests. Each update request in the list applies its changes to a list of document ids.
+        None values mean that the field does not need an update.
+
+        Parameters:
+        - update_requests: for a list of document ids in the update request, apply the same updates
+                to all of the documents with those ids. This is for bulk handling efficiency. Many
+                updates are done at the connector level which have many documents for the connector
+        """
         raise NotImplementedError
 
 
 class IdRetrievalCapable(abc.ABC):
+    """
+    Class must implement the ability to retrieve either:
+    - all of the chunks of a document IN ORDER given a document id.
+    - a specific chunk given a document id and a chunk index (0 based)
+    """
+
     @abc.abstractmethod
     def id_based_retrieval(
         self,
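To make the first NOTE above concrete, a hedged sketch of an index() that clears stale chunks before writing; every _backend_* helper is hypothetical, and the DocumentInsertionRecord fields and chunk attribute path are assumed from context:

def index(self, chunks):
    doc_ids = {chunk.source_document.id for chunk in chunks}  # attribute path assumed
    existing = self._backend_existing_doc_ids(doc_ids)  # hypothetical helper
    # Delete first: the new version may have fewer chunks than the old one,
    # and a plain upsert would leave the old tail chunks behind.
    self._backend_delete_chunks(doc_ids)   # hypothetical helper
    self._backend_upsert_chunks(chunks)    # hypothetical helper
    return {
        DocumentInsertionRecord(document_id=doc_id, already_existed=doc_id in existing)
        for doc_id in doc_ids
    }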
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
         chunk_ind: int | None,
         filters: IndexFilters,
     ) -> list[InferenceChunk]:
+        """
+        Fetch chunk(s) based on document id
+
+        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
+        of a document. Downstream currently assumes that the chunking does not introduce overlaps
+        between the chunks. If there are overlaps for the chunks, then the reconstructed document
+        or extended section will have duplicate segments.
+
+        Parameters:
+        - document_id: document id for which to retrieve the chunk(s)
+        - chunk_ind: chunk index to return, if None, return all of the chunks in order
+        - filters: standard filters object, in this case only the access filter is applied as a
+                permission check
+
+        Returns:
+        list of chunks for the document id or the specific chunk by the specified chunk index
+        and document id
+        """
         raise NotImplementedError
 
 
 class KeywordCapable(abc.ABC):
+    """
+    Class must implement the keyword search functionality
+    """
+
     @abc.abstractmethod
     def keyword_retrieval(
         self,
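A hedged sketch of the reconstruction this enables downstream, relying on the no-overlap assumption stated in the NOTE (the .content attribute on InferenceChunk is assumed):

def reconstruct_document(index, document_id, filters):
    # chunk_ind=None returns every chunk of the document, in order
    chunks = index.id_based_retrieval(document_id, chunk_ind=None, filters=filters)
    # With non-overlapping chunks, concatenation recovers the full text
    return "".join(chunk.content for chunk in chunks)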
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
+        information required for query time purposes. For example, some details of the document
+        required at indexing time are no longer needed past this point. At the same time, the
+        matching keywords need to be highlighted.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on keyword matching (should be BM25 algorithm ideally)
+        """
         raise NotImplementedError
 
 
 class VectorCapable(abc.ABC):
+    """
+    Class must implement the vector/semantic search functionality
+    """
+
     @abc.abstractmethod
     def semantic_retrieval(
         self,
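To illustrate time_decay_multiplier, a hedged sketch of one plausible decay scheme; the exponential form and the 180-day half-life are assumptions, not the project's actual ranking expression:

import math

def apply_time_decay(score: float, age_days: float, time_decay_multiplier: float = 1.0) -> float:
    half_life_days = 180.0  # assumed constant
    # A 2x or 3x multiplier (from persona settings) makes scores decay 2-3x faster
    return score * math.exp(-math.log(2) * age_days * time_decay_multiplier / half_life_days)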
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run vector/semantic search and return a list of inference chunks.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on vector similarity
+        """
         raise NotImplementedError
 
 
 class HybridCapable(abc.ABC):
+    """
+    Class must implement hybrid (keyword + vector) search functionality
+    """
+
     @abc.abstractmethod
     def hybrid_retrieval(
         self,
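For reference, the vector similarity at the heart of this call; a pure-Python cosine similarity sketch (real indices compute this natively over the stored embeddings):

def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    return dot / (norm_a * norm_b)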
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
         offset: int = 0,
         hybrid_alpha: float | None = None,
     ) -> list[InferenceChunk]:
+        """
+        Run hybrid search and return a list of inference chunks.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+        - hybrid_alpha: weighting between the keyword and vector search results. It is important
+                that the two scores are normalized to the same range so that a meaningful
+                comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
+                on keyword score.
+
+        Returns:
+        best matching chunks based on weighted sum of keyword and vector/semantic search scores
+        """
         raise NotImplementedError
 
 
 class AdminCapable(abc.ABC):
+    """
+    Class must implement a search for the admin "Explorer" page. The assumption here is that the
+    admin is not "searching" for knowledge but has some document already in mind. They are either
+    looking to positively boost it because they know it's a good reference document, looking to
+    negatively boost it as a way of "deprecating", or hiding the document.
+
+    Assuming the admin knows the document name, this search has high emphasis on the title match.
+
+    Suggested implementation:
+    Keyword only, BM25 search with 5x weighting on the title field compared to the contents
+    """
+
     @abc.abstractmethod
     def admin_retrieval(
         self,
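A hedged sketch of the hybrid_alpha weighting described above; the min-max normalization and the example alpha are assumptions, chosen only to show why the two score ranges must be comparable:

def min_max_normalize(scores: list[float]) -> list[float]:
    if not scores:
        return []
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

def combine_hybrid_scores(
    keyword_scores: list[float], vector_scores: list[float], hybrid_alpha: float = 0.5
) -> list[float]:
    kw = min_max_normalize(keyword_scores)
    vec = min_max_normalize(vector_scores)
    # hybrid_alpha=1 -> pure vector score, hybrid_alpha=0 -> pure keyword score
    return [hybrid_alpha * v + (1 - hybrid_alpha) * k for k, v in zip(kw, vec)]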
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run the special search for the admin document explorer page
+
+        Parameters:
+        - query: unmodified user query. Though in this flow probably unmodified is best
+        - filters: standard filter object
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        list of best matching chunks for the explorer page query
+        """
         raise NotImplementedError
 
 
 class BaseIndex(
     Verifiable,
-    AdminCapable,
-    IdRetrievalCapable,
     Indexable,
     Updatable,
     Deletable,
+    AdminCapable,
+    IdRetrievalCapable,
     abc.ABC,
 ):
-    """All basic functionalities excluding a specific retrieval approach
-    Indices need to be able to
-    - Check that the index exists with a schema definition
-    - Can index documents
-    - Can delete documents
-    - Can update document metadata (such as access permissions and document specific boost)
+    """
+    All basic document index functionalities excluding the actual querying approach.
+
+    As a summary, document indices need to be able to
+    - Verify the schema definition is valid
+    - Index new documents
+    - Update specific attributes of existing documents
+    - Delete documents
+    - Provide a search for the admin document explorer page
+    - Retrieve documents based on document id
     """
 
 
-class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
-    pass
-
-
-class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
-    pass
-
-
 class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
-    pass
+    """
+    A valid document index that can plug into all Danswer flows must implement all of these
+    functionalities, though "technically" it does not need to be keyword or vector capable as
+    currently all default search flows use Hybrid Search.
+    """
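To show how the capabilities compose, a hedged skeleton of a concrete index; the parameter lists are inferred from the hunks above and abbreviated, and abstract methods elided from this diff (e.g. the verification hook) are omitted:

class ToyDocumentIndex(DocumentIndex):
    """Illustrative skeleton only; a real backend (e.g. Vespa) does far more."""

    def index(self, chunks):
        raise NotImplementedError  # clear old chunks, then upsert the new ones

    def delete(self, doc_ids):
        raise NotImplementedError  # hard delete every chunk of these documents

    def update(self, update_requests):
        raise NotImplementedError  # apply the non-None fields per document id

    def id_based_retrieval(self, document_id, chunk_ind, filters):
        raise NotImplementedError

    def keyword_retrieval(self, query, filters, time_decay_multiplier, num_to_retrieve, offset=0):
        raise NotImplementedError

    def semantic_retrieval(self, query, query_embedding, filters, time_decay_multiplier, num_to_retrieve, offset=0):
        raise NotImplementedError

    def hybrid_retrieval(self, query, query_embedding, filters, time_decay_multiplier, num_to_retrieve, offset=0, hybrid_alpha=None):
        raise NotImplementedError

    def admin_retrieval(self, query, filters, num_to_retrieve, offset=0):
        raise NotImplementedError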
@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
 
-        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficient for
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
         # indexing / updates / deletes since we have to make a large volume of requests.
         with httpx.Client(http2=True) as http_client:
             index_names = [self.index_name]
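As an aside on that comment, a minimal sketch of what the HTTP/2 client buys: one multiplexed connection for many small requests. Requires the httpx[http2] extra; the URL follows the shape of Vespa's document/v1 API, but the host, namespace, document type, and ids are placeholders:

import httpx

# One client = one connection pool; HTTP/2 multiplexes the many small
# delete calls over a single connection instead of reconnecting each time.
with httpx.Client(http2=True) as http_client:
    for doc_uuid in ["uuid-1", "uuid-2"]:  # placeholder chunk ids
        resp = http_client.delete(
            f"http://localhost:8081/document/v1/default/danswer_chunk/docid/{doc_uuid}"
        )
        resp.raise_for_status()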
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
             for vespa_chunk_id in vespa_chunk_ids
         ]
 
-        logger.debug(
-            "Running LLM usefulness eval in parallel (following logging may be out of order)"
-        )
         inference_chunks = run_functions_tuples_in_parallel(
             functions_with_args, allow_failures=True
         )
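run_functions_tuples_in_parallel is a danswer utility; a hedged approximation of its observable behavior using the standard library (the real helper's threading details and failure semantics may differ):

from concurrent.futures import ThreadPoolExecutor

def run_functions_tuples_in_parallel(functions_with_args, allow_failures=False):
    # Each entry is (callable, args_tuple); results come back in input order.
    def _call(fn_and_args):
        fn, args = fn_and_args
        try:
            return fn(*args)
        except Exception:
            if allow_failures:
                return None
            raise

    with ThreadPoolExecutor() as pool:
        return list(pool.map(_call, functions_with_args))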