Better description of the document index interfaces (#1188)

This commit is contained in:
Yuhong Sun
2024-03-06 00:07:12 -08:00
committed by GitHub
parent 2ace03081c
commit 3f1cd1ad12
2 changed files with 232 additions and 27 deletions

View File

@@ -17,6 +17,11 @@ class DocumentInsertionRecord:
@dataclass
class DocumentMetadata:
"""
Document information that needs to be inserted into Postgres the first time the document is
encountered during indexing, across any of the connectors.
"""
connector_id: int
credential_id: int
document_id: str
@@ -32,11 +37,13 @@ class DocumentMetadata:
@dataclass
class UpdateRequest:
"""For all document_ids, update the allowed_users and the boost to the new value
ignore if None"""
"""
For all document_ids, update the allowed_users and the boost to the new values
Does not update any of the None fields
"""
document_ids: list[str]
# all other fields except these 4 will always be left alone by the update request
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
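As a quick, hypothetical usage sketch of the None-means-no-update convention (the document ids and boost value below are made up for illustration):

```python
# Hypothetical usage sketch: bump the boost on two documents while leaving
# access and document-set membership untouched (those fields stay None).
boost_update = UpdateRequest(
    document_ids=["doc-123", "doc-456"],
    boost=1.5,
)
```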
@@ -44,6 +51,18 @@ class UpdateRequest:
class Verifiable(abc.ABC):
"""
Class must implement document index schema verification. For example, verify that all of the
attributes needed for indexing, querying, filtering, and returning fields from search are
valid in the schema.
Parameters:
- index_name: The name of the primary index currently used for querying
- secondary_index_name: The name of the secondary index being built in the background, if it
currently exists. Some functions on the document index act on both the primary and
secondary index, some act on just one.
"""
@abc.abstractmethod
def __init__(
self,
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
index_embedding_dim: int,
secondary_index_embedding_dim: int | None,
) -> None:
"""
Verify that the document index exists and is consistent with the expectations in the code.
Parameters:
- index_embedding_dim: Vector dimensionality for the vector similarity part of the search
- secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
behind the scenes. The secondary index should only be built when switching
embedding models; therefore, this dim should differ from the primary index's.
"""
raise NotImplementedError
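As a loose sketch of what the verification contract might look like in an implementation (the `_fetch_schema_dim` helper and the simplified constructor are hypothetical, not part of Danswer):

```python
class SketchIndex(Verifiable):
    # Simplified constructor for illustration; the real abstract __init__
    # takes more parameters than are shown in this diff hunk.
    def __init__(self, index_name: str, secondary_index_name: str | None) -> None:
        self.index_name = index_name
        self.secondary_index_name = secondary_index_name

    def ensure_indices_exist(
        self,
        index_embedding_dim: int,
        secondary_index_embedding_dim: int | None,
    ) -> None:
        # _fetch_schema_dim is a hypothetical helper that reads an index's
        # deployed schema and returns its vector dimensionality.
        if self._fetch_schema_dim(self.index_name) != index_embedding_dim:
            raise RuntimeError("Primary index dim does not match the embedding model")
        if self.secondary_index_name is not None and (
            self._fetch_schema_dim(self.secondary_index_name)
            != secondary_index_embedding_dim
        ):
            raise RuntimeError("Secondary index dim does not match the new model")
```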
class Indexable(abc.ABC):
"""
Class must implement the ability to index document chunks
"""
@abc.abstractmethod
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
) -> set[DocumentInsertionRecord]:
"""Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
"""
Takes a list of document chunks and indexes them in the document index
NOTE: When a document is reindexed/updated here, it must clear all of the existing document
chunks before reindexing. This is because the document may have gotten shorter since the
last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
have not been written over.
NOTE: The chunks of a document are never separated into separate index() calls. So there is
no worry of receiving the first 0 through n chunks in one index call and the next n through
m chunks of a document in the next index call.
NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
it is done automatically outside of this code.
Parameters:
- chunks: Document chunks with all of the information needed for indexing to the document
index.
Returns:
List of document ids, which map to unique documents and are used for deduping chunks
when updating, along with whether each document is newly indexed or already existed
and was just updated
"""
raise NotImplementedError
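To make the first NOTE concrete, here is a toy, dict-backed sketch (purely illustrative, not the real indexing path) of why existing chunks must be cleared before reindexing:

```python
# Document "a" shrinks from 3 chunks to 2 between runs; a plain upsert of
# chunks 0..1 would leave the stale chunk at index 2 behind.
chunks_by_doc: dict[str, dict[int, str]] = {
    "a": {0: "old chunk 0", 1: "old chunk 1", 2: "old chunk 2"},
}

def reindex(doc_id: str, new_chunks: list[str]) -> None:
    chunks_by_doc.pop(doc_id, None)  # clear all existing chunks first
    chunks_by_doc[doc_id] = {i: chunk for i, chunk in enumerate(new_chunks)}

reindex("a", ["new chunk 0", "new chunk 1"])
assert 2 not in chunks_by_doc["a"]  # the stale third chunk is gone
```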
class Deletable(abc.ABC):
"""
Class must implement the ability to delete documents by their unique document ids.
"""
@abc.abstractmethod
def delete(self, doc_ids: list[str]) -> None:
"""Removes the specified documents from the Index"""
"""
Given a list of document ids, hard delete them from the document index
Parameters:
- doc_ids: list of document ids as specified by the connector
"""
raise NotImplementedError
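Continuing the dict-backed toy store from the Indexable sketch above, a hard delete simply drops every chunk of each listed document (illustrative only):

```python
def delete_docs(store: dict[str, dict[int, str]], doc_ids: list[str]) -> None:
    for doc_id in doc_ids:
        store.pop(doc_id, None)  # hard delete: every chunk of the doc is removed
```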
class Updatable(abc.ABC):
"""
Class must implement the ability to update certain attributes of a document without needing to
update all of the fields. Specifically, needs to be able to update:
- Access Control List
- Document-set membership
- Boost value (learning from feedback mechanism)
- Whether the document is hidden or not; hidden documents are not returned from search
"""
@abc.abstractmethod
def update(self, update_requests: list[UpdateRequest]) -> None:
"""Updates metadata for the specified documents sets in the Index"""
"""
Updates some set of chunks. The documents and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
None values mean that the field does not need an update.
Parameters:
- update_requests: for each update request, apply the same updates to all of the documents
with the listed ids. This is for bulk handling efficiency; many updates are done at the
connector level, and a single connector may cover many documents.
"""
raise NotImplementedError
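A small sketch of the None-means-skip semantics against the same toy store, assuming the UpdateRequest dataclass defined above (only the fields visible in this diff hunk are handled):

```python
def apply_update(store: dict[str, dict], update: UpdateRequest) -> None:
    for doc_id in update.document_ids:
        doc = store[doc_id]
        # None means "leave this field alone"; only non-None fields are applied
        if update.access is not None:
            doc["access"] = update.access
        if update.document_sets is not None:
            doc["document_sets"] = update.document_sets
        if update.boost is not None:
            doc["boost"] = update.boost
```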
class IdRetrievalCapable(abc.ABC):
"""
Class must implement the ability to retrieve either:
- all of the chunks of a document IN ORDER given a document id.
- a specific chunk given a document id and a chunk index (0 based)
"""
@abc.abstractmethod
def id_based_retrieval(
self,
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
chunk_ind: int | None,
filters: IndexFilters,
) -> list[InferenceChunk]:
"""
Fetch chunk(s) based on document id
NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
of a document. Downstream currently assumes that the chunking does not introduce overlaps
between the chunks. If the chunks do overlap, then the reconstructed document
or extended section will have duplicate segments.
Parameters:
- document_id: document id for which to retrieve the chunk(s)
- chunk_ind: chunk index to return, if None, return all of the chunks in order
- filters: standard filters object, in this case only the access filter is applied as a
permission check
Returns:
list of chunks for the document id or the specific chunk by the specified chunk index
and document id
"""
raise NotImplementedError
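A dict-backed sketch of the chunk_ind semantics (access filtering is omitted here for brevity):

```python
def get_chunks(
    store: dict[str, dict[int, str]], document_id: str, chunk_ind: int | None
) -> list[str]:
    doc_chunks = store.get(document_id, {})
    if chunk_ind is None:
        # return every chunk of the document, in chunk-index order
        return [doc_chunks[ind] for ind in sorted(doc_chunks)]
    # return just the requested chunk, or nothing if it does not exist
    return [doc_chunks[chunk_ind]] if chunk_ind in doc_chunks else []
```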
class KeywordCapable(abc.ABC):
"""
Class must implement the keyword search functionality
"""
@abc.abstractmethod
def keyword_retrieval(
self,
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run keyword search and return a list of inference chunks. Inference chunks carry all of the
information required for query time purposes. For example, some details of the document
required at indexing time are no longer needed past this point, while the matching keywords
now need to be highlighted.
NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
expected to be handled by this function as it may depend on the index implementation.
Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
done here.
Parameters:
- query: unmodified user query
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
best matching chunks based on keyword matching (ideally the BM25 algorithm)
"""
raise NotImplementedError
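As an illustrative-only example of how a time_decay_multiplier might enter the scoring (this is not Danswer's actual formula; the 180-day half-life is an assumed constant):

```python
def decayed_score(
    base_score: float,
    doc_age_days: float,
    time_decay_multiplier: float,
    half_life_days: float = 180.0,
) -> float:
    # a higher multiplier makes old documents lose score faster
    return base_score * 0.5 ** (doc_age_days * time_decay_multiplier / half_life_days)
```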
class VectorCapable(abc.ABC):
"""
Class must implement the vector/semantic search functionality
"""
@abc.abstractmethod
def semantic_retrieval(
self,
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run vector/semantic search and return a list of inference chunks.
Parameters:
- query: unmodified user query. This is needed for getting the matching highlighted
keywords
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
best matching chunks based on vector similarity
"""
raise NotImplementedError
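For intuition on the vector-scoring side, a brute-force cosine similarity looks like the following; production indices use approximate nearest-neighbor search rather than scoring every chunk:

```python
import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    # both vectors must have the index's embedding dimensionality
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0
```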
class HybridCapable(abc.ABC):
"""
Class must implement hybrid (keyword + vector) search functionality
"""
@abc.abstractmethod
def hybrid_retrieval(
self,
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
offset: int = 0,
hybrid_alpha: float | None = None,
) -> list[InferenceChunk]:
"""
Run hybrid search and return a list of inference chunks.
NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
expected to be handled by this function as it may depend on the index implementation.
Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
done here.
Parameters:
- query: unmodified user query. This is needed for getting the matching highlighted
keywords
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
- hybrid_alpha: weighting between the keyword and vector search results. It is important
that the two scores are normalized to the same range so that a meaningful
comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
on keyword score.
Returns:
best matching chunks based on weighted sum of keyword and vector/semantic search scores
"""
raise NotImplementedError
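A minimal sketch of the hybrid_alpha weighting described above; the min-max normalization step is one way (an assumption here) to put both score lists on the same range so the weighted sum is meaningful:

```python
def normalize(scores: list[float]) -> list[float]:
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

def hybrid_scores(
    keyword_scores: list[float],  # one score per candidate chunk
    vector_scores: list[float],   # aligned with keyword_scores by position
    hybrid_alpha: float,
) -> list[float]:
    # hybrid_alpha = 1.0 -> pure vector score, 0.0 -> pure keyword score
    kw = normalize(keyword_scores)
    vec = normalize(vector_scores)
    return [hybrid_alpha * v + (1 - hybrid_alpha) * k for k, v in zip(kw, vec)]
```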
class AdminCapable(abc.ABC):
"""
Class must implement a search for the admin "Explorer" page. The assumption here is that the
admin is not "searching" for knowledge but has some document already in mind. They are either
looking to positively boost it because they know it's a good reference document, looking to
negatively boost it as a way of "deprecating" it, or looking to hide the document.
Assuming the admin knows the document name, this search has high emphasis on the title match.
Suggested implementation:
Keyword only, BM25 search with 5x weighting on the title field compared to the contents
"""
@abc.abstractmethod
def admin_retrieval(
self,
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run the special search for the admin document explorer page
Parameters:
- query: unmodified user query. In this flow, leaving the query unmodified is likely best anyway
- filters: standard filter object
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
list of best matching chunks for the explorer page query
"""
raise NotImplementedError
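A toy scorer for the suggested implementation, with raw term counts standing in for BM25 to keep the sketch short (the 5x title weighting follows the docstring's suggestion):

```python
def admin_match_score(query_terms: list[str], title: str, contents: str) -> float:
    title_hits = sum(title.lower().count(term.lower()) for term in query_terms)
    content_hits = sum(contents.lower().count(term.lower()) for term in query_terms)
    return 5.0 * title_hits + content_hits  # title matches count 5x
```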
class BaseIndex(
Verifiable,
Indexable,
Updatable,
Deletable,
AdminCapable,
IdRetrievalCapable,
abc.ABC,
):
"""All basic functionalities excluding a specific retrieval approach
Indices need to be able to
- Check that the index exists with a schema definition
- Can index documents
- Can delete documents
- Can update document metadata (such as access permissions and document specific boost)
"""
All basic document index functionalities excluding the actual querying approach.
As a summary, document indices need to be able to
- Verify the schema definition is valid
- Index new documents
- Update specific attributes of existing documents
- Delete documents
- Provide a search for the admin document explorer page
- Retrieve documents based on document id
"""
class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
pass
class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
pass
class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
"""
A valid document index that can plug into all Danswer flows must implement all of these
functionalities, though "technically" it does not need to be keyword or vector capable as
currently all default search flows use Hybrid Search.
"""

View File

@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
def delete(self, doc_ids: list[str]) -> None:
logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
with httpx.Client(http2=True) as http_client:
index_names = [self.index_name]
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
for vespa_chunk_id in vespa_chunk_ids
]
logger.debug(
"Running LLM usefulness eval in parallel (following logging may be out of order)"
)
inference_chunks = run_functions_tuples_in_parallel(
functions_with_args, allow_failures=True
)