Better description of the document index interfaces (#1188)

This commit is contained in:
Yuhong Sun
2024-03-06 00:07:12 -08:00
committed by GitHub
parent 2ace03081c
commit 3f1cd1ad12
2 changed files with 232 additions and 27 deletions

View File

@@ -17,6 +17,11 @@ class DocumentInsertionRecord:
@dataclass
class DocumentMetadata:
"""
Document information that needs to be inserted into Postgres the first time the document is
encountered during indexing, across any of the connectors.
"""
connector_id: int
credential_id: int
document_id: str
@@ -32,11 +37,13 @@ class DocumentMetadata:
@dataclass
class UpdateRequest:
"""For all document_ids, update the allowed_users and the boost to the new value
ignore if None"""
"""
For all document_ids, update the allowed_users and the boost to the new values
Does not update any of the None fields
"""
document_ids: list[str]
# all other fields except these 4 will always be left alone by the update request
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
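As a quick, hypothetical usage sketch of the None-means-no-update convention (the document ids and boost value below are made up for illustration):

```python
# Hypothetical usage sketch: bump the boost on two documents while leaving
# access and document-set membership untouched (those fields stay None).
boost_update = UpdateRequest(
    document_ids=["doc-123", "doc-456"],
    boost=1.5,
)
```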
@@ -44,6 +51,18 @@ class UpdateRequest:
class Verifiable(abc.ABC):
"""
Class must implement document index schema verification. For example, verify that all of the
attributes needed for indexing, querying, filtering, and returning fields from search are
valid in the schema.
Parameters:
- index_name: The name of the primary index currently used for querying
- secondary_index_name: The name of the secondary index being built in the background, if it
currently exists. Some functions on the document index act on both the primary and
secondary index, some act on just one.
"""
@abc.abstractmethod
def __init__(
self,
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
index_embedding_dim: int,
secondary_index_embedding_dim: int | None,
) -> None:
"""
Verify that the document index exists and is consistent with the expectations in the code.
Parameters:
- index_embedding_dim: Vector dimensionality for the vector similarity part of the search
- secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
behind the scenes. The secondary index should only be built when switching
embedding models; therefore, this dim should differ from the primary index's.
"""
raise NotImplementedError
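As a loose sketch of what the verification contract might look like in an implementation (the `_fetch_schema_dim` helper and the simplified constructor are hypothetical, not part of Danswer):

```python
class SketchIndex(Verifiable):
    # Simplified constructor for illustration; the real abstract __init__
    # takes more parameters than are shown in this diff hunk.
    def __init__(self, index_name: str, secondary_index_name: str | None) -> None:
        self.index_name = index_name
        self.secondary_index_name = secondary_index_name

    def ensure_indices_exist(
        self,
        index_embedding_dim: int,
        secondary_index_embedding_dim: int | None,
    ) -> None:
        # _fetch_schema_dim is a hypothetical helper that reads an index's
        # deployed schema and returns its vector dimensionality.
        if self._fetch_schema_dim(self.index_name) != index_embedding_dim:
            raise RuntimeError("Primary index dim does not match the embedding model")
        if self.secondary_index_name is not None and (
            self._fetch_schema_dim(self.secondary_index_name)
            != secondary_index_embedding_dim
        ):
            raise RuntimeError("Secondary index dim does not match the new model")
```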
class Indexable(abc.ABC):
"""
Class must implement the ability to index document chunks
"""
@abc.abstractmethod
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
) -> set[DocumentInsertionRecord]:
"""Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
"""
Takes a list of document chunks and indexes them in the document index
NOTE: When a document is reindexed/updated here, it must clear all of the existing document
chunks before reindexing. This is because the document may have gotten shorter since the
last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
have not been written over.
NOTE: The chunks of a document are never separated into separate index() calls. So there is
no worry of receiving the first 0 through n chunks in one index call and the next n through
m chunks of a document in the next index call.
NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
it is done automatically outside of this code.
Parameters:
- chunks: Document chunks with all of the information needed for indexing to the document
index.
Returns:
List of document ids, which map to unique documents and are used for deduping chunks
when updating, along with whether each document is newly indexed or already existed
and was just updated
"""
raise NotImplementedError
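To make the first NOTE concrete, here is a toy, dict-backed sketch (purely illustrative, not the real indexing path) of why existing chunks must be cleared before reindexing:

```python
# Document "a" shrinks from 3 chunks to 2 between runs; a plain upsert of
# chunks 0..1 would leave the stale chunk at index 2 behind.
chunks_by_doc: dict[str, dict[int, str]] = {
    "a": {0: "old chunk 0", 1: "old chunk 1", 2: "old chunk 2"},
}

def reindex(doc_id: str, new_chunks: list[str]) -> None:
    chunks_by_doc.pop(doc_id, None)  # clear all existing chunks first
    chunks_by_doc[doc_id] = {i: chunk for i, chunk in enumerate(new_chunks)}

reindex("a", ["new chunk 0", "new chunk 1"])
assert 2 not in chunks_by_doc["a"]  # the stale third chunk is gone
```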
class Deletable(abc.ABC):
"""
Class must implement the ability to delete documents by their unique document ids.
"""
@abc.abstractmethod
def delete(self, doc_ids: list[str]) -> None:
"""Removes the specified documents from the Index"""
"""
Given a list of document ids, hard delete them from the document index
Parameters:
- doc_ids: list of document ids as specified by the connector
"""
raise NotImplementedError
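Continuing the dict-backed toy store from the Indexable sketch above, a hard delete simply drops every chunk of each listed document (illustrative only):

```python
def delete_docs(store: dict[str, dict[int, str]], doc_ids: list[str]) -> None:
    for doc_id in doc_ids:
        store.pop(doc_id, None)  # hard delete: every chunk of the doc is removed
```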
class Updatable(abc.ABC):
"""
Class must implement the ability to update certain attributes of a document without needing to
update all of the fields. Specifically, needs to be able to update:
- Access Control List
- Document-set membership
- Boost value (learning from feedback mechanism)
- Whether the document is hidden or not; hidden documents are not returned from search
"""
@abc.abstractmethod
def update(self, update_requests: list[UpdateRequest]) -> None:
"""Updates metadata for the specified documents sets in the Index"""
"""
Updates some set of chunks. The documents and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
None values mean that the field does not need an update.
Parameters:
- update_requests: for each update request, apply the same updates to all of the documents
with the listed ids. This is for bulk handling efficiency; many updates are done at the
connector level, and a single connector may cover many documents.
"""
raise NotImplementedError
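A small sketch of the None-means-skip semantics against the same toy store, assuming the UpdateRequest dataclass defined above (only the fields visible in this diff hunk are handled):

```python
def apply_update(store: dict[str, dict], update: UpdateRequest) -> None:
    for doc_id in update.document_ids:
        doc = store[doc_id]
        # None means "leave this field alone"; only non-None fields are applied
        if update.access is not None:
            doc["access"] = update.access
        if update.document_sets is not None:
            doc["document_sets"] = update.document_sets
        if update.boost is not None:
            doc["boost"] = update.boost
```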
class IdRetrievalCapable(abc.ABC):
"""
Class must implement the ability to retrieve either:
- all of the chunks of a document IN ORDER given a document id.
- a specific chunk given a document id and a chunk index (0 based)
"""
@abc.abstractmethod
def id_based_retrieval(
self,
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
chunk_ind: int | None,
filters: IndexFilters,
) -> list[InferenceChunk]:
"""
Fetch chunk(s) based on document id
NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
of a document. Downstream currently assumes that the chunking does not introduce overlaps
between the chunks. If the chunks do overlap, then the reconstructed document
or extended section will have duplicate segments.
Parameters:
- document_id: document id for which to retrieve the chunk(s)
- chunk_ind: chunk index to return, if None, return all of the chunks in order
- filters: standard filters object, in this case only the access filter is applied as a
permission check
Returns:
list of chunks for the document id or the specific chunk by the specified chunk index
and document id
"""
raise NotImplementedError
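A dict-backed sketch of the chunk_ind semantics (access filtering is omitted here for brevity):

```python
def get_chunks(
    store: dict[str, dict[int, str]], document_id: str, chunk_ind: int | None
) -> list[str]:
    doc_chunks = store.get(document_id, {})
    if chunk_ind is None:
        # return every chunk of the document, in chunk-index order
        return [doc_chunks[ind] for ind in sorted(doc_chunks)]
    # return just the requested chunk, or nothing if it does not exist
    return [doc_chunks[chunk_ind]] if chunk_ind in doc_chunks else []
```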
class KeywordCapable(abc.ABC):
"""
Class must implement the keyword search functionality
"""
@abc.abstractmethod
def keyword_retrieval(
self,
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run keyword search and return a list of inference chunks. Inference chunks carry all of the
information required for query time purposes. For example, some details of the document
required at indexing time are no longer needed past this point, while the matching keywords
now need to be highlighted.
NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
expected to be handled by this function as it may depend on the index implementation.
Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
done here.
Parameters:
- query: unmodified user query
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
best matching chunks based on keyword matching (ideally the BM25 algorithm)
"""
raise NotImplementedError
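As an illustrative-only example of how a time_decay_multiplier might enter the scoring (this is not Danswer's actual formula; the 180-day half-life is an assumed constant):

```python
def decayed_score(
    base_score: float,
    doc_age_days: float,
    time_decay_multiplier: float,
    half_life_days: float = 180.0,
) -> float:
    # a higher multiplier makes old documents lose score faster
    return base_score * 0.5 ** (doc_age_days * time_decay_multiplier / half_life_days)
```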
class VectorCapable(abc.ABC):
"""
Class must implement the vector/semantic search functionality
"""
@abc.abstractmethod
def semantic_retrieval(
self,
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run vector/semantic search and return a list of inference chunks.
Parameters:
- query: unmodified user query. This is needed for getting the matching highlighted
keywords
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
best matching chunks based on vector similarity
"""
raise NotImplementedError
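For intuition on the vector-scoring side, a brute-force cosine similarity looks like the following; production indices use approximate nearest-neighbor search rather than scoring every chunk:

```python
import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    # both vectors must have the index's embedding dimensionality
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0
```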
class HybridCapable(abc.ABC):
"""
Class must implement hybrid (keyword + vector) search functionality
"""
@abc.abstractmethod
def hybrid_retrieval(
self,
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
offset: int = 0,
hybrid_alpha: float | None = None,
) -> list[InferenceChunk]:
"""
Run hybrid search and return a list of inference chunks.
NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
expected to be handled by this function as it may depend on the index implementation.
Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
done here.
Parameters:
- query: unmodified user query. This is needed for getting the matching highlighted
keywords
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- filters: standard filter object
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this at 2x or 3x of the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
- hybrid_alpha: weighting between the keyword and vector search results. It is important
that the two scores are normalized to the same range so that a meaningful
comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
on keyword score.
Returns:
best matching chunks based on weighted sum of keyword and vector/semantic search scores
"""
raise NotImplementedError
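A minimal sketch of the hybrid_alpha weighting described above; the min-max normalization step is one way (an assumption here) to put both score lists on the same range so the weighted sum is meaningful:

```python
def normalize(scores: list[float]) -> list[float]:
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

def hybrid_scores(
    keyword_scores: list[float],  # one score per candidate chunk
    vector_scores: list[float],   # aligned with keyword_scores by position
    hybrid_alpha: float,
) -> list[float]:
    # hybrid_alpha = 1.0 -> pure vector score, 0.0 -> pure keyword score
    kw = normalize(keyword_scores)
    vec = normalize(vector_scores)
    return [hybrid_alpha * v + (1 - hybrid_alpha) * k for k, v in zip(kw, vec)]
```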
class AdminCapable(abc.ABC):
"""
Class must implement a search for the admin "Explorer" page. The assumption here is that the
admin is not "searching" for knowledge but has some document already in mind. They are either
looking to positively boost it because they know it's a good reference document, looking to
negatively boost it as a way of "deprecating" it, or looking to hide the document.
Assuming the admin knows the document name, this search has high emphasis on the title match.
Suggested implementation:
Keyword only, BM25 search with 5x weighting on the title field compared to the contents
"""
@abc.abstractmethod
def admin_retrieval(
self,
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run the special search for the admin document explorer page
Parameters:
- query: unmodified user query. In this flow, leaving the query unmodified is likely best anyway
- filters: standard filter object
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
list of best matching chunks for the explorer page query
"""
raise NotImplementedError
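A toy scorer for the suggested implementation, with raw term counts standing in for BM25 to keep the sketch short (the 5x title weighting follows the docstring's suggestion):

```python
def admin_match_score(query_terms: list[str], title: str, contents: str) -> float:
    title_hits = sum(title.lower().count(term.lower()) for term in query_terms)
    content_hits = sum(contents.lower().count(term.lower()) for term in query_terms)
    return 5.0 * title_hits + content_hits  # title matches count 5x
```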
class BaseIndex(
Verifiable,
Indexable,
Updatable,
Deletable,
AdminCapable,
IdRetrievalCapable,
abc.ABC,
):
"""All basic functionalities excluding a specific retrieval approach
Indices need to be able to
- Check that the index exists with a schema definition
- Can index documents
- Can delete documents
- Can update document metadata (such as access permissions and document specific boost)
"""
All basic document index functionalities excluding the actual querying approach.
As a summary, document indices need to be able to
- Verify the schema definition is valid
- Index new documents
- Update specific attributes of existing documents
- Delete documents
- Provide a search for the admin document explorer page
- Retrieve documents based on document id
"""
class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
pass
class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
pass
class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
"""
A valid document index that can plug into all Danswer flows must implement all of these
functionalities, though "technically" it does not need to be keyword or vector capable as
currently all default search flows use Hybrid Search.
"""

View File

@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
def delete(self, doc_ids: list[str]) -> None:
logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
with httpx.Client(http2=True) as http_client:
index_names = [self.index_name]
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
for vespa_chunk_id in vespa_chunk_ids
]
logger.debug(
"Running LLM usefulness eval in parallel (following logging may be out of order)"
)
inference_chunks = run_functions_tuples_in_parallel(
functions_with_args, allow_failures=True
)