Mirror of https://github.com/danswer-ai/danswer.git

Better description of the document index interfaces (#1188)
@@ -17,6 +17,11 @@ class DocumentInsertionRecord:

 @dataclass
 class DocumentMetadata:
+    """
+    Document information that needs to be inserted into Postgres the first time this
+    document is encountered during indexing, across any of the connectors.
+    """
+
     connector_id: int
     credential_id: int
     document_id: str
@@ -32,11 +37,13 @@ class DocumentMetadata:

 @dataclass
 class UpdateRequest:
-    """For all document_ids, update the allowed_users and the boost to the new value
-    ignore if None"""
+    """
+    For all document_ids, update the allowed_users and the boost to the new values
+    Does not update any of the None fields
+    """

     document_ids: list[str]
-    # all other fields will be left alone
+    # all other fields except these 4 will always be left alone by the update request
     access: DocumentAccess | None = None
     document_sets: set[str] | None = None
     boost: float | None = None
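For illustration (not part of the PR), here is how an UpdateRequest with None fields reads in practice; the import path is an assumption based on this file's location:

    # Hypothetical usage: bulk re-boost two documents while leaving access and
    # document-set membership untouched (None means "leave unchanged").
    from danswer.document_index.interfaces import UpdateRequest  # assumed path

    update = UpdateRequest(
        document_ids=["doc-1", "doc-2"],
        access=None,          # left alone
        document_sets=None,   # left alone
        boost=2.0,            # the only field that gets updated
    )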
@@ -44,6 +51,18 @@ class UpdateRequest:


 class Verifiable(abc.ABC):
+    """
+    Class must implement document index schema verification. For example, verify that all of the
+    necessary attributes for indexing, querying, filtering, and fields to return from search are
+    all valid in the schema.
+
+    Parameters:
+    - index_name: The name of the primary index currently used for querying
+    - secondary_index_name: The name of the secondary index being built in the background, if it
+            currently exists. Some functions on the document index act on both the primary and
+            secondary index, some act on just one.
+    """
+
     @abc.abstractmethod
     def __init__(
         self,
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
         index_embedding_dim: int,
         secondary_index_embedding_dim: int | None,
     ) -> None:
+        """
+        Verify that the document index exists and is consistent with the expectations in the code.
+
+        Parameters:
+        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
+        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
+                behind the scenes. The secondary index should only be built when switching
+                embedding models, so this dim should differ from the primary index's.
+        """
        raise NotImplementedError
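As a rough illustration of the kind of check Verifiable implies, schema verification might compare the live schema against the fields and dimensionality the code expects. The field list and helper below are illustrative assumptions, not Danswer code:

    # Sketch only: verify that every attribute needed for indexing/querying exists
    # in the schema and that the embedding dimensionality matches the code's config.
    EXPECTED_FIELDS = {"document_id", "chunk_id", "content", "embedding", "access", "boost"}

    def verify_schema(schema_fields: set[str], schema_dim: int, expected_dim: int) -> None:
        missing = EXPECTED_FIELDS - schema_fields
        if missing:
            raise RuntimeError(f"Index schema is missing fields: {missing}")
        if schema_dim != expected_dim:
            raise RuntimeError(
                f"Embedding dim mismatch: schema has {schema_dim}, code expects {expected_dim}"
            )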


 class Indexable(abc.ABC):
     """
     Class must implement the ability to index document chunks
     """

     @abc.abstractmethod
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
     ) -> set[DocumentInsertionRecord]:
-        """Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
+        """
+        Takes a list of document chunks and indexes them in the document index
+
+        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
+        chunks before reindexing. This is because the document may have gotten shorter since the
+        last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
+        have not been written over.
+
+        NOTE: The chunks of a document are never separated into separate index() calls. So there is
+        no worry of receiving the first 0 through n chunks in one index call and the next n through
+        m chunks of a document in the next index call.
+
+        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
+        only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
+        it is done automatically outside of this code.
+
+        Parameters:
+        - chunks: Document chunks with all of the information needed for indexing to the document
+                index.
+
+        Returns:
+        List of document ids which map to unique documents and are used for deduping chunks
+        when updating, along with whether each document is newly indexed or already existed
+        and was just updated
+        """
        raise NotImplementedError
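The first NOTE above is worth making concrete. A minimal sketch of the delete-before-upsert rule over a plain dict standing in for the chunk store (all names illustrative):

    # (document_id, chunk_index) -> chunk text
    chunk_store: dict[tuple[str, int], str] = {}

    def reindex_document(document_id: str, new_chunks: list[str]) -> None:
        # Clear every existing chunk first: the new version may be shorter, and a
        # plain upsert of chunks 0..n would leave stale trailing chunks behind.
        for key in [k for k in chunk_store if k[0] == document_id]:
            del chunk_store[key]
        for i, text in enumerate(new_chunks):
            chunk_store[(document_id, i)] = text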


 class Deletable(abc.ABC):
     """
     Class must implement the ability to delete documents by their unique document ids.
     """

     @abc.abstractmethod
     def delete(self, doc_ids: list[str]) -> None:
-        """Removes the specified documents from the Index"""
+        """
+        Given a list of document ids, hard delete them from the document index
+
+        Parameters:
+        - doc_ids: list of document ids as specified by the connector
+        """
        raise NotImplementedError


 class Updatable(abc.ABC):
     """
     Class must implement the ability to update certain attributes of a document without needing to
     update all of the fields. Specifically, needs to be able to update:
     - Access Control List
     - Document-set membership
     - Boost value (learning from feedback mechanism)
     - Whether the document is hidden or not; hidden documents are not returned from search
     """

     @abc.abstractmethod
     def update(self, update_requests: list[UpdateRequest]) -> None:
-        """Updates metadata for the specified documents sets in the Index"""
+        """
+        Updates some set of chunks. The documents and fields to update are specified in the update
+        requests. Each update request in the list applies its changes to a list of document ids.
+        None values mean that the field does not need an update.
+
+        Parameters:
+        - update_requests: for the list of document ids in each update request, apply the same
+                updates to all of the documents with those ids. This is for bulk handling
+                efficiency; many updates are done at the connector level, which can cover a
+                large number of documents.
+        """
        raise NotImplementedError
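A toy sketch of the None-means-no-change contract, applying one update to a dict-backed store (field names illustrative):

    docs = {
        "doc-1": {"boost": 1.0, "hidden": False},
        "doc-2": {"boost": 1.0, "hidden": False},
    }

    def apply_update(document_ids: list[str], boost: float | None, hidden: bool | None) -> None:
        # Apply the same changes to every document id in the request; None fields
        # are skipped entirely, leaving the stored values as they were.
        for doc_id in document_ids:
            if boost is not None:
                docs[doc_id]["boost"] = boost
            if hidden is not None:
                docs[doc_id]["hidden"] = hidden

    apply_update(["doc-1", "doc-2"], boost=2.0, hidden=None)  # hidden stays False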


 class IdRetrievalCapable(abc.ABC):
     """
     Class must implement the ability to retrieve either:
     - all of the chunks of a document IN ORDER given a document id.
     - a specific chunk given a document id and a chunk index (0 based)
     """

     @abc.abstractmethod
     def id_based_retrieval(
         self,
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
         chunk_ind: int | None,
         filters: IndexFilters,
     ) -> list[InferenceChunk]:
+        """
+        Fetch chunk(s) based on document id
+
+        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
+        of a document. Downstream currently assumes that the chunking does not introduce overlaps
+        between the chunks. If there are overlaps for the chunks, then the reconstructed document
+        or extended section will have duplicate segments.
+
+        Parameters:
+        - document_id: document id for which to retrieve the chunk(s)
+        - chunk_ind: chunk index to return; if None, return all of the chunks in order
+        - filters: standard filters object; in this case only the access filter is applied as a
+                permission check
+
+        Returns:
+        list of chunks for the document id, or the specific chunk given by the specified chunk
+        index and document id
+        """
        raise NotImplementedError
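Since chunks come back in order and are assumed non-overlapping, reconstruction reduces to concatenation. A sketch of both retrieval modes, working on plain strings rather than InferenceChunk objects for illustration:

    def get_chunks(all_chunks_in_order: list[str], chunk_ind: int | None) -> list[str]:
        # chunk_ind=None returns the whole document's chunks in order;
        # otherwise return just the one chunk at that 0-based index.
        return all_chunks_in_order if chunk_ind is None else [all_chunks_in_order[chunk_ind]]

    def reconstruct_document(all_chunks_in_order: list[str]) -> str:
        # If the chunker introduced overlaps, this concatenation would duplicate
        # the overlapping segments, which is exactly the caveat in the docstring above.
        return "".join(all_chunks_in_order)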


 class KeywordCapable(abc.ABC):
     """
     Class must implement the keyword search functionality
     """

     @abc.abstractmethod
     def keyword_retrieval(
         self,
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
+        information required for query time purposes. For example, some details of the document
+        required at indexing time are no longer needed past this point. At the same time, the
+        matching keywords need to be highlighted.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries,
+                based on the persona settings, will have this be 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on keyword matching (should ideally be the BM25 algorithm)
+        """
        raise NotImplementedError
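The interface leaves the decay formula to the implementation. One plausible reading of time_decay_multiplier, written as an assumed half-life style discount rather than the formula of any specific index:

    import math

    def decayed_score(raw_score: float, doc_age_days: float,
                      time_decay_multiplier: float, half_life_days: float = 180.0) -> float:
        # A multiplier of 2x or 3x (e.g. from persona settings) makes old
        # documents lose score twice or three times as fast.
        decay = 0.5 ** (time_decay_multiplier * doc_age_days / half_life_days)
        return raw_score * decay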


 class VectorCapable(abc.ABC):
     """
     Class must implement the vector/semantic search functionality
     """

     @abc.abstractmethod
     def semantic_retrieval(
         self,
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run vector/semantic search and return a list of inference chunks.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries,
+                based on the persona settings, will have this be 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on vector similarity
+        """
        raise NotImplementedError
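A bare-bones illustration of the vector similarity at the core of this method, ranking chunks by cosine similarity against the query embedding (assumed scoring, independent of any real index backend):

    import math

    def cosine(a: list[float], b: list[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norms = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
        return dot / norms if norms else 0.0

    def top_k(query_embedding: list[float],
              chunk_embeddings: dict[str, list[float]], k: int) -> list[str]:
        # Rank chunk ids by similarity to the query and keep the best k.
        ranked = sorted(chunk_embeddings,
                        key=lambda cid: cosine(query_embedding, chunk_embeddings[cid]),
                        reverse=True)
        return ranked[:k]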


 class HybridCapable(abc.ABC):
     """
     Class must implement hybrid (keyword + vector) search functionality
     """

     @abc.abstractmethod
     def hybrid_retrieval(
         self,
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
         offset: int = 0,
         hybrid_alpha: float | None = None,
     ) -> list[InferenceChunk]:
+        """
+        Run hybrid search and return a list of inference chunks.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries,
+                based on the persona settings, will have this be 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+        - hybrid_alpha: weighting between the keyword and vector search results. It is important
+                that the two scores are normalized to the same range so that a meaningful
+                comparison can be made. 1 for 100% weighting on the vector score, 0 for 100%
+                weighting on the keyword score.
+
+        Returns:
+        best matching chunks based on the weighted sum of keyword and vector/semantic search scores
+        """
        raise NotImplementedError
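The hybrid_alpha description maps directly to a weighted sum. A sketch, assuming both scores have already been normalized to the same range as the docstring requires:

    def hybrid_score(keyword_score: float, vector_score: float, hybrid_alpha: float) -> float:
        # hybrid_alpha=1.0 -> pure vector score; hybrid_alpha=0.0 -> pure keyword score.
        return hybrid_alpha * vector_score + (1 - hybrid_alpha) * keyword_score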


 class AdminCapable(abc.ABC):
     """
     Class must implement a search for the admin "Explorer" page. The assumption here is that the
     admin is not "searching" for knowledge but has some document already in mind. They are either
     looking to positively boost it because they know it's a good reference document, looking to
     negatively boost it as a way of "deprecating" it, or looking to hide the document.

     Assuming the admin knows the document name, this search places a high emphasis on the title
     match.

     Suggested implementation:
     Keyword only, BM25 search with 5x weighting on the title field compared to the contents
     """

     @abc.abstractmethod
     def admin_retrieval(
         self,
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run the special search for the admin document explorer page
+
+        Parameters:
+        - query: unmodified user query, though in this flow leaving it unmodified is probably best
+        - filters: standard filter object
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        list of best matching chunks for the explorer page query
+        """
        raise NotImplementedError
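To make the suggested implementation concrete, a toy scorer with the 5x title weighting; the term counting below is a stand-in for real BM25 scoring:

    def admin_score(query_terms: set[str], title: str, contents: str) -> float:
        # Title matches count five times as much as content matches, reflecting
        # the assumption that the admin already knows the document's name.
        title_hits = sum(term in title.lower() for term in query_terms)
        content_hits = sum(term in contents.lower() for term in query_terms)
        return 5.0 * title_hits + content_hits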


 class BaseIndex(
     Verifiable,
-    AdminCapable,
-    IdRetrievalCapable,
     Indexable,
     Updatable,
     Deletable,
+    AdminCapable,
+    IdRetrievalCapable,
     abc.ABC,
 ):
-    """All basic functionalities excluding a specific retrieval approach
-    Indices need to be able to
-    - Check that the index exists with a schema definition
-    - Can index documents
-    - Can delete documents
-    - Can update document metadata (such as access permissions and document specific boost)
-    """
+    """
+    All basic document index functionalities excluding the actual querying approach.
+
+    As a summary, document indices need to be able to
+    - Verify the schema definition is valid
+    - Index new documents
+    - Update specific attributes of existing documents
+    - Delete documents
+    - Provide a search for the admin document explorer page
+    - Retrieve documents based on document id
+    """


 class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
     pass


 class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
     pass


 class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
-    pass
+    """
+    A valid document index that can plug into all Danswer flows must implement all of these
+    functionalities, though "technically" it does not need to be keyword or vector capable as
+    currently all default search flows use Hybrid Search.
+    """
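Putting the hierarchy together: a pluggable backend subclasses DocumentIndex and implements every abstract method named above. The skeleton below abbreviates the signatures and assumes the import path; the repo's real example is the VespaIndex shown next:

    from danswer.document_index.interfaces import DocumentIndex  # assumed path

    class InMemoryIndex(DocumentIndex):
        def __init__(self, index_name, secondary_index_name, *args, **kwargs): ...
        def index(self, chunks): ...
        def delete(self, doc_ids): ...
        def update(self, update_requests): ...
        def id_based_retrieval(self, document_id, chunk_ind, filters): ...
        def keyword_retrieval(self, query, filters, time_decay_multiplier,
                              num_to_retrieve, offset=0): ...
        def semantic_retrieval(self, query, query_embedding, filters,
                               time_decay_multiplier, num_to_retrieve, offset=0): ...
        def hybrid_retrieval(self, query, query_embedding, filters, time_decay_multiplier,
                             num_to_retrieve, offset=0, hybrid_alpha=None): ...
        def admin_retrieval(self, query, filters, num_to_retrieve, offset=0): ...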
@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")

-        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficient for
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
         # indexing / updates / deletes since we have to make a large volume of requests.
         with httpx.Client(http2=True) as http_client:
             index_names = [self.index_name]
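For context on that comment: HTTP/2 lets one connection multiplex the many small requests that bulk deletes and updates generate, instead of paying per-request connection overhead. A sketch with a placeholder endpoint rather than the real Vespa document API path; note that httpx needs the http2 extra installed (pip install "httpx[http2]"):

    import httpx

    doc_ids = ["doc-1", "doc-2"]
    with httpx.Client(http2=True) as http_client:
        for doc_id in doc_ids:
            # Placeholder URL, not the real Vespa endpoint.
            http_client.delete(f"http://localhost:8080/documents/{doc_id}")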
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
             for vespa_chunk_id in vespa_chunk_ids
         ]

-        logger.debug(
-            "Running LLM usefulness eval in parallel (following logging may be out of order)"
-        )
         inference_chunks = run_functions_tuples_in_parallel(
             functions_with_args, allow_failures=True
         )