mirror of https://github.com/danswer-ai/danswer.git
Better description of the document index interfaces (#1188)
@@ -17,6 +17,11 @@ class DocumentInsertionRecord:
 
 @dataclass
 class DocumentMetadata:
+    """
+    Document information that needs to be inserted into Postgres on first time encountering this
+    document during indexing across any of the connectors.
+    """
+
     connector_id: int
     credential_id: int
     document_id: str
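For orientation, a minimal sketch of constructing this record during a connector run; the import path and the field values are assumptions for illustration, not taken from the diff:

from danswer.document_index.interfaces import DocumentMetadata  # assumed module path

# Hypothetical values: first time this document is seen while indexing
# connector 1 under credential 2; the id is whatever the connector assigns.
doc_metadata = DocumentMetadata(
    connector_id=1,
    credential_id=2,
    document_id="web__https://example.com/docs/intro",
)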
@@ -32,11 +37,13 @@ class DocumentMetadata:
 
 @dataclass
 class UpdateRequest:
-    """For all document_ids, update the allowed_users and the boost to the new value
-    ignore if None"""
+    """
+    For all document_ids, update the allowed_users and the boost to the new values
+    Does not update any of the None fields
+    """
 
     document_ids: list[str]
-    # all other fields will be left alone
+    # all other fields except these 4 will always be left alone by the update request
     access: DocumentAccess | None = None
     document_sets: set[str] | None = None
     boost: float | None = None
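To illustrate the None-means-no-change semantics, a hedged sketch: only boost is updated here, while access and document_sets stay untouched because they remain None (document_index stands in for any Updatable implementation):

request = UpdateRequest(
    document_ids=["doc-a", "doc-b"],
    boost=1.5,
)
document_index.update([request])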
@@ -44,6 +51,18 @@ class UpdateRequest:
 
 
 class Verifiable(abc.ABC):
+    """
+    Class must implement document index schema verification. For example, verify that all of the
+    necessary attributes for indexing, querying, filtering, and fields to return from search are
+    all valid in the schema.
+
+    Parameters:
+    - index_name: The name of the primary index currently used for querying
+    - secondary_index_name: The name of the secondary index being built in the background, if it
+            currently exists. Some functions on the document index act on both the primary and
+            secondary index, some act on just one.
+    """
+
     @abc.abstractmethod
     def __init__(
         self,
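A hedged sketch of the kind of check a Verifiable implementation might run; the schema dict shape and the expected field names are invented for illustration only:

EXPECTED_FIELDS = {"document_id", "chunk_id", "content", "embeddings"}  # hypothetical

def verify_schema(schema: dict, expected_embedding_dim: int) -> None:
    missing = EXPECTED_FIELDS - set(schema.get("fields", []))
    if missing:
        raise RuntimeError(f"Index schema missing fields: {missing}")
    if schema.get("embedding_dim") != expected_embedding_dim:
        raise RuntimeError("Embedding dimensionality does not match the schema")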
@@ -62,34 +81,104 @@ class Verifiable(abc.ABC):
         index_embedding_dim: int,
         secondary_index_embedding_dim: int | None,
     ) -> None:
+        """
+        Verify that the document index exists and is consistent with the expectations in the code.
+
+        Parameters:
+        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
+        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
+                behind the scenes. The secondary index should only be built when switching
+                embedding models therefore this dim should be different from the primary index.
+        """
         raise NotImplementedError
 
 
 class Indexable(abc.ABC):
+    """
+    Class must implement the ability to index document chunks
+    """
+
     @abc.abstractmethod
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
     ) -> set[DocumentInsertionRecord]:
-        """Indexes document chunks into the Document Index and return the IDs of all the documents indexed"""
+        """
+        Takes a list of document chunks and indexes them in the document index
+
+        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
+        chunks before reindexing. This is because the document may have gotten shorter since the
+        last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
+        have not been written over.
+
+        NOTE: The chunks of a document are never separated into separate index() calls. So there is
+        no worry of receiving the first 0 through n chunks in one index call and the next n through
+        m chunks of a doc in the next index call.
+
+        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
+        only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
+        it is done automatically outside of this code.
+
+        Parameters:
+        - chunks: Document chunks with all of the information needed for indexing to the document
+                index.
+
+        Returns:
+        List of document ids which map to unique documents and are used for deduping chunks
+        when updating, as well as if the document is newly indexed or already existed and
+        just updated
+        """
         raise NotImplementedError
 
 
 class Deletable(abc.ABC):
+    """
+    Class must implement the ability to delete documents by their unique document ids.
+    """
+
     @abc.abstractmethod
     def delete(self, doc_ids: list[str]) -> None:
-        """Removes the specified documents from the Index"""
+        """
+        Given a list of document ids, hard delete them from the document index
+
+        Parameters:
+        - doc_ids: list of document ids as specified by the connector
+        """
         raise NotImplementedError
 
 
 class Updatable(abc.ABC):
+    """
+    Class must implement the ability to update certain attributes of a document without needing to
+    update all of the fields. Specifically, needs to be able to update:
+    - Access Control List
+    - Document-set membership
+    - Boost value (learning from feedback mechanism)
+    - Whether the document is hidden or not, hidden documents are not returned from search
+    """
+
     @abc.abstractmethod
     def update(self, update_requests: list[UpdateRequest]) -> None:
-        """Updates metadata for the specified documents sets in the Index"""
+        """
+        Updates some set of chunks. The document and fields to update are specified in the update
+        requests. Each update request in the list applies its changes to a list of document ids.
+        None values mean that the field does not need an update.
+
+        Parameters:
+        - update_requests: for a list of document ids in the update request, apply the same updates
+                to all of the documents with those ids. This is for bulk handling efficiency. Many
+                updates are done at the connector level which have many documents for the connector
+        """
         raise NotImplementedError
 
 
 class IdRetrievalCapable(abc.ABC):
+    """
+    Class must implement the ability to retrieve either:
+    - all of the chunks of a document IN ORDER given a document id.
+    - a specific chunk given a document id and a chunk index (0 based)
+    """
+
     @abc.abstractmethod
     def id_based_retrieval(
         self,
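To make the first NOTE above concrete, a hedged sketch of an index() that clears stale chunks before writing; every _backend_* helper is hypothetical, and the DocumentInsertionRecord fields and chunk attribute path are assumed from context:

def index(self, chunks):
    doc_ids = {chunk.source_document.id for chunk in chunks}  # attribute path assumed
    existing = self._backend_existing_doc_ids(doc_ids)  # hypothetical helper
    # Delete first: the new version may have fewer chunks than the old one,
    # and a plain upsert would leave the old tail chunks behind.
    self._backend_delete_chunks(doc_ids)   # hypothetical helper
    self._backend_upsert_chunks(chunks)    # hypothetical helper
    return {
        DocumentInsertionRecord(document_id=doc_id, already_existed=doc_id in existing)
        for doc_id in doc_ids
    }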
@@ -97,10 +186,32 @@ class IdRetrievalCapable(abc.ABC):
         chunk_ind: int | None,
         filters: IndexFilters,
     ) -> list[InferenceChunk]:
+        """
+        Fetch chunk(s) based on document id
+
+        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
+        of a document. Downstream currently assumes that the chunking does not introduce overlaps
+        between the chunks. If there are overlaps for the chunks, then the reconstructed document
+        or extended section will have duplicate segments.
+
+        Parameters:
+        - document_id: document id for which to retrieve the chunk(s)
+        - chunk_ind: chunk index to return, if None, return all of the chunks in order
+        - filters: standard filters object, in this case only the access filter is applied as a
+                permission check
+
+        Returns:
+        list of chunks for the document id or the specific chunk by the specified chunk index
+        and document id
+        """
         raise NotImplementedError
 
 
 class KeywordCapable(abc.ABC):
+    """
+    Class must implement the keyword search functionality
+    """
+
     @abc.abstractmethod
     def keyword_retrieval(
         self,
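A hedged sketch of the reconstruction this enables downstream, relying on the no-overlap assumption stated in the NOTE (the .content attribute on InferenceChunk is assumed):

def reconstruct_document(index, document_id, filters):
    # chunk_ind=None returns every chunk of the document, in order
    chunks = index.id_based_retrieval(document_id, chunk_ind=None, filters=filters)
    # With non-overlapping chunks, concatenation recovers the full text
    return "".join(chunk.content for chunk in chunks)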
@@ -110,10 +221,36 @@ class KeywordCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
+        information required for query time purposes. For example, some details of the document
+        required at indexing time are no longer needed past this point. At the same time, the
+        matching keywords need to be highlighted.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on keyword matching (should be BM25 algorithm ideally)
+        """
         raise NotImplementedError
 
 
 class VectorCapable(abc.ABC):
+    """
+    Class must implement the vector/semantic search functionality
+    """
+
     @abc.abstractmethod
     def semantic_retrieval(
         self,
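To illustrate time_decay_multiplier, a hedged sketch of one plausible decay scheme; the exponential form and the 180-day half-life are assumptions, not the project's actual ranking expression:

import math

def apply_time_decay(score: float, age_days: float, time_decay_multiplier: float = 1.0) -> float:
    half_life_days = 180.0  # assumed constant
    # A 2x or 3x multiplier (from persona settings) makes scores decay 2-3x faster
    return score * math.exp(-math.log(2) * age_days * time_decay_multiplier / half_life_days)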
@@ -124,10 +261,31 @@ class VectorCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run vector/semantic search and return a list of inference chunks.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        best matching chunks based on vector similarity
+        """
         raise NotImplementedError
 
 
 class HybridCapable(abc.ABC):
+    """
+    Class must implement hybrid (keyword + vector) search functionality
+    """
+
     @abc.abstractmethod
     def hybrid_retrieval(
         self,
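For reference, the vector similarity at the heart of this call; a pure-Python cosine similarity sketch (real indices compute this natively over the stored embeddings):

def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    return dot / (norm_a * norm_b)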
@@ -139,10 +297,48 @@ class HybridCapable(abc.ABC):
         offset: int = 0,
         hybrid_alpha: float | None = None,
     ) -> list[InferenceChunk]:
+        """
+        Run hybrid search and return a list of inference chunks.
+
+        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
+        expected to be handled by this function as it may depend on the index implementation.
+        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
+        done here.
+
+        Parameters:
+        - query: unmodified user query. This is needed for getting the matching highlighted
+                keywords
+        - query_embedding: vector representation of the query, must be of the correct
+                dimensionality for the primary index
+        - filters: standard filter object
+        - time_decay_multiplier: how much to decay the document scores as they age. Some queries
+                based on the persona settings, will have this be a 2x or 3x of the default
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+        - hybrid_alpha: weighting between the keyword and vector search results. It is important
+                that the two scores are normalized to the same range so that a meaningful
+                comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
+                on keyword score.
+
+        Returns:
+        best matching chunks based on weighted sum of keyword and vector/semantic search scores
+        """
         raise NotImplementedError
 
 
 class AdminCapable(abc.ABC):
+    """
+    Class must implement a search for the admin "Explorer" page. The assumption here is that the
+    admin is not "searching" for knowledge but has some document already in mind. They are either
+    looking to positively boost it because they know it's a good reference document, looking to
+    negatively boost it as a way of "deprecating", or hiding the document.
+
+    Assuming the admin knows the document name, this search has high emphasis on the title match.
+
+    Suggested implementation:
+    Keyword only, BM25 search with 5x weighting on the title field compared to the contents
+    """
+
     @abc.abstractmethod
     def admin_retrieval(
         self,
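A hedged sketch of the hybrid_alpha weighting described above; the min-max normalization and the example alpha are assumptions, chosen only to show why the two score ranges must be comparable:

def min_max_normalize(scores: list[float]) -> list[float]:
    if not scores:
        return []
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

def combine_hybrid_scores(
    keyword_scores: list[float], vector_scores: list[float], hybrid_alpha: float = 0.5
) -> list[float]:
    kw = min_max_normalize(keyword_scores)
    vec = min_max_normalize(vector_scores)
    # hybrid_alpha=1 -> pure vector score, hybrid_alpha=0 -> pure keyword score
    return [hybrid_alpha * v + (1 - hybrid_alpha) * k for k, v in zip(kw, vec)]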
@@ -151,34 +347,46 @@ class AdminCapable(abc.ABC):
         num_to_retrieve: int,
         offset: int = 0,
     ) -> list[InferenceChunk]:
+        """
+        Run the special search for the admin document explorer page
+
+        Parameters:
+        - query: unmodified user query. Though in this flow probably unmodified is best
+        - filters: standard filter object
+        - num_to_retrieve: number of highest matching chunks to return
+        - offset: number of highest matching chunks to skip (kind of like pagination)
+
+        Returns:
+        list of best matching chunks for the explorer page query
+        """
         raise NotImplementedError
 
 
 class BaseIndex(
     Verifiable,
-    AdminCapable,
-    IdRetrievalCapable,
     Indexable,
     Updatable,
     Deletable,
+    AdminCapable,
+    IdRetrievalCapable,
     abc.ABC,
 ):
-    """All basic functionalities excluding a specific retrieval approach
-    Indices need to be able to
-    - Check that the index exists with a schema definition
-    - Can index documents
-    - Can delete documents
-    - Can update document metadata (such as access permissions and document specific boost)
+    """
+    All basic document index functionalities excluding the actual querying approach.
+
+    As a summary, document indices need to be able to
+    - Verify the schema definition is valid
+    - Index new documents
+    - Update specific attributes of existing documents
+    - Delete documents
+    - Provide a search for the admin document explorer page
+    - Retrieve documents based on document id
     """
 
 
-class KeywordIndex(KeywordCapable, BaseIndex, abc.ABC):
-    pass
-
-
-class VectorIndex(VectorCapable, BaseIndex, abc.ABC):
-    pass
-
-
 class DocumentIndex(KeywordCapable, VectorCapable, HybridCapable, BaseIndex, abc.ABC):
-    pass
+    """
+    A valid document index that can plug into all Danswer flows must implement all of these
+    functionalities, though "technically" it does not need to be keyword or vector capable as
+    currently all default search flows use Hybrid Search.
+    """
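To show how the capabilities compose, a hedged skeleton of a concrete index; the parameter lists are inferred from the hunks above and abbreviated, and abstract methods elided from this diff (e.g. the verification hook) are omitted:

class ToyDocumentIndex(DocumentIndex):
    """Illustrative skeleton only; a real backend (e.g. Vespa) does far more."""

    def index(self, chunks):
        raise NotImplementedError  # clear old chunks, then upsert the new ones

    def delete(self, doc_ids):
        raise NotImplementedError  # hard delete every chunk of these documents

    def update(self, update_requests):
        raise NotImplementedError  # apply the non-None fields per document id

    def id_based_retrieval(self, document_id, chunk_ind, filters):
        raise NotImplementedError

    def keyword_retrieval(self, query, filters, time_decay_multiplier, num_to_retrieve, offset=0):
        raise NotImplementedError

    def semantic_retrieval(self, query, query_embedding, filters, time_decay_multiplier, num_to_retrieve, offset=0):
        raise NotImplementedError

    def hybrid_retrieval(self, query, query_embedding, filters, time_decay_multiplier, num_to_retrieve, offset=0, hybrid_alpha=None):
        raise NotImplementedError

    def admin_retrieval(self, query, filters, num_to_retrieve, offset=0):
        raise NotImplementedError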
@@ -811,7 +811,7 @@ class VespaIndex(DocumentIndex):
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
 
-        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficient for
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
         # indexing / updates / deletes since we have to make a large volume of requests.
         with httpx.Client(http2=True) as http_client:
             index_names = [self.index_name]
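As an aside on that comment, a minimal sketch of what the HTTP/2 client buys: one multiplexed connection for many small requests. Requires the httpx[http2] extra; the URL follows the shape of Vespa's document/v1 API, but the host, namespace, document type, and ids are placeholders:

import httpx

# One client = one connection pool; HTTP/2 multiplexes the many small
# delete calls over a single connection instead of reconnecting each time.
with httpx.Client(http2=True) as http_client:
    for doc_uuid in ["uuid-1", "uuid-2"]:  # placeholder chunk ids
        resp = http_client.delete(
            f"http://localhost:8081/document/v1/default/danswer_chunk/docid/{doc_uuid}"
        )
        resp.raise_for_status()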
@@ -844,9 +844,6 @@ class VespaIndex(DocumentIndex):
             for vespa_chunk_id in vespa_chunk_ids
         ]
 
-        logger.debug(
-            "Running LLM usefulness eval in parallel (following logging may be out of order)"
-        )
         inference_chunks = run_functions_tuples_in_parallel(
             functions_with_args, allow_failures=True
         )
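run_functions_tuples_in_parallel is a danswer utility; a hedged approximation of its observable behavior using the standard library (the real helper's threading details and failure semantics may differ):

from concurrent.futures import ThreadPoolExecutor

def run_functions_tuples_in_parallel(functions_with_args, allow_failures=False):
    # Each entry is (callable, args_tuple); results come back in input order.
    def _call(fn_and_args):
        fn, args = fn_and_args
        try:
            return fn(*args)
        except Exception:
            if allow_failures:
                return None
            raise

    with ThreadPoolExecutor() as pool:
        return list(pool.map(_call, functions_with_args))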