Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-07-03 11:11:45 +02:00)

Commit: welcome to onyx

backend/onyx/document_index/__init__.py (new file, empty)
backend/onyx/document_index/document_index_utils.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import math
import uuid

from sqlalchemy.orm import Session

from onyx.context.search.models import InferenceChunk
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.indexing.models import IndexChunk


DEFAULT_BATCH_SIZE = 30
DEFAULT_INDEX_NAME = "danswer_chunk"


def get_both_index_names(db_session: Session) -> tuple[str, str | None]:
    search_settings = get_current_search_settings(db_session)

    search_settings_new = get_secondary_search_settings(db_session)
    if not search_settings_new:
        return search_settings.index_name, None

    return search_settings.index_name, search_settings_new.index_name


def translate_boost_count_to_multiplier(boost: int) -> float:
    """Mapping boost integer values to a multiplier according to a sigmoid curve.
    Piecewise such that at many downvotes, it's 0.5x the score and with many upvotes
    it is 2x the score. This should be in line with the Vespa calculation."""
    # 3 in the equation below stretches it out to hit asymptotes slower
    if boost < 0:
        # 0.5 + sigmoid -> range of 0.5 to 1
        return 0.5 + (1 / (1 + math.exp(-1 * boost / 3)))

    # 2 x sigmoid -> range of 1 to 2
    return 2 / (1 + math.exp(-1 * boost / 3))


def get_uuid_from_chunk(
    chunk: IndexChunk | InferenceChunk, mini_chunk_ind: int = 0
) -> uuid.UUID:
    doc_str = (
        chunk.document_id
        if isinstance(chunk, InferenceChunk)
        else chunk.source_document.id
    )
    # Web parsing URL duplicate catching
    if doc_str and doc_str[-1] == "/":
        doc_str = doc_str[:-1]
    unique_identifier_string = "_".join(
        [doc_str, str(chunk.chunk_id), str(mini_chunk_ind)]
    )
    if chunk.large_chunk_reference_ids:
        unique_identifier_string += "_large" + "_".join(
            [
                str(referenced_chunk_id)
                for referenced_chunk_id in chunk.large_chunk_reference_ids
            ]
        )
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
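
A quick usage sketch (not part of the commit): the multiplier is 1.0 at a boost of 0 and flattens out toward 0.5x / 2x at the extremes, mirroring the document_boost rank function in the Vespa schema further down. The printed values below are approximate.

from onyx.document_index.document_index_utils import translate_boost_count_to_multiplier

for boost in (-20, -3, 0, 3, 20):
    # roughly 0.50, 0.77, 1.00, 1.46, 2.00
    print(boost, round(translate_boost_count_to_multiplier(boost), 2))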
backend/onyx/document_index/factory.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from sqlalchemy.orm import Session

from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.vespa.index import VespaIndex
from shared_configs.configs import MULTI_TENANT


def get_default_document_index(
    primary_index_name: str,
    secondary_index_name: str | None,
) -> DocumentIndex:
    """Primary index is the index that is used for querying/updating etc.
    Secondary index is for when both the currently used index and the upcoming
    index need to be updated; updates are applied to both indices"""
    # Currently only supporting Vespa
    return VespaIndex(
        index_name=primary_index_name,
        secondary_index_name=secondary_index_name,
        multitenant=MULTI_TENANT,
    )


def get_current_primary_default_document_index(db_session: Session) -> DocumentIndex:
    """
    TODO: Use redis to cache this or something
    """
    search_settings = get_current_search_settings(db_session)
    return get_default_document_index(
        primary_index_name=search_settings.index_name,
        secondary_index_name=None,
    )
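
A usage sketch (not part of the commit), assuming a SQLAlchemy `db_session` is already available: the index names resolved from the current (and optional secondary) search settings feed straight into the factory.

from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.factory import get_default_document_index

primary_name, secondary_name = get_both_index_names(db_session)  # db_session assumed
document_index = get_default_document_index(
    primary_index_name=primary_name,
    secondary_index_name=secondary_name,
)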
backend/onyx/document_index/interfaces.py (new file, 399 lines)
@@ -0,0 +1,399 @@
import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any

from onyx.access.models import DocumentAccess
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.model_server_models import Embedding


@dataclass(frozen=True)
class DocumentInsertionRecord:
    document_id: str
    already_existed: bool


@dataclass(frozen=True)
class VespaChunkRequest:
    document_id: str
    min_chunk_ind: int | None = None
    max_chunk_ind: int | None = None

    @property
    def is_capped(self) -> bool:
        # If the max chunk index is not None, then the chunk request is capped
        # If the min chunk index is None, we can assume the min is 0
        return self.max_chunk_ind is not None

    @property
    def range(self) -> int | None:
        if self.max_chunk_ind is not None:
            return (self.max_chunk_ind - (self.min_chunk_ind or 0)) + 1
        return None


@dataclass
class DocumentMetadata:
    """
    Document information that needs to be inserted into Postgres on first time encountering this
    document during indexing across any of the connectors.
    """

    connector_id: int
    credential_id: int
    document_id: str
    semantic_identifier: str
    first_link: str
    doc_updated_at: datetime | None = None
    # Emails, not necessarily attached to users
    # Users may not be in Onyx
    primary_owners: list[str] | None = None
    secondary_owners: list[str] | None = None
    from_ingestion_api: bool = False


@dataclass
class VespaDocumentFields:
    """
    Specifies fields in Vespa for a document. Fields set to None will be ignored.
    Perhaps we should name this in an implementation agnostic fashion, but it's more
    understandable like this for now.
    """

    # all other fields except these 4 will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


@dataclass
class UpdateRequest:
    """
    For all document_ids, update the allowed_users and the boost to the new values
    Does not update any of the None fields
    """

    document_ids: list[str]
    # all other fields except these 4 will always be left alone by the update request
    access: DocumentAccess | None = None
    document_sets: set[str] | None = None
    boost: float | None = None
    hidden: bool | None = None


class Verifiable(abc.ABC):
    """
    Class must implement document index schema verification. For example, verify that all of the
    necessary attributes for indexing, querying, filtering, and fields to return from search are
    all valid in the schema.

    Parameters:
    - index_name: The name of the primary index currently used for querying
    - secondary_index_name: The name of the secondary index being built in the background, if it
            currently exists. Some functions on the document index act on both the primary and
            secondary index, some act on just one.
    """

    @abc.abstractmethod
    def __init__(
        self,
        index_name: str,
        secondary_index_name: str | None,
        *args: Any,
        **kwargs: Any
    ) -> None:
        super().__init__(*args, **kwargs)
        self.index_name = index_name
        self.secondary_index_name = secondary_index_name

    @abc.abstractmethod
    def ensure_indices_exist(
        self,
        index_embedding_dim: int,
        secondary_index_embedding_dim: int | None,
    ) -> None:
        """
        Verify that the document index exists and is consistent with the expectations in the code.

        Parameters:
        - index_embedding_dim: Vector dimensionality for the vector similarity part of the search
        - secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
                behind the scenes. The secondary index should only be built when switching
                embedding models, therefore this dim should be different from the primary index.
        """
        raise NotImplementedError

    @staticmethod
    @abc.abstractmethod
    def register_multitenant_indices(
        indices: list[str],
        embedding_dims: list[int],
    ) -> None:
        """
        Register multitenant indices with the document index.
        """
        raise NotImplementedError


class Indexable(abc.ABC):
    """
    Class must implement the ability to index document chunks
    """

    @abc.abstractmethod
    def index(
        self,
        chunks: list[DocMetadataAwareIndexChunk],
        fresh_index: bool = False,
    ) -> set[DocumentInsertionRecord]:
        """
        Takes a list of document chunks and indexes them in the document index

        NOTE: When a document is reindexed/updated here, it must clear all of the existing document
        chunks before reindexing. This is because the document may have gotten shorter since the
        last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
        have not been written over.

        NOTE: The chunks of a document are never separated into separate index() calls. So there is
        no worry of receiving the first 0 through n chunks in one index call and the next n through
        m chunks of a document in the next index call.

        NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
        only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
        it is done automatically outside of this code.

        NOTE: The fresh_index parameter, when set to True, assumes no documents have been previously
        indexed for the given index/tenant. This can be used to optimize the indexing process for
        new or empty indices.

        Parameters:
        - chunks: Document chunks with all of the information needed for indexing to the document
                index.
        - fresh_index: Boolean indicating whether this is a fresh index with no existing documents.

        Returns:
            List of document ids which map to unique documents and are used for deduping chunks
            when updating, as well as if the document is newly indexed or already existed and
            just updated
        """
        raise NotImplementedError


class Deletable(abc.ABC):
    """
    Class must implement the ability to delete documents by their unique document ids.
    """

    @abc.abstractmethod
    def delete_single(self, doc_id: str) -> int:
        """
        Given a single document id, hard delete it from the document index

        Parameters:
        - doc_id: document id as specified by the connector
        """
        raise NotImplementedError

    @abc.abstractmethod
    def delete(self, doc_ids: list[str]) -> None:
        """
        Given a list of document ids, hard delete them from the document index

        Parameters:
        - doc_ids: list of document ids as specified by the connector
        """
        raise NotImplementedError


class Updatable(abc.ABC):
    """
    Class must implement the ability to update certain attributes of a document without needing to
    update all of the fields. Specifically, needs to be able to update:
    - Access Control List
    - Document-set membership
    - Boost value (learning from feedback mechanism)
    - Whether the document is hidden or not; hidden documents are not returned from search
    """

    @abc.abstractmethod
    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
        """
        Updates all chunks for a document with the specified fields.
        None values mean that the field does not need an update.

        The rationale for a single update function is that it allows retries and parallelism
        to happen at a higher / more strategic level, is simpler to read, and allows
        us to individually handle error conditions per document.

        Parameters:
        - fields: the fields to update in the document. Any field set to None will not be changed.

        Return:
            None
        """
        raise NotImplementedError

    @abc.abstractmethod
    def update(self, update_requests: list[UpdateRequest]) -> None:
        """
        Updates some set of chunks. The document and fields to update are specified in the update
        requests. Each update request in the list applies its changes to a list of document ids.
        None values mean that the field does not need an update.

        Parameters:
        - update_requests: for a list of document ids in the update request, apply the same updates
                to all of the documents with those ids. This is for bulk handling efficiency. Many
                updates are done at the connector level which have many documents for the connector
        """
        raise NotImplementedError


class IdRetrievalCapable(abc.ABC):
    """
    Class must implement the ability to retrieve either:
    - all of the chunks of a document IN ORDER given a document id.
    - a specific chunk given a document id and a chunk index (0 based)
    """

    @abc.abstractmethod
    def id_based_retrieval(
        self,
        chunk_requests: list[VespaChunkRequest],
        filters: IndexFilters,
        batch_retrieval: bool = False,
    ) -> list[InferenceChunkUncleaned]:
        """
        Fetch chunk(s) based on document id

        NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
        of a document. Downstream currently assumes that the chunking does not introduce overlaps
        between the chunks. If there are overlaps for the chunks, then the reconstructed document
        or extended section will have duplicate segments.

        Parameters:
        - chunk_requests: requests containing the document id and the chunk range to retrieve
        - filters: Filters to apply to retrieval
        - batch_retrieval: If True, perform a batch retrieval

        Returns:
            list of chunks for the document id or the specific chunk by the specified chunk index
            and document id
        """
        raise NotImplementedError


class HybridCapable(abc.ABC):
    """
    Class must implement hybrid (keyword + vector) search functionality
    """

    @abc.abstractmethod
    def hybrid_retrieval(
        self,
        query: str,
        query_embedding: Embedding,
        final_keywords: list[str] | None,
        filters: IndexFilters,
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.

        NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
        expected to be handled by this function as it may depend on the index implementation.
        Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
        done here.

        Parameters:
        - query: unmodified user query. This is needed for getting the matching highlighted
                keywords
        - query_embedding: vector representation of the query, must be of the correct
                dimensionality for the primary index
        - final_keywords: Final keywords to be used from the query, defaults to query if not set
        - filters: standard filter object
        - hybrid_alpha: weighting between the keyword and vector search results. It is important
                that the two scores are normalized to the same range so that a meaningful
                comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
                on keyword score.
        - time_decay_multiplier: how much to decay the document scores as they age. Some queries,
                based on the persona settings, will have this be 2x or 3x of the default
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like pagination)

        Returns:
            best matching chunks based on weighted sum of keyword and vector/semantic search scores
        """
        raise NotImplementedError


class AdminCapable(abc.ABC):
    """
    Class must implement a search for the admin "Explorer" page. The assumption here is that the
    admin is not "searching" for knowledge but has some document already in mind. They are either
    looking to positively boost it because they know it's a good reference document, looking to
    negatively boost it as a way of "deprecating", or hiding the document.

    Assuming the admin knows the document name, this search has high emphasis on the title match.

    Suggested implementation:
    Keyword only, BM25 search with 5x weighting on the title field compared to the contents
    """

    @abc.abstractmethod
    def admin_retrieval(
        self,
        query: str,
        filters: IndexFilters,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run the special search for the admin document explorer page

        Parameters:
        - query: unmodified user query. Though in this flow probably unmodified is best
        - filters: standard filter object
        - num_to_retrieve: number of highest matching chunks to return
        - offset: number of highest matching chunks to skip (kind of like pagination)

        Returns:
            list of best matching chunks for the explorer page query
        """
        raise NotImplementedError


class BaseIndex(
    Verifiable,
    Indexable,
    Updatable,
    Deletable,
    AdminCapable,
    IdRetrievalCapable,
    abc.ABC,
):
    """
    All basic document index functionalities excluding the actual querying approach.

    As a summary, document indices need to be able to
    - Verify the schema definition is valid
    - Index new documents
    - Update specific attributes of existing documents
    - Delete documents
    - Provide a search for the admin document explorer page
    - Retrieve documents based on document id
    """


class DocumentIndex(HybridCapable, BaseIndex, abc.ABC):
    """
    A valid document index that can plug into all Onyx flows must implement all of these
    functionalities, though "technically" it does not need to be keyword or vector capable as
    currently all default search flows use Hybrid Search.
    """
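
A small sketch (not part of the commit) of how the interface composes: DocumentIndex pulls together the capability ABCs above, so a new backend only has to fill in their abstract methods. Listing them via the standard abc machinery:

from onyx.document_index.interfaces import DocumentIndex

# Expected to include __init__, ensure_indices_exist, register_multitenant_indices,
# index, delete, delete_single, update, update_single, id_based_retrieval,
# hybrid_retrieval and admin_retrieval
print(sorted(DocumentIndex.__abstractmethods__))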
backend/onyx/document_index/vespa/__init__.py (new file, empty)

backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd (new file, 221 lines)
@@ -0,0 +1,221 @@
schema DANSWER_CHUNK_NAME {
    document DANSWER_CHUNK_NAME {
        TENANT_ID_REPLACEMENT
        # Not to be confused with the UUID generated for this chunk which is called documentid by default
        field document_id type string {
            indexing: summary | attribute
            attribute: fast-search
            rank: filter
        }
        field chunk_id type int {
            indexing: summary | attribute
        }
        # Displayed in the UI as the main identifier for the doc
        field semantic_identifier type string {
            indexing: summary | attribute
        }
        # Must have an additional field for whether to skip title embeddings
        # This information cannot be extracted from either the title field nor title embedding
        field skip_title type bool {
            indexing: attribute
        }
        # May not always match the `semantic_identifier` e.g. for Slack docs the
        # `semantic_identifier` will be the channel name, but the `title` will be empty
        field title type string {
            indexing: summary | index | attribute
            index: enable-bm25
        }
        field content type string {
            indexing: summary | index
            index: enable-bm25
        }
        # duplication of `content` is far from ideal, but is needed for
        # non-gram based highlighting for now. If the capability to re-use a
        # single field to do both is added, `content_summary` should be removed
        field content_summary type string {
            indexing: summary | index
            summary: dynamic
        }
        # Title embedding (x1)
        field title_embedding type tensor<float>(x[VARIABLE_DIM]) {
            indexing: attribute | index
            attribute {
                distance-metric: angular
            }
        }
        # Content embeddings (chunk + optional mini chunks embeddings)
        # "t" and "x" are arbitrary names, not special keywords
        field embeddings type tensor<float>(t{},x[VARIABLE_DIM]) {
            indexing: attribute | index
            attribute {
                distance-metric: angular
            }
        }
        # Starting section of the doc, currently unused as it has been replaced by match highlighting
        field blurb type string {
            indexing: summary | attribute
        }
        # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
        field source_type type string {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
        # Can also index links https://docs.vespa.ai/en/reference/schema-reference.html#attribute
        # URL type matching
        field source_links type string {
            indexing: summary | attribute
        }
        field section_continuation type bool {
            indexing: summary | attribute
        }
        # Technically this one should be int, but can't change without causing breaks to existing index
        field boost type float {
            indexing: summary | attribute
        }
        field hidden type bool {
            indexing: summary | attribute
            rank: filter
        }
        # Needs to have a separate Attribute list for efficient filtering
        field metadata_list type array<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
        # If chunk is a large chunk, this will contain the ids of the smaller chunks
        field large_chunk_reference_ids type array<int> {
            indexing: summary | attribute
        }
        field metadata type string {
            indexing: summary | attribute
        }
        field metadata_suffix type string {
            indexing: summary | attribute
        }
        field doc_updated_at type int {
            indexing: summary | attribute
        }
        field primary_owners type array<string> {
            indexing: summary | attribute
        }
        field secondary_owners type array<string> {
            indexing: summary | attribute
        }
        field access_control_list type weightedset<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
        field document_sets type weightedset<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
    }

    # If using different tokenization settings, the fieldset has to be removed, and the field must
    # be specified in the yql like:
    # + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
    # + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
    # Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
    fieldset default {
        fields: content, title
    }

    rank-profile default_rank {
        inputs {
            query(decay_factor) float
        }

        function inline document_boost() {
            # 0.5 to 2x score: piecewise sigmoid function stretched out by factor of 3
            # meaning requires 3x the number of feedback votes to have default sigmoid effect
            expression: if(attribute(boost) < 0, 0.5 + (1 / (1 + exp(-attribute(boost) / 3))), 2 / (1 + exp(-attribute(boost) / 3)))
        }

        function inline document_age() {
            # Time in years (91.3 days ~= 3 Months ~= 1 fiscal quarter if no age found)
            expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
        }

        # Document score decays from 1 to 0.75 as age of last updated time increases
        function inline recency_bias() {
            expression: max(1 / (1 + query(decay_factor) * document_age), 0.75)
        }

        match-features: recency_bias
    }

    rank-profile hybrid_searchVARIABLE_DIM inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }

        function title_vector_score() {
            expression {
                # If no good matching titles, then it should use the context embeddings rather than having some
                # irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
                # matching content score getting the full score
                max(closeness(field, embeddings), closeness(field, title_embedding))
            }
        }

        # First phase must be vector to allow hits that have no keyword matches
        first-phase {
            expression: closeness(field, embeddings)
        }

        # Weighted average between Vector Search and BM-25
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
                    (
                        query(alpha) * (
                            (query(title_content_ratio) * normalize_linear(title_vector_score))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
                        )
                    )

                    +

                    # Weighted Keyword Similarity Score
                    # Note: for the BM25 Title score, it requires decent stopword removal in the query
                    # This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
                    (
                        (1 - query(alpha)) * (
                            (query(title_content_ratio) * normalize_linear(bm25(title)))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
                        )
                    )
                )
                # Boost based on user feedback
                * document_boost
                # Decay factor based on time document was last updated
                * recency_bias
            }
            rerank-count: 1000
        }

        match-features {
            bm25(title)
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
            document_boost
            recency_bias
            closest(embeddings)
        }
    }

    # Used when searching from the admin UI for a specific doc to hide / boost
    # Very heavily prioritize title
    rank-profile admin_search inherits default, default_rank {
        first-phase {
            expression: bm25(content) + (5 * bm25(title))
        }
    }
}
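
The schema above is a template: DANSWER_CHUNK_NAME, VARIABLE_DIM and TENANT_ID_REPLACEMENT are placeholder tokens that get swapped out when the application package is built. A rough Python sketch of that substitution follows (the production version, in VespaIndex.ensure_indices_exist below, uses the replacement patterns from vespa_constants; the literal token strings and example values here are assumptions):

index_name = "danswer_chunk"  # example index name
embedding_dim = 768           # example embedding dimensionality

with open("danswer_chunk.sd") as f:
    schema_template = f.read()

schema = (
    schema_template.replace("TENANT_ID_REPLACEMENT", "")
    .replace("DANSWER_CHUNK_NAME", index_name)
    .replace("VARIABLE_DIM", str(embedding_dim))
)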
backend/onyx/document_index/vespa/app_config/services.xml (new file, 47 lines)
@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="utf-8" ?>
<services version="1.0">
    <container id="default" version="1.0">
        <document-api/>
        <search/>
        <http>
            <server id="default" port="8081"/>
        </http>
        <nodes>
            <node hostalias="danswer-node" />
        </nodes>
    </container>
    <content id="danswer_index" version="1.0">
        <redundancy>1</redundancy>
        <documents>
            <!-- <document type="danswer_chunk" mode="index" /> -->
            DOCUMENT_REPLACEMENT
        </documents>
        <nodes>
            <node hostalias="danswer-node" distribution-key="0" />
        </nodes>
        <tuning>
            <resource-limits>
                <!-- Default is 75% but this can be increased for Dockerized deployments -->
                <!-- https://docs.vespa.ai/en/operations/feed-block.html -->
                <disk>0.75</disk>
            </resource-limits>
        </tuning>
        <engine>
            <proton>
                <tuning>
                    <searchnode>
                        <requestthreads>
                            <persearch>SEARCH_THREAD_NUMBER</persearch>
                        </requestthreads>
                    </searchnode>
                </tuning>
            </proton>
        </engine>
        <config name="vespa.config.search.summary.juniperrc">
            <max_matches>3</max_matches>
            <length>750</length>
            <surround_max>350</surround_max>
            <min_length>300</min_length>
        </config>
    </content>
</services>
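
Like the schema, services.xml is a template. The DOCUMENT_REPLACEMENT and SEARCH_THREAD_NUMBER placeholders are filled in at deploy time; a minimal sketch of that substitution, reusing the helper defined in index.py below (the literal token strings and the thread count are assumptions, the production code takes them from vespa_constants and VESPA_SEARCHER_THREADS):

from onyx.document_index.vespa.index import _create_document_xml_lines

with open("services.xml") as f:
    services_template = f.read()

services = services_template.replace(
    "DOCUMENT_REPLACEMENT", _create_document_xml_lines(["danswer_chunk"])
).replace("SEARCH_THREAD_NUMBER", "2")  # example searcher thread count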
backend/onyx/document_index/vespa/app_config/validation-overrides.xml (new file, 8 lines)
@@ -0,0 +1,8 @@
<validation-overrides>
    <allow
        until="DATE_REPLACEMENT"
        comment="We need to be able to create/delete indices for swapping models">schema-removal</allow>
    <allow
        until="DATE_REPLACEMENT"
        comment="We need to be able to update the schema for updates to the Onyx schema">indexing-change</allow>
</validation-overrides>
backend/onyx/document_index/vespa/chunk_retrieval.py (new file, 430 lines)
@@ -0,0 +1,430 @@
import json
import string
from collections.abc import Callable
from collections.abc import Mapping
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast

import httpx
from retry import retry

from onyx.configs.app_configs import LOG_VESPA_TIMING_INFORMATION
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
    build_vespa_filters,
)
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
    build_vespa_id_based_retrieval_yql,
)
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import BLURB
from onyx.document_index.vespa_constants import BOOST
from onyx.document_index.vespa_constants import CHUNK_ID
from onyx.document_index.vespa_constants import CONTENT
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import MAX_ID_SEARCH_QUERY_SIZE
from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
from onyx.document_index.vespa_constants import METADATA
from onyx.document_index.vespa_constants import METADATA_SUFFIX
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
from onyx.document_index.vespa_constants import RECENCY_BIAS
from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
from onyx.document_index.vespa_constants import SECONDARY_OWNERS
from onyx.document_index.vespa_constants import SECTION_CONTINUATION
from onyx.document_index.vespa_constants import SEMANTIC_IDENTIFIER
from onyx.document_index.vespa_constants import SOURCE_LINKS
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import YQL_BASE
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel

logger = setup_logger()


def _process_dynamic_summary(
    dynamic_summary: str, max_summary_length: int = 400
) -> list[str]:
    if not dynamic_summary:
        return []

    current_length = 0
    processed_summary: list[str] = []
    for summary_section in dynamic_summary.split("<sep />"):
        # if we're past the desired max length, break at the last word
        if current_length + len(summary_section) >= max_summary_length:
            summary_section = summary_section[: max_summary_length - current_length]
            summary_section = summary_section.lstrip()  # remove any leading whitespace

            # handle the case where the truncated section is either just a
            # single (partial) word or if it's empty
            first_space = summary_section.find(" ")
            if first_space == -1:
                # add ``...`` to previous section
                if processed_summary:
                    processed_summary[-1] += "..."
                break

            # handle the valid truncated section case
            summary_section = summary_section.rsplit(" ", 1)[0]
            if summary_section[-1] in string.punctuation:
                summary_section = summary_section[:-1]
            summary_section += "..."
            processed_summary.append(summary_section)
            break

        processed_summary.append(summary_section)
        current_length += len(summary_section)

    return processed_summary


def _vespa_hit_to_inference_chunk(
    hit: dict[str, Any], null_score: bool = False
) -> InferenceChunkUncleaned:
    fields = cast(dict[str, Any], hit["fields"])

    # parse fields that are stored as strings, but are really json / datetime
    metadata = json.loads(fields[METADATA]) if METADATA in fields else {}
    updated_at = (
        datetime.fromtimestamp(fields[DOC_UPDATED_AT], tz=timezone.utc)
        if DOC_UPDATED_AT in fields
        else None
    )

    match_highlights = _process_dynamic_summary(
        # fallback to regular `content` if the `content_summary` field
        # isn't present
        dynamic_summary=hit["fields"].get(CONTENT_SUMMARY, hit["fields"][CONTENT]),
    )
    semantic_identifier = fields.get(SEMANTIC_IDENTIFIER, "")
    if not semantic_identifier:
        logger.error(
            f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
        )

    source_links = fields.get(SOURCE_LINKS, {})
    source_links_dict_unprocessed = (
        json.loads(source_links) if isinstance(source_links, str) else source_links
    )
    source_links_dict = {
        int(k): v
        for k, v in cast(dict[str, str], source_links_dict_unprocessed).items()
    }

    return InferenceChunkUncleaned(
        chunk_id=fields[CHUNK_ID],
        blurb=fields.get(BLURB, ""),  # Unused
        content=fields[CONTENT],  # Includes extra title prefix and metadata suffix
        source_links=source_links_dict or {0: ""},
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
        title=fields.get(TITLE),
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
        recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
        score=None if null_score else hit.get("relevance", 0),
        hidden=fields.get(HIDDEN, False),
        primary_owners=fields.get(PRIMARY_OWNERS),
        secondary_owners=fields.get(SECONDARY_OWNERS),
        large_chunk_reference_ids=fields.get(LARGE_CHUNK_REFERENCE_IDS, []),
        metadata=metadata,
        metadata_suffix=fields.get(METADATA_SUFFIX),
        match_highlights=match_highlights,
        updated_at=updated_at,
    )


def _get_chunks_via_visit_api(
    chunk_request: VespaChunkRequest,
    index_name: str,
    filters: IndexFilters,
    field_names: list[str] | None = None,
    get_large_chunks: bool = False,
) -> list[dict]:
    # Constructing the URL for the Visit API
    # NOTE: visit API uses the same URL as the document API, but with different params
    url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)

    # build the list of fields to retrieve
    field_set_list = (
        None
        if not field_names
        else [f"{index_name}:{field_name}" for field_name in field_names]
    )
    acl_fieldset_entry = f"{index_name}:{ACCESS_CONTROL_LIST}"
    if (
        field_set_list
        and filters.access_control_list
        and acl_fieldset_entry not in field_set_list
    ):
        field_set_list.append(acl_fieldset_entry)
    field_set = ",".join(field_set_list) if field_set_list else None

    # build filters
    selection = f"{index_name}.document_id=='{chunk_request.document_id}'"

    if chunk_request.is_capped:
        selection += f" and {index_name}.chunk_id>={chunk_request.min_chunk_ind or 0}"
        selection += f" and {index_name}.chunk_id<={chunk_request.max_chunk_ind}"
    if not get_large_chunks:
        selection += f" and {index_name}.large_chunk_reference_ids == null"

    # Setting up the selection criteria in the query parameters
    params = {
        # NOTE: Document Selector Language doesn't allow `contains`, so we can't check
        # for the ACL in the selection. Instead, we have to check as a postfilter
        "selection": selection,
        "continuation": None,
        "wantedDocumentCount": 1_000,
        "fieldSet": field_set,
    }

    document_chunks: list[dict] = []
    while True:
        try:
            filtered_params = {k: v for k, v in params.items() if v is not None}
            with get_vespa_http_client() as http_client:
                response = http_client.get(url, params=filtered_params)
                response.raise_for_status()
        except httpx.HTTPError as e:
            error_base = "Failed to query Vespa"
            logger.error(
                f"{error_base}:\n"
                f"Request URL: {e.request.url}\n"
                f"Request Headers: {e.request.headers}\n"
                f"Request Payload: {params}\n"
                f"Exception: {str(e)}"
            )
            raise httpx.HTTPError(error_base) from e

        # Check if the response contains any documents
        response_data = response.json()
        if "documents" in response_data:
            for document in response_data["documents"]:
                if filters.access_control_list:
                    document_acl = document["fields"].get(ACCESS_CONTROL_LIST)
                    if not document_acl or not any(
                        user_acl_entry in document_acl
                        for user_acl_entry in filters.access_control_list
                    ):
                        continue
                document_chunks.append(document)

        # Check for continuation token to handle pagination
        if "continuation" in response_data and response_data["continuation"]:
            params["continuation"] = response_data["continuation"]
        else:
            break  # Exit loop if no continuation token

    return document_chunks


@retry(tries=10, delay=1, backoff=2)
def get_all_vespa_ids_for_document_id(
    document_id: str,
    index_name: str,
    filters: IndexFilters | None = None,
    get_large_chunks: bool = False,
) -> list[str]:
    document_chunks = _get_chunks_via_visit_api(
        chunk_request=VespaChunkRequest(document_id=document_id),
        index_name=index_name,
        filters=filters or IndexFilters(access_control_list=None),
        field_names=[DOCUMENT_ID],
        get_large_chunks=get_large_chunks,
    )
    return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]


def parallel_visit_api_retrieval(
    index_name: str,
    chunk_requests: list[VespaChunkRequest],
    filters: IndexFilters,
    get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
    functions_with_args: list[tuple[Callable, tuple]] = [
        (
            _get_chunks_via_visit_api,
            (chunk_request, index_name, filters, get_large_chunks),
        )
        for chunk_request in chunk_requests
    ]

    parallel_results = run_functions_tuples_in_parallel(
        functions_with_args, allow_failures=True
    )

    # Any failures to retrieve would give a None, drop the Nones and empty lists
    vespa_chunk_sets = [res for res in parallel_results if res]

    flattened_vespa_chunks = []
    for chunk_set in vespa_chunk_sets:
        flattened_vespa_chunks.extend(chunk_set)

    inference_chunks = [
        _vespa_hit_to_inference_chunk(chunk, null_score=True)
        for chunk in flattened_vespa_chunks
    ]

    return inference_chunks


@retry(tries=3, delay=1, backoff=2)
def query_vespa(
    query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
    if "query" in query_params and not cast(str, query_params["query"]).strip():
        raise ValueError("No/empty query received")

    params = dict(
        **query_params,
        **{
            "presentation.timing": True,
        }
        if LOG_VESPA_TIMING_INFORMATION
        else {},
    )

    try:
        with get_vespa_http_client() as http_client:
            response = http_client.post(SEARCH_ENDPOINT, json=params)
            response.raise_for_status()
    except httpx.HTTPError as e:
        error_base = "Failed to query Vespa"
        logger.error(
            f"{error_base}:\n"
            f"Request URL: {e.request.url}\n"
            f"Request Headers: {e.request.headers}\n"
            f"Request Payload: {params}\n"
            f"Exception: {str(e)}"
        )
        raise httpx.HTTPError(error_base) from e

    response_json: dict[str, Any] = response.json()

    if LOG_VESPA_TIMING_INFORMATION:
        logger.debug("Vespa timing info: %s", response_json.get("timing"))
    hits = response_json["root"].get("children", [])

    if not hits:
        logger.warning(
            f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
        )
        logger.debug(f"Vespa Response: {response.text}")

    for hit in hits:
        if hit["fields"].get(CONTENT) is None:
            identifier = hit["fields"].get("documentid") or hit["id"]
            logger.error(
                f"Vespa Index with Vespa ID {identifier} has no contents. "
                f"This is invalid because the vector is not meaningful and keyword search cannot "
                f"fetch this document"
            )

    filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]

    inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
    # Good Debugging Spot
    return inference_chunks


def _get_chunks_via_batch_search(
    index_name: str,
    chunk_requests: list[VespaChunkRequest],
    filters: IndexFilters,
    get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
    if not chunk_requests:
        return []

    filters_str = build_vespa_filters(filters=filters, include_hidden=True)

    yql = (
        YQL_BASE.format(index_name=index_name)
        + filters_str
        + build_vespa_id_based_retrieval_yql(chunk_requests[0])
    )
    chunk_requests.pop(0)

    for request in chunk_requests:
        yql += " or " + build_vespa_id_based_retrieval_yql(request)
    params: dict[str, str | int | float] = {
        "yql": yql,
        "hits": MAX_ID_SEARCH_QUERY_SIZE,
    }

    inference_chunks = query_vespa(params)
    if not get_large_chunks:
        inference_chunks = [
            chunk for chunk in inference_chunks if not chunk.large_chunk_reference_ids
        ]
    inference_chunks.sort(key=lambda chunk: chunk.chunk_id)
    return inference_chunks


def batch_search_api_retrieval(
    index_name: str,
    chunk_requests: list[VespaChunkRequest],
    filters: IndexFilters,
    get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
    retrieved_chunks: list[InferenceChunkUncleaned] = []
    capped_requests: list[VespaChunkRequest] = []
    uncapped_requests: list[VespaChunkRequest] = []
    chunk_count = 0
    for req_ind, request in enumerate(chunk_requests, start=1):
        # All requests without a chunk range are uncapped
        # Uncapped requests are retrieved using the Visit API
        range = request.range
        if range is None:
            uncapped_requests.append(request)
            continue

        if (
            chunk_count + range > MAX_ID_SEARCH_QUERY_SIZE
            or req_ind % MAX_OR_CONDITIONS == 0
        ):
            retrieved_chunks.extend(
                _get_chunks_via_batch_search(
                    index_name=index_name,
                    chunk_requests=capped_requests,
                    filters=filters,
                    get_large_chunks=get_large_chunks,
                )
            )
            capped_requests = []
            chunk_count = 0
        capped_requests.append(request)
        chunk_count += range

    if capped_requests:
        retrieved_chunks.extend(
            _get_chunks_via_batch_search(
                index_name=index_name,
                chunk_requests=capped_requests,
                filters=filters,
                get_large_chunks=get_large_chunks,
            )
        )

    if uncapped_requests:
        logger.debug(f"Retrieving {len(uncapped_requests)} uncapped requests")
        retrieved_chunks.extend(
            parallel_visit_api_retrieval(
                index_name, uncapped_requests, filters, get_large_chunks
            )
        )

    return retrieved_chunks
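
A usage sketch (not part of the commit): fetch chunks 0 through 5 of one document and every chunk of another. The capped request goes through the batch search path while the uncapped one is routed to the Visit API; the document ids and index name are placeholders.

from onyx.context.search.models import IndexFilters
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval

chunks = batch_search_api_retrieval(
    index_name="danswer_chunk",  # example index name
    chunk_requests=[
        VespaChunkRequest(document_id="doc-a", min_chunk_ind=0, max_chunk_ind=5),
        VespaChunkRequest(document_id="doc-b"),  # uncapped: all chunks of the document
    ],
    filters=IndexFilters(access_control_list=None),
)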
backend/onyx/document_index/vespa/deletion.py (new file, 65 lines)
@@ -0,0 +1,65 @@
import concurrent.futures

import httpx
from retry import retry

from onyx.document_index.vespa.chunk_retrieval import (
    get_all_vespa_ids_for_document_id,
)
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import NUM_THREADS
from onyx.utils.logger import setup_logger

logger = setup_logger()


CONTENT_SUMMARY = "content_summary"


@retry(tries=3, delay=1, backoff=2)
def _delete_vespa_doc_chunks(
    document_id: str, index_name: str, http_client: httpx.Client
) -> None:
    doc_chunk_ids = get_all_vespa_ids_for_document_id(
        document_id=document_id,
        index_name=index_name,
        get_large_chunks=True,
    )

    for chunk_id in doc_chunk_ids:
        try:
            res = http_client.delete(
                f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{chunk_id}"
            )
            res.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"Failed to delete chunk, details: {e.response.text}")
            raise


def delete_vespa_docs(
    document_ids: list[str],
    index_name: str,
    http_client: httpx.Client,
    executor: concurrent.futures.ThreadPoolExecutor | None = None,
) -> None:
    external_executor = True

    if not executor:
        external_executor = False
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)

    try:
        doc_deletion_future = {
            executor.submit(
                _delete_vespa_doc_chunks, doc_id, index_name, http_client
            ): doc_id
            for doc_id in document_ids
        }
        for future in concurrent.futures.as_completed(doc_deletion_future):
            # Will raise exception if the deletion raised an exception
            future.result()

    finally:
        if not external_executor:
            executor.shutdown(wait=True)
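
A usage sketch (not part of the commit): hard-delete every chunk of two documents, reusing the shared Vespa HTTP client helper; the document ids and index name are placeholders.

from onyx.document_index.vespa.deletion import delete_vespa_docs
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client

with get_vespa_http_client() as http_client:
    delete_vespa_docs(
        document_ids=["doc-a", "doc-b"],
        index_name="danswer_chunk",  # example index name
        http_client=http_client,
    )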
915
backend/onyx/document_index/vespa/index.py
Normal file
915
backend/onyx/document_index/vespa/index.py
Normal file
@ -0,0 +1,915 @@
|
||||
import concurrent.futures
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import BinaryIO
|
||||
from typing import cast
|
||||
from typing import List
|
||||
|
||||
import httpx # type: ignore
|
||||
import requests # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from onyx.configs.chat_configs import DOC_TIME_DECAY
|
||||
from onyx.configs.chat_configs import NUM_RETURNED_HITS
|
||||
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
|
||||
from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
|
||||
from onyx.configs.constants import KV_REINDEX_KEY
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceChunkUncleaned
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentInsertionRecord
|
||||
from onyx.document_index.interfaces import UpdateRequest
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
|
||||
from onyx.document_index.vespa.chunk_retrieval import (
|
||||
get_all_vespa_ids_for_document_id,
|
||||
)
|
||||
from onyx.document_index.vespa.chunk_retrieval import (
|
||||
parallel_visit_api_retrieval,
|
||||
)
|
||||
from onyx.document_index.vespa.chunk_retrieval import query_vespa
|
||||
from onyx.document_index.vespa.deletion import delete_vespa_docs
|
||||
from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
|
||||
from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
|
||||
from onyx.document_index.vespa.indexing_utils import (
|
||||
get_existing_documents_from_chunks,
|
||||
)
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
)
|
||||
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
|
||||
build_vespa_filters,
|
||||
)
|
||||
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
|
||||
from onyx.document_index.vespa_constants import BATCH_SIZE
|
||||
from onyx.document_index.vespa_constants import BOOST
|
||||
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
|
||||
from onyx.document_index.vespa_constants import DANSWER_CHUNK_REPLACEMENT_PAT
|
||||
from onyx.document_index.vespa_constants import DATE_REPLACEMENT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_REPLACEMENT_PAT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_SETS
|
||||
from onyx.document_index.vespa_constants import HIDDEN
|
||||
from onyx.document_index.vespa_constants import NUM_THREADS
|
||||
from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
|
||||
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
|
||||
from onyx.document_index.vespa_constants import VESPA_TIMEOUT
|
||||
from onyx.document_index.vespa_constants import YQL_BASE
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.key_value_store.factory import get_kv_store
|
||||
from onyx.utils.batching import batch_generator
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.model_server_models import Embedding
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Set the logging level to WARNING to ignore INFO and DEBUG logs
|
||||
httpx_logger = logging.getLogger("httpx")
|
||||
httpx_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _VespaUpdateRequest:
|
||||
document_id: str
|
||||
url: str
|
||||
update_request: dict[str, dict]
|
||||
|
||||
|
||||
def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO:
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
|
||||
for filename, content in file_contents.items():
|
||||
zipf.writestr(filename, content)
|
||||
zip_buffer.seek(0)
|
||||
return zip_buffer
|
||||
|
||||
|
||||
def _create_document_xml_lines(doc_names: list[str | None] | list[str]) -> str:
|
||||
doc_lines = [
|
||||
f'<document type="{doc_name}" mode="index" />'
|
||||
for doc_name in doc_names
|
||||
if doc_name
|
||||
]
|
||||
return "\n".join(doc_lines)
|
||||
|
||||
|
||||
def add_ngrams_to_schema(schema_content: str) -> str:
|
||||
# Add the match blocks containing gram and gram-size to title and content fields
|
||||
schema_content = re.sub(
|
||||
r"(field title type string \{[^}]*indexing: summary \| index \| attribute)",
|
||||
r"\1\n match {\n gram\n gram-size: 3\n }",
|
||||
schema_content,
|
||||
)
|
||||
schema_content = re.sub(
|
||||
r"(field content type string \{[^}]*indexing: summary \| index)",
|
||||
r"\1\n match {\n gram\n gram-size: 3\n }",
|
||||
schema_content,
|
||||
)
|
||||
return schema_content
|
||||
|
||||
|
||||
class VespaIndex(DocumentIndex):
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str,
|
||||
secondary_index_name: str | None,
|
||||
multitenant: bool = False,
|
||||
) -> None:
|
||||
self.index_name = index_name
|
||||
self.secondary_index_name = secondary_index_name
|
||||
self.multitenant = multitenant
|
||||
self.http_client = get_vespa_http_client()
|
||||
|
||||
def ensure_indices_exist(
|
||||
self,
|
||||
index_embedding_dim: int,
|
||||
secondary_index_embedding_dim: int | None,
|
||||
) -> None:
|
||||
if MULTI_TENANT:
|
||||
logger.info(
|
||||
"Skipping Vespa index seup for multitenant (would wipe all indices)"
|
||||
)
|
||||
return None
|
||||
|
||||
deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate"
|
||||
logger.notice(f"Deploying Vespa application package to {deploy_url}")
|
||||
|
||||
vespa_schema_path = os.path.join(
|
||||
os.getcwd(), "onyx", "document_index", "vespa", "app_config"
|
||||
)
|
||||
schema_file = os.path.join(vespa_schema_path, "schemas", "danswer_chunk.sd")
|
||||
services_file = os.path.join(vespa_schema_path, "services.xml")
|
||||
overrides_file = os.path.join(vespa_schema_path, "validation-overrides.xml")
|
||||
|
||||
with open(services_file, "r") as services_f:
|
||||
services_template = services_f.read()
|
||||
|
||||
schema_names = [self.index_name, self.secondary_index_name]
|
||||
|
||||
doc_lines = _create_document_xml_lines(schema_names)
|
||||
services = services_template.replace(DOCUMENT_REPLACEMENT_PAT, doc_lines)
|
||||
services = services.replace(
|
||||
SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS)
|
||||
)
|
||||
|
||||
kv_store = get_kv_store()
|
||||
|
||||
needs_reindexing = False
|
||||
try:
|
||||
needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY))
|
||||
except Exception:
|
||||
logger.debug("Could not load the reindexing flag. Using ngrams")
|
||||
|
||||
with open(overrides_file, "r") as overrides_f:
|
||||
overrides_template = overrides_f.read()
|
||||
|
||||
# Vespa requires an override to erase data including the indices we're no longer using
|
||||
# It also has a 30 day cap from current so we set it to 7 dynamically
|
||||
now = datetime.now()
|
||||
date_in_7_days = now + timedelta(days=7)
|
||||
formatted_date = date_in_7_days.strftime("%Y-%m-%d")
|
||||
|
||||
overrides = overrides_template.replace(DATE_REPLACEMENT, formatted_date)
|
||||
|
||||
zip_dict = {
|
||||
"services.xml": services.encode("utf-8"),
|
||||
"validation-overrides.xml": overrides.encode("utf-8"),
|
||||
}
|
||||
|
||||
with open(schema_file, "r") as schema_f:
|
||||
schema_template = schema_f.read()
|
||||
schema_template = schema_template.replace(TENANT_ID_PAT, "")
|
||||
|
||||
schema = schema_template.replace(
|
||||
DANSWER_CHUNK_REPLACEMENT_PAT, self.index_name
|
||||
).replace(VESPA_DIM_REPLACEMENT_PAT, str(index_embedding_dim))
|
||||
|
||||
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
|
||||
schema = schema.replace(TENANT_ID_PAT, "")
|
||||
zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")
|
||||
|
||||
if self.secondary_index_name:
|
||||
upcoming_schema = schema_template.replace(
|
||||
DANSWER_CHUNK_REPLACEMENT_PAT, self.secondary_index_name
|
||||
).replace(VESPA_DIM_REPLACEMENT_PAT, str(secondary_index_embedding_dim))
|
||||
zip_dict[f"schemas/{schema_names[1]}.sd"] = upcoming_schema.encode("utf-8")
|
||||
|
||||
zip_file = in_memory_zip_from_file_bytes(zip_dict)
|
||||
|
||||
headers = {"Content-Type": "application/zip"}
|
||||
response = requests.post(deploy_url, headers=headers, data=zip_file)
|
||||
if response.status_code != 200:
|
||||
raise RuntimeError(
|
||||
f"Failed to prepare Vespa Onyx Index. Response: {response.text}"
|
||||
)
|
||||
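# Hedged sketch (assumption: the real in_memory_zip_from_file_bytes helper lives in
# another module and may differ): the call above is expected to pack the
# {relative_path: bytes} mapping into an in-memory zip archive so it can be POSTed
# as application/zip to Vespa's prepareandactivate endpoint. A minimal standalone
# version, shown here purely for illustration:
import io
import zipfile

def _in_memory_zip_sketch(file_contents: dict[str, bytes]) -> io.BytesIO:
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_archive:
        for relative_path, file_bytes in file_contents.items():
            # each key becomes a path inside the archive, e.g. "schemas/danswer_chunk.sd"
            zip_archive.writestr(relative_path, file_bytes)
    zip_buffer.seek(0)
    return zip_buffer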
|
||||
@staticmethod
|
||||
def register_multitenant_indices(
|
||||
indices: list[str],
|
||||
embedding_dims: list[int],
|
||||
) -> None:
|
||||
if not MULTI_TENANT:
|
||||
raise ValueError("Multi-tenant is not enabled")
|
||||
|
||||
deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate"
|
||||
logger.info(f"Deploying Vespa application package to {deploy_url}")
|
||||
|
||||
vespa_schema_path = os.path.join(
|
||||
os.getcwd(), "onyx", "document_index", "vespa", "app_config"
|
||||
)
|
||||
schema_file = os.path.join(vespa_schema_path, "schemas", "danswer_chunk.sd")
|
||||
services_file = os.path.join(vespa_schema_path, "services.xml")
|
||||
overrides_file = os.path.join(vespa_schema_path, "validation-overrides.xml")
|
||||
|
||||
with open(services_file, "r") as services_f:
|
||||
services_template = services_f.read()
|
||||
|
||||
# Generate schema names from index settings
|
||||
schema_names = [index_name for index_name in indices]
|
||||
|
||||
full_schemas = schema_names
|
||||
|
||||
doc_lines = _create_document_xml_lines(full_schemas)
|
||||
|
||||
services = services_template.replace(DOCUMENT_REPLACEMENT_PAT, doc_lines)
|
||||
services = services.replace(
|
||||
SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS)
|
||||
)
|
||||
|
||||
kv_store = get_kv_store()
|
||||
|
||||
needs_reindexing = False
|
||||
try:
|
||||
needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY))
|
||||
except Exception:
|
||||
logger.debug("Could not load the reindexing flag. Using ngrams")
|
||||
|
||||
with open(overrides_file, "r") as overrides_f:
|
||||
overrides_template = overrides_f.read()
|
||||
|
||||
# Vespa requires an override to erase data including the indices we're no longer using
|
||||
# It also caps the override date at 30 days from now, so we dynamically set it to 7 days out
|
||||
now = datetime.now()
|
||||
date_in_7_days = now + timedelta(days=7)
|
||||
formatted_date = date_in_7_days.strftime("%Y-%m-%d")
|
||||
|
||||
overrides = overrides_template.replace(DATE_REPLACEMENT, formatted_date)
|
||||
|
||||
zip_dict = {
|
||||
"services.xml": services.encode("utf-8"),
|
||||
"validation-overrides.xml": overrides.encode("utf-8"),
|
||||
}
|
||||
|
||||
with open(schema_file, "r") as schema_f:
|
||||
schema_template = schema_f.read()
|
||||
|
||||
for i, index_name in enumerate(indices):
|
||||
embedding_dim = embedding_dims[i]
|
||||
logger.info(
|
||||
f"Creating index: {index_name} with embedding dimension: {embedding_dim}"
|
||||
)
|
||||
|
||||
schema = schema_template.replace(
|
||||
DANSWER_CHUNK_REPLACEMENT_PAT, index_name
|
||||
).replace(VESPA_DIM_REPLACEMENT_PAT, str(embedding_dim))
|
||||
schema = schema.replace(
|
||||
TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
|
||||
)
|
||||
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
|
||||
zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")
|
||||
|
||||
zip_file = in_memory_zip_from_file_bytes(zip_dict)
|
||||
|
||||
headers = {"Content-Type": "application/zip"}
|
||||
response = requests.post(deploy_url, headers=headers, data=zip_file)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise RuntimeError(
|
||||
f"Failed to prepare Vespa Onyx Indexes. Response: {response.text}"
|
||||
)
|
||||
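# Hedged usage sketch (the index name and dimension below are illustrative, not from
# this commit):
# VespaIndex.register_multitenant_indices(
#     indices=["danswer_chunk_example_model"],
#     embedding_dims=[768],
# )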
|
||||
def index(
|
||||
self,
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
fresh_index: bool = False,
|
||||
) -> set[DocumentInsertionRecord]:
|
||||
"""Receive a list of chunks from a batch of documents and index the chunks into Vespa along
|
||||
with updating the associated permissions. Assumes that a document will not be split into
|
||||
multiple chunk batches calling this function multiple times, otherwise only the last set of
|
||||
chunks will be kept"""
|
||||
# IMPORTANT: This must be done one index at a time, do not use secondary index here
|
||||
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]
|
||||
|
||||
existing_docs: set[str] = set()
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
with (
|
||||
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
|
||||
get_vespa_http_client() as http_client,
|
||||
):
|
||||
if not fresh_index:
|
||||
# Check for existing documents; all of their chunks must be deleted prior to indexing
|
||||
# because the document size (number of chunks) may have shrunk
|
||||
first_chunks = [
|
||||
chunk for chunk in cleaned_chunks if chunk.chunk_id == 0
|
||||
]
|
||||
for chunk_batch in batch_generator(first_chunks, BATCH_SIZE):
|
||||
existing_docs.update(
|
||||
get_existing_documents_from_chunks(
|
||||
chunks=chunk_batch,
|
||||
index_name=self.index_name,
|
||||
http_client=http_client,
|
||||
executor=executor,
|
||||
)
|
||||
)
|
||||
|
||||
for doc_id_batch in batch_generator(existing_docs, BATCH_SIZE):
|
||||
delete_vespa_docs(
|
||||
document_ids=doc_id_batch,
|
||||
index_name=self.index_name,
|
||||
http_client=http_client,
|
||||
executor=executor,
|
||||
)
|
||||
|
||||
for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
|
||||
batch_index_vespa_chunks(
|
||||
chunks=chunk_batch,
|
||||
index_name=self.index_name,
|
||||
http_client=http_client,
|
||||
multitenant=self.multitenant,
|
||||
executor=executor,
|
||||
)
|
||||
|
||||
all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
|
||||
|
||||
return {
|
||||
DocumentInsertionRecord(
|
||||
document_id=doc_id,
|
||||
already_existed=doc_id in existing_docs,
|
||||
)
|
||||
for doc_id in all_doc_ids
|
||||
}
|
||||
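# Hedged usage sketch (variable names are illustrative): callers hand this method one
# batch of chunks per document and can then inspect which documents already existed.
# insertion_records = vespa_index.index(chunks=document_chunks)
# reindexed_ids = {r.document_id for r in insertion_records if r.already_existed}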
|
||||
@staticmethod
|
||||
def _apply_updates_batched(
|
||||
updates: list[_VespaUpdateRequest],
|
||||
batch_size: int = BATCH_SIZE,
|
||||
) -> None:
|
||||
"""Runs a batch of updates in parallel via the ThreadPoolExecutor."""
|
||||
|
||||
def _update_chunk(
|
||||
update: _VespaUpdateRequest, http_client: httpx.Client
|
||||
) -> httpx.Response:
|
||||
logger.debug(
|
||||
f"Updating with request to {update.url} with body {update.update_request}"
|
||||
)
|
||||
return http_client.put(
|
||||
update.url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
json=update.update_request,
|
||||
)
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
|
||||
with (
|
||||
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
|
||||
get_vespa_http_client() as http_client,
|
||||
):
|
||||
for update_batch in batch_generator(updates, batch_size):
|
||||
future_to_document_id = {
|
||||
executor.submit(
|
||||
_update_chunk,
|
||||
update,
|
||||
http_client,
|
||||
): update.document_id
|
||||
for update in update_batch
|
||||
}
|
||||
for future in concurrent.futures.as_completed(future_to_document_id):
|
||||
res = future.result()
|
||||
try:
|
||||
res.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
failure_msg = f"Failed to update document: {future_to_document_id[future]}"
|
||||
raise requests.HTTPError(failure_msg) from e
|
||||
|
||||
def update(self, update_requests: list[UpdateRequest]) -> None:
|
||||
logger.debug(f"Updating {len(update_requests)} documents in Vespa")
|
||||
|
||||
# Handle Vespa character limitations
|
||||
# This mutates update_requests, but it is not used again afterward
|
||||
for update_request in update_requests:
|
||||
update_request.document_ids = [
|
||||
replace_invalid_doc_id_characters(doc_id)
|
||||
for doc_id in update_request.document_ids
|
||||
]
|
||||
|
||||
update_start = time.monotonic()
|
||||
|
||||
processed_updates_requests: list[_VespaUpdateRequest] = []
|
||||
all_doc_chunk_ids: dict[str, list[str]] = {}
|
||||
|
||||
# Fetch all chunks for each document ahead of time
|
||||
index_names = [self.index_name]
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
chunk_id_start_time = time.monotonic()
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
|
||||
future_to_doc_chunk_ids = {
|
||||
executor.submit(
|
||||
get_all_vespa_ids_for_document_id,
|
||||
document_id=document_id,
|
||||
index_name=index_name,
|
||||
filters=None,
|
||||
get_large_chunks=True,
|
||||
): (document_id, index_name)
|
||||
for index_name in index_names
|
||||
for update_request in update_requests
|
||||
for document_id in update_request.document_ids
|
||||
}
|
||||
for future in concurrent.futures.as_completed(future_to_doc_chunk_ids):
|
||||
document_id, index_name = future_to_doc_chunk_ids[future]
|
||||
try:
|
||||
doc_chunk_ids = future.result()
|
||||
if document_id not in all_doc_chunk_ids:
|
||||
all_doc_chunk_ids[document_id] = []
|
||||
all_doc_chunk_ids[document_id].extend(doc_chunk_ids)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}"
|
||||
)
|
||||
logger.debug(
|
||||
f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs"
|
||||
)
|
||||
|
||||
# Build the _VespaUpdateRequest objects
|
||||
for update_request in update_requests:
|
||||
update_dict: dict[str, dict] = {"fields": {}}
|
||||
if update_request.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": update_request.boost}
|
||||
if update_request.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {
|
||||
document_set: 1 for document_set in update_request.document_sets
|
||||
}
|
||||
}
|
||||
if update_request.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {
|
||||
acl_entry: 1 for acl_entry in update_request.access.to_acl()
|
||||
}
|
||||
}
|
||||
if update_request.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": update_request.hidden}
|
||||
|
||||
if not update_dict["fields"]:
|
||||
logger.error("Update request received but nothing to update")
|
||||
continue
|
||||
|
||||
for document_id in update_request.document_ids:
|
||||
for doc_chunk_id in all_doc_chunk_ids[document_id]:
|
||||
processed_updates_requests.append(
|
||||
_VespaUpdateRequest(
|
||||
document_id=document_id,
|
||||
url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}",
|
||||
update_request=update_dict,
|
||||
)
|
||||
)
|
||||
|
||||
self._apply_updates_batched(processed_updates_requests)
|
||||
logger.debug(
|
||||
"Finished updating Vespa documents in %.2f seconds",
|
||||
time.monotonic() - update_start,
|
||||
)
|
||||
|
||||
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
|
||||
"""Note: if the document id does not exist, the update will be a no-op and the
|
||||
function will complete with no errors or exceptions.
|
||||
Handle other exceptions if you wish to implement retry behavior
|
||||
"""
|
||||
|
||||
total_chunks_updated = 0
|
||||
|
||||
# Handle Vespa character limitations
|
||||
# Note: doc_id itself is not mutated; the normalized copy below is what gets used
|
||||
normalized_doc_id = replace_invalid_doc_id_characters(doc_id)
|
||||
|
||||
# Build the _VespaUpdateRequest objects
|
||||
update_dict: dict[str, dict] = {"fields": {}}
|
||||
if fields.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": fields.boost}
|
||||
if fields.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {document_set: 1 for document_set in fields.document_sets}
|
||||
}
|
||||
if fields.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
|
||||
}
|
||||
if fields.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
|
||||
|
||||
if not update_dict["fields"]:
|
||||
logger.error("Update request received but nothing to update")
|
||||
return 0
|
||||
|
||||
index_names = [self.index_name]
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
with get_vespa_http_client() as http_client:
|
||||
for index_name in index_names:
|
||||
params = httpx.QueryParams(
|
||||
{
|
||||
"selection": f"{index_name}.document_id=='{normalized_doc_id}'",
|
||||
"cluster": DOCUMENT_INDEX_NAME,
|
||||
}
|
||||
)
|
||||
|
||||
while True:
|
||||
try:
|
||||
resp = http_client.put(
|
||||
f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}",
|
||||
params=params,
|
||||
headers={"Content-Type": "application/json"},
|
||||
json=update_dict,
|
||||
)
|
||||
|
||||
resp.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
logger.error(
|
||||
f"Failed to update chunks, details: {e.response.text}"
|
||||
)
|
||||
raise
|
||||
|
||||
resp_data = resp.json()
|
||||
|
||||
if "documentCount" in resp_data:
|
||||
chunks_updated = resp_data["documentCount"]
|
||||
total_chunks_updated += chunks_updated
|
||||
|
||||
# Check for continuation token to handle pagination
|
||||
if "continuation" not in resp_data:
|
||||
break # Exit loop if no continuation token
|
||||
|
||||
if not resp_data["continuation"]:
|
||||
break # Exit loop if continuation token is empty
|
||||
|
||||
params = params.set("continuation", resp_data["continuation"])
|
||||
|
||||
logger.debug(
|
||||
f"VespaIndex.update_single: "
|
||||
f"index={index_name} "
|
||||
f"doc={normalized_doc_id} "
|
||||
f"chunks_updated={total_chunks_updated}"
|
||||
)
|
||||
|
||||
return total_chunks_updated
|
||||
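# Hedged usage sketch (illustrative; assumes the unset VespaDocumentFields attributes
# default to None):
# chunks_touched = vespa_index.update_single(
#     doc_id="doc-123", fields=VespaDocumentFields(hidden=True)
# )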
|
||||
def delete(self, doc_ids: list[str]) -> None:
|
||||
logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
|
||||
|
||||
doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
with get_vespa_http_client() as http_client:
|
||||
index_names = [self.index_name]
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
for index_name in index_names:
|
||||
delete_vespa_docs(
|
||||
document_ids=doc_ids, index_name=index_name, http_client=http_client
|
||||
)
|
||||
return
|
||||
|
||||
def delete_single(self, doc_id: str) -> int:
|
||||
"""Possibly faster overall than the delete method due to using a single
|
||||
delete call with a selection query."""
|
||||
|
||||
total_chunks_deleted = 0
|
||||
|
||||
# Vespa deletion is poorly documented ... luckily we found this
|
||||
# https://docs.vespa.ai/en/operations/batch-delete.html#example
|
||||
|
||||
doc_id = replace_invalid_doc_id_characters(doc_id)
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
index_names = [self.index_name]
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
with get_vespa_http_client() as http_client:
|
||||
for index_name in index_names:
|
||||
params = httpx.QueryParams(
|
||||
{
|
||||
"selection": f"{index_name}.document_id=='{doc_id}'",
|
||||
"cluster": DOCUMENT_INDEX_NAME,
|
||||
}
|
||||
)
|
||||
|
||||
while True:
|
||||
try:
|
||||
resp = http_client.delete(
|
||||
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}",
|
||||
params=params,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
logger.error(
|
||||
f"Failed to delete chunk, details: {e.response.text}"
|
||||
)
|
||||
raise
|
||||
|
||||
resp_data = resp.json()
|
||||
|
||||
if "documentCount" in resp_data:
|
||||
chunks_deleted = resp_data["documentCount"]
|
||||
total_chunks_deleted += chunks_deleted
|
||||
|
||||
# Check for continuation token to handle pagination
|
||||
if "continuation" not in resp_data:
|
||||
break # Exit loop if no continuation token
|
||||
|
||||
if not resp_data["continuation"]:
|
||||
break # Exit loop if continuation token is empty
|
||||
|
||||
params = params.set("continuation", resp_data["continuation"])
|
||||
|
||||
logger.debug(
|
||||
f"VespaIndex.delete_single: "
|
||||
f"index={index_name} "
|
||||
f"doc={doc_id} "
|
||||
f"chunks_deleted={total_chunks_deleted}"
|
||||
)
|
||||
|
||||
return total_chunks_deleted
|
||||
|
||||
def id_based_retrieval(
|
||||
self,
|
||||
chunk_requests: list[VespaChunkRequest],
|
||||
filters: IndexFilters,
|
||||
batch_retrieval: bool = False,
|
||||
get_large_chunks: bool = False,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
if batch_retrieval:
|
||||
return batch_search_api_retrieval(
|
||||
index_name=self.index_name,
|
||||
chunk_requests=chunk_requests,
|
||||
filters=filters,
|
||||
get_large_chunks=get_large_chunks,
|
||||
)
|
||||
return parallel_visit_api_retrieval(
|
||||
index_name=self.index_name,
|
||||
chunk_requests=chunk_requests,
|
||||
filters=filters,
|
||||
get_large_chunks=get_large_chunks,
|
||||
)
|
||||
|
||||
def hybrid_retrieval(
|
||||
self,
|
||||
query: str,
|
||||
query_embedding: Embedding,
|
||||
final_keywords: list[str] | None,
|
||||
filters: IndexFilters,
|
||||
hybrid_alpha: float,
|
||||
time_decay_multiplier: float,
|
||||
num_to_retrieve: int,
|
||||
offset: int = 0,
|
||||
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
vespa_where_clauses = build_vespa_filters(filters)
|
||||
# Needs to be at least as much as the value set in Vespa schema config
|
||||
target_hits = max(10 * num_to_retrieve, 1000)
|
||||
yql = (
|
||||
YQL_BASE.format(index_name=self.index_name)
|
||||
+ vespa_where_clauses
|
||||
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
|
||||
+ f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) "
|
||||
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
|
||||
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||
)
|
||||
|
||||
final_query = " ".join(final_keywords) if final_keywords else query
|
||||
|
||||
logger.debug(f"Query YQL: {yql}")
|
||||
|
||||
params: dict[str, str | int | float] = {
|
||||
"yql": yql,
|
||||
"query": final_query,
|
||||
"input.query(query_embedding)": str(query_embedding),
|
||||
"input.query(decay_factor)": str(DOC_TIME_DECAY * time_decay_multiplier),
|
||||
"input.query(alpha)": hybrid_alpha,
|
||||
"input.query(title_content_ratio)": title_content_ratio
|
||||
if title_content_ratio is not None
|
||||
else TITLE_CONTENT_RATIO,
|
||||
"hits": num_to_retrieve,
|
||||
"offset": offset,
|
||||
"ranking.profile": f"hybrid_search{len(query_embedding)}",
|
||||
"timeout": VESPA_TIMEOUT,
|
||||
}
|
||||
|
||||
return query_vespa(params)
|
||||
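# Hedged note derived from the params above: the ranking profile is selected by
# embedding size, so a 768-dimension query embedding resolves to
# "ranking.profile": "hybrid_search768".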
|
||||
def admin_retrieval(
|
||||
self,
|
||||
query: str,
|
||||
filters: IndexFilters,
|
||||
num_to_retrieve: int = NUM_RETURNED_HITS,
|
||||
offset: int = 0,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
vespa_where_clauses = build_vespa_filters(filters, include_hidden=True)
|
||||
yql = (
|
||||
YQL_BASE.format(index_name=self.index_name)
|
||||
+ vespa_where_clauses
|
||||
+ '({grammar: "weakAnd"}userInput(@query) '
|
||||
# `({defaultIndex: "content_summary"}userInput(@query))` section is
|
||||
# needed for highlighting while the N-gram highlighting is broken /
|
||||
# not working as desired
|
||||
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||
)
|
||||
|
||||
params: dict[str, str | int] = {
|
||||
"yql": yql,
|
||||
"query": query,
|
||||
"hits": num_to_retrieve,
|
||||
"offset": 0,
|
||||
"ranking.profile": "admin_search",
|
||||
"timeout": VESPA_TIMEOUT,
|
||||
}
|
||||
|
||||
return query_vespa(params)
|
||||
|
||||
@classmethod
|
||||
def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
|
||||
"""
|
||||
Deletes all entries in the specified index with the given tenant_id.
|
||||
|
||||
Parameters:
|
||||
tenant_id (str): The tenant ID whose documents are to be deleted.
|
||||
index_name (str): The name of the index from which to delete documents.
|
||||
"""
|
||||
logger.info(
|
||||
f"Deleting entries with tenant_id: {tenant_id} from index: {index_name}"
|
||||
)
|
||||
|
||||
# Step 1: Retrieve all document IDs with the given tenant_id
|
||||
document_ids = cls._get_all_document_ids_by_tenant_id(tenant_id, index_name)
|
||||
|
||||
if not document_ids:
|
||||
logger.info(
|
||||
f"No documents found with tenant_id: {tenant_id} in index: {index_name}"
|
||||
)
|
||||
return
|
||||
|
||||
# Step 2: Delete documents in batches
|
||||
delete_requests = [
|
||||
_VespaDeleteRequest(document_id=doc_id, index_name=index_name)
|
||||
for doc_id in document_ids
|
||||
]
|
||||
|
||||
cls._apply_deletes_batched(delete_requests)
|
||||
|
||||
@classmethod
|
||||
def _get_all_document_ids_by_tenant_id(
|
||||
cls, tenant_id: str, index_name: str
|
||||
) -> List[str]:
|
||||
"""
|
||||
Retrieves all document IDs with the specified tenant_id, handling pagination.
|
||||
|
||||
Parameters:
|
||||
tenant_id (str): The tenant ID to search for.
|
||||
index_name (str): The name of the index to search in.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of document IDs matching the tenant_id.
|
||||
"""
|
||||
offset = 0
|
||||
limit = 1000 # Vespa's maximum hits per query
|
||||
document_ids = []
|
||||
|
||||
logger.debug(
|
||||
f"Starting document ID retrieval for tenant_id: {tenant_id} in index: {index_name}"
|
||||
)
|
||||
|
||||
while True:
|
||||
# Construct the query to fetch document IDs
|
||||
query_params = {
|
||||
"yql": f'select id from sources * where tenant_id contains "{tenant_id}";',
|
||||
"offset": str(offset),
|
||||
"hits": str(limit),
|
||||
"timeout": "10s",
|
||||
"format": "json",
|
||||
"summary": "id",
|
||||
}
|
||||
|
||||
url = f"{VESPA_APPLICATION_ENDPOINT}/search/"
|
||||
|
||||
logger.debug(
|
||||
f"Querying for document IDs with tenant_id: {tenant_id}, offset: {offset}"
|
||||
)
|
||||
|
||||
with get_vespa_http_client(no_timeout=True) as http_client:
|
||||
response = http_client.get(url, params=query_params)
|
||||
response.raise_for_status()
|
||||
|
||||
search_result = response.json()
|
||||
hits = search_result.get("root", {}).get("children", [])
|
||||
|
||||
if not hits:
|
||||
break
|
||||
|
||||
for hit in hits:
|
||||
doc_id = hit.get("id")
|
||||
if doc_id:
|
||||
document_ids.append(doc_id)
|
||||
|
||||
offset += limit # Move to the next page
|
||||
|
||||
logger.debug(
|
||||
f"Retrieved {len(document_ids)} document IDs for tenant_id: {tenant_id}"
|
||||
)
|
||||
return document_ids
|
||||
|
||||
@classmethod
|
||||
def _apply_deletes_batched(
|
||||
cls,
|
||||
delete_requests: List["_VespaDeleteRequest"],
|
||||
batch_size: int = BATCH_SIZE,
|
||||
) -> None:
|
||||
"""
|
||||
Deletes documents in batches using multiple threads.
|
||||
|
||||
Parameters:
|
||||
delete_requests (List[_VespaDeleteRequest]): The list of delete requests.
|
||||
batch_size (int): The number of documents to delete in each batch.
|
||||
"""
|
||||
|
||||
def _delete_document(
|
||||
delete_request: "_VespaDeleteRequest", http_client: httpx.Client
|
||||
) -> None:
|
||||
logger.debug(f"Deleting document with ID {delete_request.document_id}")
|
||||
response = http_client.delete(
|
||||
delete_request.url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
logger.debug(f"Starting batch deletion for {len(delete_requests)} documents")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
|
||||
with get_vespa_http_client(no_timeout=True) as http_client:
|
||||
for batch_start in range(0, len(delete_requests), batch_size):
|
||||
batch = delete_requests[batch_start : batch_start + batch_size]
|
||||
|
||||
future_to_document_id = {
|
||||
executor.submit(
|
||||
_delete_document,
|
||||
delete_request,
|
||||
http_client,
|
||||
): delete_request.document_id
|
||||
for delete_request in batch
|
||||
}
|
||||
|
||||
for future in concurrent.futures.as_completed(
|
||||
future_to_document_id
|
||||
):
|
||||
doc_id = future_to_document_id[future]
|
||||
try:
|
||||
future.result()
|
||||
logger.debug(f"Successfully deleted document: {doc_id}")
|
||||
except httpx.HTTPError as e:
|
||||
logger.error(f"Failed to delete document {doc_id}: {e}")
|
||||
# Optionally, implement retry logic or error handling here
|
||||
|
||||
logger.info("Batch deletion completed")
|
||||
|
||||
|
||||
class _VespaDeleteRequest:
|
||||
def __init__(self, document_id: str, index_name: str) -> None:
|
||||
self.document_id = document_id
|
||||
# Encode the document ID to ensure it's safe for use in the URL
|
||||
encoded_doc_id = urllib.parse.quote_plus(self.document_id)
|
||||
self.url = (
|
||||
f"{VESPA_APPLICATION_ENDPOINT}/document/v1/"
|
||||
f"{index_name}/{index_name}/docid/{encoded_doc_id}"
|
||||
)
|
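# Hedged usage sketch (the document ID below is illustrative): IDs are URL-encoded
# with quote_plus before being placed into the document/v1 path.
_example_delete_request = _VespaDeleteRequest(
    document_id="doc id/with spaces", index_name="danswer_chunk"
)
assert _example_delete_request.url.endswith("/docid/doc+id%2Fwith+spaces")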
250
backend/onyx/document_index/vespa/indexing_utils.py
Normal file
250
backend/onyx/document_index/vespa/indexing_utils.py
Normal file
@ -0,0 +1,250 @@
|
||||
import concurrent.futures
|
||||
import json
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from http import HTTPStatus
|
||||
|
||||
import httpx
|
||||
from retry import retry
|
||||
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from onyx.document_index.document_index_utils import get_uuid_from_chunk
|
||||
from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
)
|
||||
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
|
||||
from onyx.document_index.vespa_constants import BLURB
|
||||
from onyx.document_index.vespa_constants import BOOST
|
||||
from onyx.document_index.vespa_constants import CHUNK_ID
|
||||
from onyx.document_index.vespa_constants import CONTENT
|
||||
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
|
||||
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_SETS
|
||||
from onyx.document_index.vespa_constants import EMBEDDINGS
|
||||
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
|
||||
from onyx.document_index.vespa_constants import METADATA
|
||||
from onyx.document_index.vespa_constants import METADATA_LIST
|
||||
from onyx.document_index.vespa_constants import METADATA_SUFFIX
|
||||
from onyx.document_index.vespa_constants import NUM_THREADS
|
||||
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
|
||||
from onyx.document_index.vespa_constants import SECONDARY_OWNERS
|
||||
from onyx.document_index.vespa_constants import SECTION_CONTINUATION
|
||||
from onyx.document_index.vespa_constants import SEMANTIC_IDENTIFIER
|
||||
from onyx.document_index.vespa_constants import SKIP_TITLE_EMBEDDING
|
||||
from onyx.document_index.vespa_constants import SOURCE_LINKS
|
||||
from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import TITLE
|
||||
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _does_document_exist(
|
||||
doc_chunk_id: str,
|
||||
index_name: str,
|
||||
http_client: httpx.Client,
|
||||
) -> bool:
|
||||
"""Returns whether the document already exists and the users/group whitelists
|
||||
Specifically in this case, document refers to a vespa document which is equivalent to a Onyx
|
||||
chunk. This checks for whether the chunk exists already in the index"""
|
||||
doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
|
||||
doc_fetch_response = http_client.get(doc_url)
|
||||
if doc_fetch_response.status_code == 404:
|
||||
return False
|
||||
|
||||
if doc_fetch_response.status_code != 200:
|
||||
logger.debug(f"Failed to check for document with URL {doc_url}")
|
||||
raise RuntimeError(
|
||||
f"Unexpected fetch document by ID value from Vespa "
|
||||
f"with error {doc_fetch_response.status_code}"
|
||||
f"Index name: {index_name}"
|
||||
f"Doc chunk id: {doc_chunk_id}"
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _vespa_get_updated_at_attribute(t: datetime | None) -> int | None:
|
||||
if not t:
|
||||
return None
|
||||
|
||||
if t.tzinfo != timezone.utc:
|
||||
raise ValueError("Connectors must provide document update time in UTC")
|
||||
|
||||
return int(t.timestamp())
|
||||
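# Hedged usage sketch (the timestamp below is illustrative): connectors must provide
# UTC datetimes; the helper above converts them to whole seconds since the epoch.
assert _vespa_get_updated_at_attribute(None) is None
assert (
    _vespa_get_updated_at_attribute(datetime(2024, 1, 1, tzinfo=timezone.utc))
    == 1704067200
)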
|
||||
|
||||
def get_existing_documents_from_chunks(
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
index_name: str,
|
||||
http_client: httpx.Client,
|
||||
executor: concurrent.futures.ThreadPoolExecutor | None = None,
|
||||
) -> set[str]:
|
||||
external_executor = True
|
||||
|
||||
if not executor:
|
||||
external_executor = False
|
||||
executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
|
||||
|
||||
document_ids: set[str] = set()
|
||||
try:
|
||||
chunk_existence_future = {
|
||||
executor.submit(
|
||||
_does_document_exist,
|
||||
str(get_uuid_from_chunk(chunk)),
|
||||
index_name,
|
||||
http_client,
|
||||
): chunk
|
||||
for chunk in chunks
|
||||
}
|
||||
for future in concurrent.futures.as_completed(chunk_existence_future):
|
||||
chunk = chunk_existence_future[future]
|
||||
chunk_already_existed = future.result()
|
||||
if chunk_already_existed:
|
||||
document_ids.add(chunk.source_document.id)
|
||||
|
||||
finally:
|
||||
if not external_executor:
|
||||
executor.shutdown(wait=True)
|
||||
|
||||
return document_ids
|
||||
|
||||
|
||||
@retry(tries=5, delay=1, backoff=2)
|
||||
def _index_vespa_chunk(
|
||||
chunk: DocMetadataAwareIndexChunk,
|
||||
index_name: str,
|
||||
http_client: httpx.Client,
|
||||
multitenant: bool,
|
||||
) -> None:
|
||||
json_header = {
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
document = chunk.source_document
|
||||
|
||||
# No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
|
||||
vespa_chunk_id = str(get_uuid_from_chunk(chunk))
|
||||
embeddings = chunk.embeddings
|
||||
|
||||
embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
|
||||
|
||||
if embeddings.mini_chunk_embeddings:
|
||||
for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings):
|
||||
embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed
|
||||
|
||||
title = document.get_title_for_document_index()
|
||||
|
||||
vespa_document_fields = {
|
||||
DOCUMENT_ID: document.id,
|
||||
CHUNK_ID: chunk.chunk_id,
|
||||
BLURB: remove_invalid_unicode_chars(chunk.blurb),
|
||||
TITLE: remove_invalid_unicode_chars(title) if title else None,
|
||||
SKIP_TITLE_EMBEDDING: not title,
|
||||
# For the BM25 index the keyword suffix is used; the vector was already generated with the more
|
||||
# natural language representation of the metadata section
|
||||
CONTENT: remove_invalid_unicode_chars(
|
||||
f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
|
||||
),
|
||||
# This duplication of `content` is needed for keyword highlighting
|
||||
# Note that it's not exactly the same as the actual content
|
||||
# which contains the title prefix and metadata suffix
|
||||
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
|
||||
SOURCE_TYPE: str(document.source.value),
|
||||
SOURCE_LINKS: json.dumps(chunk.source_links),
|
||||
SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
|
||||
SECTION_CONTINUATION: chunk.section_continuation,
|
||||
LARGE_CHUNK_REFERENCE_IDS: chunk.large_chunk_reference_ids,
|
||||
METADATA: json.dumps(document.metadata),
|
||||
# Save as a list for efficient extraction as an Attribute
|
||||
METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
|
||||
METADATA_SUFFIX: chunk.metadata_suffix_keyword,
|
||||
EMBEDDINGS: embeddings_name_vector_map,
|
||||
TITLE_EMBEDDING: chunk.title_embedding,
|
||||
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
|
||||
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
|
||||
SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),
|
||||
# the only `set` vespa has is `weightedset`, so we have to give each
|
||||
# element an arbitrary weight
|
||||
# rkuo: acl, docset and boost metadata are also updated through the metadata sync queue
|
||||
# which only calls VespaIndex.update
|
||||
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
|
||||
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
|
||||
BOOST: chunk.boost,
|
||||
}
|
||||
|
||||
if multitenant:
|
||||
if chunk.tenant_id:
|
||||
vespa_document_fields[TENANT_ID] = chunk.tenant_id
|
||||
|
||||
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}"
|
||||
logger.debug(f'Indexing to URL "{vespa_url}"')
|
||||
res = http_client.post(
|
||||
vespa_url, headers=json_header, json={"fields": vespa_document_fields}
|
||||
)
|
||||
try:
|
||||
res.raise_for_status()
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
|
||||
)
|
||||
if isinstance(e, httpx.HTTPStatusError):
|
||||
if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
|
||||
logger.error(
|
||||
"NOTE: HTTP Status 507 Insufficient Storage usually means "
|
||||
"you need to allocate more memory or disk space to the "
|
||||
"Vespa/index container."
|
||||
)
|
||||
|
||||
raise e
|
||||
|
||||
|
||||
def batch_index_vespa_chunks(
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
index_name: str,
|
||||
http_client: httpx.Client,
|
||||
multitenant: bool,
|
||||
executor: concurrent.futures.ThreadPoolExecutor | None = None,
|
||||
) -> None:
|
||||
external_executor = True
|
||||
|
||||
if not executor:
|
||||
external_executor = False
|
||||
executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
|
||||
|
||||
try:
|
||||
chunk_index_future = {
|
||||
executor.submit(
|
||||
_index_vespa_chunk, chunk, index_name, http_client, multitenant
|
||||
): chunk
|
||||
for chunk in chunks
|
||||
}
|
||||
for future in concurrent.futures.as_completed(chunk_index_future):
|
||||
# Will raise exception if any indexing raised an exception
|
||||
future.result()
|
||||
|
||||
finally:
|
||||
if not external_executor:
|
||||
executor.shutdown(wait=True)
|
||||
|
||||
|
||||
def clean_chunk_id_copy(
|
||||
chunk: DocMetadataAwareIndexChunk,
|
||||
) -> DocMetadataAwareIndexChunk:
|
||||
clean_chunk = chunk.copy(
|
||||
update={
|
||||
"source_document": chunk.source_document.copy(
|
||||
update={
|
||||
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
|
||||
}
|
||||
)
|
||||
}
|
||||
)
|
||||
return clean_chunk
|
71
backend/onyx/document_index/vespa/shared_utils/utils.py
Normal file
71
backend/onyx/document_index/vespa/shared_utils/utils.py
Normal file
@ -0,0 +1,71 @@
|
||||
import re
|
||||
from typing import cast
|
||||
|
||||
import httpx
|
||||
|
||||
from onyx.configs.app_configs import MANAGED_VESPA
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
|
||||
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
|
||||
|
||||
# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
|
||||
# See here for reference: https://docs.vespa.ai/en/documents.html
|
||||
# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
|
||||
|
||||
# Define allowed ASCII characters
|
||||
ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80
|
||||
ALLOWED_ASCII_CHARS[0x9] = True # tab
|
||||
ALLOWED_ASCII_CHARS[0xA] = True # newline
|
||||
ALLOWED_ASCII_CHARS[0xD] = True # carriage return
|
||||
for i in range(0x20, 0x7F):
|
||||
ALLOWED_ASCII_CHARS[i] = True # printable ASCII chars
|
||||
ALLOWED_ASCII_CHARS[0x7F] = True # del - discouraged, but allowed
|
||||
|
||||
|
||||
def is_text_character(codepoint: int) -> bool:
|
||||
"""Returns whether the given codepoint is a valid text character."""
|
||||
if codepoint < 0x80:
|
||||
return ALLOWED_ASCII_CHARS[codepoint]
|
||||
if codepoint < 0xD800:
|
||||
return True
|
||||
if codepoint <= 0xDFFF:
|
||||
return False
|
||||
if codepoint < 0xFDD0:
|
||||
return True
|
||||
if codepoint <= 0xFDEF:
|
||||
return False
|
||||
if codepoint >= 0x10FFFE:
|
||||
return False
|
||||
return (codepoint & 0xFFFF) < 0xFFFE
|
||||
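# Hedged usage sketch (illustrative codepoints): printable ASCII and ordinary BMP
# characters are accepted, while surrogates and non-characters are rejected.
assert is_text_character(ord("a"))
assert not is_text_character(0xD800)  # UTF-16 surrogate range
assert not is_text_character(0xFFFE)  # Unicode non-character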
|
||||
|
||||
def replace_invalid_doc_id_characters(text: str) -> str:
|
||||
"""Replaces invalid document ID characters in text."""
|
||||
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
|
||||
# and users only seem to be running into this error with single quotes
|
||||
return text.replace("'", "_")
|
||||
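# Hedged usage sketch (illustrative): single quotes are currently the only characters
# replaced.
assert replace_invalid_doc_id_characters("it's a doc") == "it_s a doc"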
|
||||
|
||||
def remove_invalid_unicode_chars(text: str) -> str:
|
||||
"""Vespa does not take in unicode chars that aren't valid for XML.
|
||||
This removes them."""
|
||||
_illegal_xml_chars_RE: re.Pattern = re.compile(
|
||||
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
|
||||
)
|
||||
return _illegal_xml_chars_RE.sub("", text)
|
||||
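# Hedged usage sketch (illustrative): XML-invalid control characters are stripped
# while tabs and newlines are preserved.
assert remove_invalid_unicode_chars("bad\x00byte") == "badbyte"
assert remove_invalid_unicode_chars("tab\tand\nnewline") == "tab\tand\nnewline"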
|
||||
|
||||
def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
|
||||
"""
|
||||
Configure and return an HTTP client for communicating with Vespa,
|
||||
including authentication if needed.
|
||||
"""
|
||||
|
||||
return httpx.Client(
|
||||
cert=cast(tuple[str, str], (VESPA_CLOUD_CERT_PATH, VESPA_CLOUD_KEY_PATH))
|
||||
if MANAGED_VESPA
|
||||
else None,
|
||||
verify=False if not MANAGED_VESPA else True,
|
||||
timeout=None if no_timeout else VESPA_REQUEST_TIMEOUT,
|
||||
http2=True,
|
||||
)
|
@ -0,0 +1,100 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
|
||||
from onyx.configs.constants import INDEX_SEPARATOR
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
|
||||
from onyx.document_index.vespa_constants import CHUNK_ID
|
||||
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_SETS
|
||||
from onyx.document_index.vespa_constants import HIDDEN
|
||||
from onyx.document_index.vespa_constants import METADATA_LIST
|
||||
from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> str:
|
||||
def _build_or_filters(key: str, vals: list[str] | None) -> str:
|
||||
if vals is None:
|
||||
return ""
|
||||
|
||||
valid_vals = [val for val in vals if val]
|
||||
if not key or not valid_vals:
|
||||
return ""
|
||||
|
||||
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
|
||||
or_clause = " or ".join(eq_elems)
|
||||
return f"({or_clause}) and "
|
||||
|
||||
def _build_time_filter(
|
||||
cutoff: datetime | None,
|
||||
# Slightly over 3 Months, approximately 1 fiscal quarter
|
||||
untimed_doc_cutoff: timedelta = timedelta(days=92),
|
||||
) -> str:
|
||||
if not cutoff:
|
||||
return ""
|
||||
|
||||
# Documents without an updated-at time are filtered out of queries asking for
|
||||
# very recent documents (more recent than the 3 month default). For time decay,
|
||||
# documents without an updated-at time are treated as 3 months old
|
||||
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
|
||||
cutoff_secs = int(cutoff.timestamp())
|
||||
|
||||
if include_untimed:
|
||||
# Documents without updated_at are assigned -1 as their date
|
||||
return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
|
||||
|
||||
return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "
|
||||
|
||||
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
|
||||
|
||||
if filters.tenant_id:
|
||||
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
|
||||
|
||||
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
|
||||
if filters.access_control_list is not None:
|
||||
filter_str += _build_or_filters(
|
||||
ACCESS_CONTROL_LIST, filters.access_control_list
|
||||
)
|
||||
|
||||
source_strs = (
|
||||
[s.value for s in filters.source_type] if filters.source_type else None
|
||||
)
|
||||
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
|
||||
|
||||
tag_attributes = None
|
||||
tags = filters.tags
|
||||
if tags:
|
||||
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
|
||||
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
|
||||
|
||||
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
|
||||
|
||||
filter_str += _build_time_filter(filters.time_cutoff)
|
||||
|
||||
return filter_str
|
||||
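# Hedged example derived from the logic above (the ACL value is illustrative): with
# hidden documents excluded and a single ACL entry, the returned prefix would be
#   '!(hidden=true) and (access_control_list contains "user@example.com") and '
# and the caller appends its own where-clause terms after it.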
|
||||
|
||||
def build_vespa_id_based_retrieval_yql(
|
||||
chunk_request: VespaChunkRequest,
|
||||
) -> str:
|
||||
id_based_retrieval_yql_section = (
|
||||
f'({DOCUMENT_ID} contains "{chunk_request.document_id}"'
|
||||
)
|
||||
|
||||
if chunk_request.is_capped:
|
||||
id_based_retrieval_yql_section += (
|
||||
f" and {CHUNK_ID} >= {chunk_request.min_chunk_ind or 0}"
|
||||
)
|
||||
id_based_retrieval_yql_section += (
|
||||
f" and {CHUNK_ID} <= {chunk_request.max_chunk_ind}"
|
||||
)
|
||||
|
||||
id_based_retrieval_yql_section += ")"
|
||||
return id_based_retrieval_yql_section
|
104
backend/onyx/document_index/vespa_constants.py
Normal file
104
backend/onyx/document_index/vespa_constants.py
Normal file
@ -0,0 +1,104 @@
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_URL
|
||||
from onyx.configs.app_configs import VESPA_CONFIG_SERVER_HOST
|
||||
from onyx.configs.app_configs import VESPA_HOST
|
||||
from onyx.configs.app_configs import VESPA_PORT
|
||||
from onyx.configs.app_configs import VESPA_TENANT_PORT
|
||||
from onyx.configs.constants import SOURCE_TYPE
|
||||
|
||||
VESPA_DIM_REPLACEMENT_PAT = "VARIABLE_DIM"
|
||||
DANSWER_CHUNK_REPLACEMENT_PAT = "DANSWER_CHUNK_NAME"
|
||||
DOCUMENT_REPLACEMENT_PAT = "DOCUMENT_REPLACEMENT"
|
||||
SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER"
|
||||
DATE_REPLACEMENT = "DATE_REPLACEMENT"
|
||||
SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER"
|
||||
TENANT_ID_PAT = "TENANT_ID_REPLACEMENT"
|
||||
|
||||
TENANT_ID_REPLACEMENT = """field tenant_id type string {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}"""
|
||||
# config server
|
||||
|
||||
|
||||
VESPA_CONFIG_SERVER_URL = (
|
||||
VESPA_CLOUD_URL or f"http://{VESPA_CONFIG_SERVER_HOST}:{VESPA_TENANT_PORT}"
|
||||
)
|
||||
VESPA_APPLICATION_ENDPOINT = f"{VESPA_CONFIG_SERVER_URL}/application/v2"
|
||||
|
||||
# main search application
|
||||
VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}"
|
||||
|
||||
|
||||
# danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd
|
||||
DOCUMENT_ID_ENDPOINT = (
|
||||
f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
|
||||
)
|
||||
|
||||
SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
|
||||
|
||||
NUM_THREADS = (
|
||||
32 # since Vespa doesn't allow batching of inserts / updates, we use threads
|
||||
)
|
||||
MAX_ID_SEARCH_QUERY_SIZE = 400
|
||||
# Suspect that adding too many "or" conditions will cause Vespa to timeout and return
|
||||
# an empty list of hits (with no error status and coverage: 0 and degraded)
|
||||
MAX_OR_CONDITIONS = 10
|
||||
# up from 500ms for now, since we've seen quite a few timeouts
|
||||
# in the long term, we are looking to improve the performance of Vespa
|
||||
# so that we can bring this back to default
|
||||
VESPA_TIMEOUT = "3s"
|
||||
BATCH_SIZE = 128 # Specific to Vespa
|
||||
|
||||
TENANT_ID = "tenant_id"
|
||||
DOCUMENT_ID = "document_id"
|
||||
CHUNK_ID = "chunk_id"
|
||||
BLURB = "blurb"
|
||||
CONTENT = "content"
|
||||
SOURCE_LINKS = "source_links"
|
||||
SEMANTIC_IDENTIFIER = "semantic_identifier"
|
||||
TITLE = "title"
|
||||
SKIP_TITLE_EMBEDDING = "skip_title"
|
||||
SECTION_CONTINUATION = "section_continuation"
|
||||
EMBEDDINGS = "embeddings"
|
||||
TITLE_EMBEDDING = "title_embedding"
|
||||
ACCESS_CONTROL_LIST = "access_control_list"
|
||||
DOCUMENT_SETS = "document_sets"
|
||||
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
|
||||
METADATA = "metadata"
|
||||
METADATA_LIST = "metadata_list"
|
||||
METADATA_SUFFIX = "metadata_suffix"
|
||||
BOOST = "boost"
|
||||
DOC_UPDATED_AT = "doc_updated_at" # Indexed as seconds since epoch
|
||||
PRIMARY_OWNERS = "primary_owners"
|
||||
SECONDARY_OWNERS = "secondary_owners"
|
||||
RECENCY_BIAS = "recency_bias"
|
||||
HIDDEN = "hidden"
|
||||
|
||||
# Specific to Vespa, needed for highlighting matching keywords / section
|
||||
CONTENT_SUMMARY = "content_summary"
|
||||
|
||||
|
||||
YQL_BASE = (
|
||||
f"select "
|
||||
f"documentid, "
|
||||
f"{DOCUMENT_ID}, "
|
||||
f"{CHUNK_ID}, "
|
||||
f"{BLURB}, "
|
||||
f"{CONTENT}, "
|
||||
f"{SOURCE_TYPE}, "
|
||||
f"{SOURCE_LINKS}, "
|
||||
f"{SEMANTIC_IDENTIFIER}, "
|
||||
f"{TITLE}, "
|
||||
f"{SECTION_CONTINUATION}, "
|
||||
f"{BOOST}, "
|
||||
f"{HIDDEN}, "
|
||||
f"{DOC_UPDATED_AT}, "
|
||||
f"{PRIMARY_OWNERS}, "
|
||||
f"{SECONDARY_OWNERS}, "
|
||||
f"{LARGE_CHUNK_REFERENCE_IDS}, "
|
||||
f"{METADATA}, "
|
||||
f"{METADATA_SUFFIX}, "
|
||||
f"{CONTENT_SUMMARY} "
|
||||
f"from {{index_name}} where "
|
||||
)
|