welcome to onyx

This commit is contained in:
pablodanswer
2024-12-13 09:48:43 -08:00
parent 54dcbfa288
commit 21ec5ed795
813 changed files with 7021 additions and 6824 deletions

View File

@@ -0,0 +1,60 @@
import math
import uuid
from sqlalchemy.orm import Session
from onyx.context.search.models import InferenceChunk
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.indexing.models import IndexChunk
DEFAULT_BATCH_SIZE = 30
DEFAULT_INDEX_NAME = "danswer_chunk"
def get_both_index_names(db_session: Session) -> tuple[str, str | None]:
search_settings = get_current_search_settings(db_session)
search_settings_new = get_secondary_search_settings(db_session)
if not search_settings_new:
return search_settings.index_name, None
return search_settings.index_name, search_settings_new.index_name
def translate_boost_count_to_multiplier(boost: int) -> float:
"""Mapping boost integer values to a multiplier according to a sigmoid curve
Piecewise such that at many downvotes, its 0.5x the score and with many upvotes
it is 2x the score. This should be in line with the Vespa calculation."""
# 3 in the equation below stretches it out to hit asymptotes slower
if boost < 0:
# 0.5 + sigmoid -> range of 0.5 to 1
return 0.5 + (1 / (1 + math.exp(-1 * boost / 3)))
# 2 x sigmoid -> range of 1 to 2
return 2 / (1 + math.exp(-1 * boost / 3))
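# Editorial illustration (not part of the original file): boost = 0 maps to a 1.0x multiplier,
# boost = +3 to roughly 1.46x, and boost = -3 to roughly 0.77x, with the curve approaching the
# 2.0x and 0.5x asymptotes for very large positive and negative boost values respectively.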
def get_uuid_from_chunk(
chunk: IndexChunk | InferenceChunk, mini_chunk_ind: int = 0
) -> uuid.UUID:
doc_str = (
chunk.document_id
if isinstance(chunk, InferenceChunk)
else chunk.source_document.id
)
# Web parsing URL duplicate catching
if doc_str and doc_str[-1] == "/":
doc_str = doc_str[:-1]
unique_identifier_string = "_".join(
[doc_str, str(chunk.chunk_id), str(mini_chunk_ind)]
)
if chunk.large_chunk_reference_ids:
unique_identifier_string += "_large" + "_".join(
[
str(referenced_chunk_id)
for referenced_chunk_id in chunk.large_chunk_reference_ids
]
)
return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
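# Editorial note (not part of the original file): uuid5 is deterministic, so the same
# document_id / chunk_id / mini_chunk_ind combination always yields the same UUID, which is
# presumably what allows re-indexed chunks to overwrite their previous versions in place.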

View File

@@ -0,0 +1,32 @@
from sqlalchemy.orm import Session
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.vespa.index import VespaIndex
from shared_configs.configs import MULTI_TENANT
def get_default_document_index(
primary_index_name: str,
secondary_index_name: str | None,
) -> DocumentIndex:
"""Primary index is the index that is used for querying/updating etc.
Secondary index is for when both the currently used index and the upcoming
index both need to be updated, updates are applied to both indices"""
# Currently only supporting Vespa
return VespaIndex(
index_name=primary_index_name,
secondary_index_name=secondary_index_name,
multitenant=MULTI_TENANT,
)
def get_current_primary_default_document_index(db_session: Session) -> DocumentIndex:
"""
TODO: Use redis to cache this or something
"""
search_settings = get_current_search_settings(db_session)
return get_default_document_index(
primary_index_name=search_settings.index_name,
secondary_index_name=None,
)

View File

@@ -0,0 +1,399 @@
import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from onyx.access.models import DocumentAccess
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.model_server_models import Embedding
@dataclass(frozen=True)
class DocumentInsertionRecord:
document_id: str
already_existed: bool
@dataclass(frozen=True)
class VespaChunkRequest:
document_id: str
min_chunk_ind: int | None = None
max_chunk_ind: int | None = None
@property
def is_capped(self) -> bool:
# If the max chunk index is not None, then the chunk request is capped
# If the min chunk index is None, we can assume the min is 0
return self.max_chunk_ind is not None
@property
def range(self) -> int | None:
if self.max_chunk_ind is not None:
return (self.max_chunk_ind - (self.min_chunk_ind or 0)) + 1
return None
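# Editorial example (not part of the original file): VespaChunkRequest("doc", min_chunk_ind=2,
# max_chunk_ind=5) is capped and spans (5 - 2) + 1 = 4 chunks, while a request with
# max_chunk_ind=None is uncapped and has a range of None.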
@dataclass
class DocumentMetadata:
"""
Document information that needs to be inserted into Postgres on first time encountering this
document during indexing across any of the connectors.
"""
connector_id: int
credential_id: int
document_id: str
semantic_identifier: str
first_link: str
doc_updated_at: datetime | None = None
# Emails, not necessarily attached to users
# Users may not be in Onyx
primary_owners: list[str] | None = None
secondary_owners: list[str] | None = None
from_ingestion_api: bool = False
@dataclass
class VespaDocumentFields:
"""
Specifies fields in Vespa for a document. Fields set to None will be ignored.
Perhaps we should name this in an implementation agnostic fashion, but it's more
understandable like this for now.
"""
# all other fields except these 4 will always be left alone by the update request
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
hidden: bool | None = None
@dataclass
class UpdateRequest:
"""
For all document_ids, update the allowed_users and the boost to the new values.
Does not update any of the None fields.
"""
document_ids: list[str]
# all other fields except these 4 will always be left alone by the update request
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
hidden: bool | None = None
class Verifiable(abc.ABC):
"""
Class must implement document index schema verification. For example, verify that all of the
necessary attributes for indexing, querying, filtering, and the fields to return from search
are valid in the schema.
Parameters:
- index_name: The name of the primary index currently used for querying
- secondary_index_name: The name of the secondary index being built in the background, if it
currently exists. Some functions on the document index act on both the primary and
secondary index, some act on just one.
"""
@abc.abstractmethod
def __init__(
self,
index_name: str,
secondary_index_name: str | None,
*args: Any,
**kwargs: Any
) -> None:
super().__init__(*args, **kwargs)
self.index_name = index_name
self.secondary_index_name = secondary_index_name
@abc.abstractmethod
def ensure_indices_exist(
self,
index_embedding_dim: int,
secondary_index_embedding_dim: int | None,
) -> None:
"""
Verify that the document index exists and is consistent with the expectations in the code.
Parameters:
- index_embedding_dim: Vector dimensionality for the vector similarity part of the search
- secondary_index_embedding_dim: Vector dimensionality of the secondary index being built
behind the scenes. The secondary index should only be built when switching
embedding models, therefore this dim should be different from the primary index.
"""
raise NotImplementedError
@staticmethod
@abc.abstractmethod
def register_multitenant_indices(
indices: list[str],
embedding_dims: list[int],
) -> None:
"""
Register multitenant indices with the document index.
"""
raise NotImplementedError
class Indexable(abc.ABC):
"""
Class must implement the ability to index document chunks
"""
@abc.abstractmethod
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
fresh_index: bool = False,
) -> set[DocumentInsertionRecord]:
"""
Takes a list of document chunks and indexes them in the document index
NOTE: When a document is reindexed/updated here, it must clear all of the existing document
chunks before reindexing. This is because the document may have gotten shorter since the
last run. Therefore, upserting the first 0 through n chunks may leave some old chunks that
have not been written over.
NOTE: The chunks of a document are never separated into separate index() calls. So there is
no worry of receiving the first 0 through n chunks in one index call and the next n through
m chunks of a document in the next index call.
NOTE: Due to some asymmetry between the primary and secondary indexing logic, this function
only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
it is done automatically outside of this code.
NOTE: The fresh_index parameter, when set to True, assumes no documents have been previously
indexed for the given index/tenant. This can be used to optimize the indexing process for
new or empty indices.
Parameters:
- chunks: Document chunks with all of the information needed for indexing to the document
index.
- fresh_index: Boolean indicating whether this is a fresh index with no existing documents.
Returns:
A set of insertion records, one per unique document id, used for deduping chunks
when updating, as well as indicating whether the document was newly indexed or
already existed and was just updated
"""
raise NotImplementedError
class Deletable(abc.ABC):
"""
Class must implement the ability to delete documents by their unique document ids.
"""
@abc.abstractmethod
def delete_single(self, doc_id: str) -> int:
"""
Given a single document id, hard delete it from the document index
Parameters:
- doc_id: document id as specified by the connector
"""
raise NotImplementedError
@abc.abstractmethod
def delete(self, doc_ids: list[str]) -> None:
"""
Given a list of document ids, hard delete them from the document index
Parameters:
- doc_ids: list of document ids as specified by the connector
"""
raise NotImplementedError
class Updatable(abc.ABC):
"""
Class must implement the ability to update certain attributes of a document without needing to
update all of the fields. Specifically, needs to be able to update:
- Access Control List
- Document-set membership
- Boost value (learning from feedback mechanism)
- Whether the document is hidden or not; hidden documents are not returned from search
"""
@abc.abstractmethod
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
"""
Updates all chunks for a document with the specified fields.
None values mean that the field does not need an update.
The rationale for a single update function is that it allows retries and parallelism
to happen at a higher / more strategic level, is simpler to read, and allows
us to individually handle error conditions per document.
Parameters:
- fields: the fields to update in the document. Any field set to None will not be changed.
Return:
the number of chunks updated
"""
raise NotImplementedError
@abc.abstractmethod
def update(self, update_requests: list[UpdateRequest]) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
None values mean that the field does not need an update.
Parameters:
- update_requests: for a list of document ids in the update request, apply the same updates
to all of the documents with those ids. This is for bulk handling efficiency. Many
updates are done at the connector level, which may involve many documents for that connector
"""
raise NotImplementedError
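# Editorial usage sketch (not part of the original file; names and values are hypothetical):
# hide two documents and move them into a document set with a single bulk request.
# request = UpdateRequest(
#     document_ids=["doc-1", "doc-2"],
#     document_sets={"engineering"},
#     hidden=True,
# )
# document_index.update([request])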
class IdRetrievalCapable(abc.ABC):
"""
Class must implement the ability to retrieve either:
- all of the chunks of a document IN ORDER given a document id.
- a specific chunk given a document id and a chunk index (0 based)
"""
@abc.abstractmethod
def id_based_retrieval(
self,
chunk_requests: list[VespaChunkRequest],
filters: IndexFilters,
batch_retrieval: bool = False,
) -> list[InferenceChunkUncleaned]:
"""
Fetch chunk(s) based on document id
NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
of a document. Downstream currently assumes that the chunking does not introduce overlaps
between the chunks. If there are overlaps for the chunks, then the reconstructed document
or extended section will have duplicate segments.
Parameters:
- chunk_requests: requests containing the document id and the chunk range to retrieve
- filters: Filters to apply to retrieval
- batch_retrieval: If True, perform a batch retrieval
Returns:
list of chunks for the document id or the specific chunk by the specified chunk index
and document id
"""
raise NotImplementedError
class HybridCapable(abc.ABC):
"""
Class must implement hybrid (keyword + vector) search functionality
"""
@abc.abstractmethod
def hybrid_retrieval(
self,
query: str,
query_embedding: Embedding,
final_keywords: list[str] | None,
filters: IndexFilters,
hybrid_alpha: float,
time_decay_multiplier: float,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunkUncleaned]:
"""
Run hybrid search and return a list of inference chunks.
NOTE: the query passed in here is the unprocessed plain text query. Preprocessing is
expected to be handled by this function as it may depend on the index implementation.
Things like query expansion, synonym injection, stop word removal, lemmatization, etc. are
done here.
Parameters:
- query: unmodified user query. This is needed for getting the matching highlighted
keywords
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- final_keywords: Final keywords to be used from the query, defaults to query if not set
- filters: standard filter object
- hybrid_alpha: weighting between the keyword and vector search results. It is important
that the two scores are normalized to the same range so that a meaningful
comparison can be made. 1 for 100% weighting on vector score, 0 for 100% weighting
on keyword score.
- time_decay_multiplier: how much to decay the document scores as they age. Some queries,
based on the persona settings, will have this be 2x or 3x the default
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
best matching chunks based on weighted sum of keyword and vector/semantic search scores
"""
raise NotImplementedError
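# Editorial sketch of how the hybrid score is typically combined (an assumption mirroring the
# Vespa rank profile shipped alongside this interface; title_content_ratio is an index-side
# parameter rather than an argument of this method):
#   vector_score  = ratio * title_similarity + (1 - ratio) * content_similarity
#   keyword_score = ratio * bm25(title)      + (1 - ratio) * bm25(content)
#   final_score   = (hybrid_alpha * norm(vector_score)
#                    + (1 - hybrid_alpha) * norm(keyword_score)) * document_boost * recency_bias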
class AdminCapable(abc.ABC):
"""
Class must implement a search for the admin "Explorer" page. The assumption here is that the
admin is not "searching" for knowledge but has some document already in mind. They are either
looking to positively boost it because they know it's a good reference document, looking to
negatively boost it as a way of "deprecating" it, or looking to hide the document.
Assuming the admin knows the document name, this search has high emphasis on the title match.
Suggested implementation:
Keyword only, BM25 search with 5x weighting on the title field compared to the contents
"""
@abc.abstractmethod
def admin_retrieval(
self,
query: str,
filters: IndexFilters,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunkUncleaned]:
"""
Run the special search for the admin document explorer page
Parameters:
- query: unmodified user query. In this flow, leaving it unmodified is probably best
- filters: standard filter object
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
list of best matching chunks for the explorer page query
"""
raise NotImplementedError
class BaseIndex(
Verifiable,
Indexable,
Updatable,
Deletable,
AdminCapable,
IdRetrievalCapable,
abc.ABC,
):
"""
All basic document index functionalities excluding the actual querying approach.
As a summary, document indices need to be able to
- Verify the schema definition is valid
- Index new documents
- Update specific attributes of existing documents
- Delete documents
- Provide a search for the admin document explorer page
- Retrieve documents based on document id
"""
class DocumentIndex(HybridCapable, BaseIndex, abc.ABC):
"""
A valid document index that can plug into all Onyx flows must implement all of these
functionalities, though "technically" it does not need to be keyword or vector capable as
currently all default search flows use Hybrid Search.
"""

View File

@@ -0,0 +1,221 @@
schema DANSWER_CHUNK_NAME {
document DANSWER_CHUNK_NAME {
TENANT_ID_REPLACEMENT
# Not to be confused with the UUID generated for this chunk which is called documentid by default
field document_id type string {
indexing: summary | attribute
attribute: fast-search
rank: filter
}
field chunk_id type int {
indexing: summary | attribute
}
# Displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# Must have an additional field for whether to skip title embeddings
# This information cannot be extracted from either the title field or the title embedding
field skip_title type bool {
indexing: attribute
}
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index | attribute
index: enable-bm25
}
field content type string {
indexing: summary | index
index: enable-bm25
}
# duplication of `content` is far from ideal, but is needed for
# non-gram based highlighting for now. If the capability to re-use a
# single field to do both is added, `content_summary` should be removed
field content_summary type string {
indexing: summary | index
summary: dynamic
}
# Title embedding (x1)
field title_embedding type tensor<float>(x[VARIABLE_DIM]) {
indexing: attribute | index
attribute {
distance-metric: angular
}
}
# Content embeddings (chunk + optional mini chunks embeddings)
# "t" and "x" are arbitrary names, not special keywords
field embeddings type tensor<float>(t{},x[VARIABLE_DIM]) {
indexing: attribute | index
attribute {
distance-metric: angular
}
}
# Starting section of the doc, currently unused as it has been replaced by match highlighting
field blurb type string {
indexing: summary | attribute
}
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# Can also index links https://docs.vespa.ai/en/reference/schema-reference.html#attribute
# URL type matching
field source_links type string {
indexing: summary | attribute
}
field section_continuation type bool {
indexing: summary | attribute
}
# Technically this one should be int, but can't change without causing breaks to existing index
field boost type float {
indexing: summary | attribute
}
field hidden type bool {
indexing: summary | attribute
rank: filter
}
# Needs to have a separate Attribute list for efficient filtering
field metadata_list type array<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# If chunk is a large chunk, this will contain the ids of the smaller chunks
field large_chunk_reference_ids type array<int> {
indexing: summary | attribute
}
field metadata type string {
indexing: summary | attribute
}
field metadata_suffix type string {
indexing: summary | attribute
}
field doc_updated_at type int {
indexing: summary | attribute
}
field primary_owners type array<string> {
indexing: summary | attribute
}
field secondary_owners type array<string> {
indexing: summary | attribute
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
# If using different tokenization settings, the fieldset has to be removed, and the field must
# be specified in the yql like:
# + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
# + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
# Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
fieldset default {
fields: content, title
}
rank-profile default_rank {
inputs {
query(decay_factor) float
}
function inline document_boost() {
# 0.5 to 2x score: piecewise sigmoid function stretched out by factor of 3
# meaning requires 3x the number of feedback votes to have default sigmoid effect
expression: if(attribute(boost) < 0, 0.5 + (1 / (1 + exp(-attribute(boost) / 3))), 2 / (1 + exp(-attribute(boost) / 3)))
}
function inline document_age() {
# Time in years (91.3 days ~= 3 Months ~= 1 fiscal quarter if no age found)
expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
}
# Document score decays from 1 to 0.75 as age of last updated time increases
function inline recency_bias() {
expression: max(1 / (1 + query(decay_factor) * document_age), 0.75)
}
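# Worked example (editorial, illustrative only): a document last updated one year ago gives a
# document_age of about 1.0; with query(decay_factor) = 0.5 the raw bias is
# 1 / (1 + 0.5 * 1.0) = 0.67, which the max() clamps up to the 0.75 floor.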
match-features: recency_bias
}
rank-profile hybrid_searchVARIABLE_DIM inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
}
function title_vector_score() {
expression {
# If no good matching titles, then it should use the context embeddings rather than having some
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
# matching content score getting the full score
max(closeness(field, embeddings), closeness(field, title_embedding))
}
}
# First phase must be vector to allow hits that have no keyword matches
first-phase {
expression: closeness(field, embeddings)
}
# Weighted average between Vector Search and BM-25
global-phase {
expression {
(
# Weighted Vector Similarity Score
(
query(alpha) * (
(query(title_content_ratio) * normalize_linear(title_vector_score))
+
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
)
)
+
# Weighted Keyword Similarity Score
# Note: for the BM25 Title score, it requires decent stopword removal in the query
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
(
(1 - query(alpha)) * (
(query(title_content_ratio) * normalize_linear(bm25(title)))
+
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
)
)
)
# Boost based on user feedback
* document_boost
# Decay factor based on time document was last updated
* recency_bias
}
rerank-count: 1000
}
match-features {
bm25(title)
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
document_boost
recency_bias
closest(embeddings)
}
}
# Used when searching from the admin UI for a specific doc to hide / boost
# Very heavily prioritize title
rank-profile admin_search inherits default, default_rank {
first-phase {
expression: bm25(content) + (5 * bm25(title))
}
}
}

View File

@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="utf-8" ?>
<services version="1.0">
<container id="default" version="1.0">
<document-api/>
<search/>
<http>
<server id="default" port="8081"/>
</http>
<nodes>
<node hostalias="danswer-node" />
</nodes>
</container>
<content id="danswer_index" version="1.0">
<redundancy>1</redundancy>
<documents>
<!-- <document type="danswer_chunk" mode="index" /> -->
DOCUMENT_REPLACEMENT
</documents>
<nodes>
<node hostalias="danswer-node" distribution-key="0" />
</nodes>
<tuning>
<resource-limits>
<!-- Default is 75% but this can be increased for Dockerized deployments -->
<!-- https://docs.vespa.ai/en/operations/feed-block.html -->
<disk>0.75</disk>
</resource-limits>
</tuning>
<engine>
<proton>
<tuning>
<searchnode>
<requestthreads>
<persearch>SEARCH_THREAD_NUMBER</persearch>
</requestthreads>
</searchnode>
</tuning>
</proton>
</engine>
<config name="vespa.config.search.summary.juniperrc">
<max_matches>3</max_matches>
<length>750</length>
<surround_max>350</surround_max>
<min_length>300</min_length>
</config>
</content>
</services>

View File

@@ -0,0 +1,8 @@
<validation-overrides>
<allow
until="DATE_REPLACEMENT"
comment="We need to be able to create/delete indices for swapping models">schema-removal</allow>
<allow
until="DATE_REPLACEMENT"
comment="We need to be able to update the schema for updates to the Onyx schema">indexing-change</allow>
</validation-overrides>

View File

@@ -0,0 +1,430 @@
import json
import string
from collections.abc import Callable
from collections.abc import Mapping
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
import httpx
from retry import retry
from onyx.configs.app_configs import LOG_VESPA_TIMING_INFORMATION
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
build_vespa_filters,
)
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
build_vespa_id_based_retrieval_yql,
)
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import BLURB
from onyx.document_index.vespa_constants import BOOST
from onyx.document_index.vespa_constants import CHUNK_ID
from onyx.document_index.vespa_constants import CONTENT
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import MAX_ID_SEARCH_QUERY_SIZE
from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
from onyx.document_index.vespa_constants import METADATA
from onyx.document_index.vespa_constants import METADATA_SUFFIX
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
from onyx.document_index.vespa_constants import RECENCY_BIAS
from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
from onyx.document_index.vespa_constants import SECONDARY_OWNERS
from onyx.document_index.vespa_constants import SECTION_CONTINUATION
from onyx.document_index.vespa_constants import SEMANTIC_IDENTIFIER
from onyx.document_index.vespa_constants import SOURCE_LINKS
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import YQL_BASE
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
logger = setup_logger()
def _process_dynamic_summary(
dynamic_summary: str, max_summary_length: int = 400
) -> list[str]:
if not dynamic_summary:
return []
current_length = 0
processed_summary: list[str] = []
for summary_section in dynamic_summary.split("<sep />"):
# if we're past the desired max length, break at the last word
if current_length + len(summary_section) >= max_summary_length:
summary_section = summary_section[: max_summary_length - current_length]
summary_section = summary_section.lstrip() # remove any leading whitespace
# handle the case where the truncated section is either just a
# single (partial) word or if it's empty
first_space = summary_section.find(" ")
if first_space == -1:
# add ``...`` to previous section
if processed_summary:
processed_summary[-1] += "..."
break
# handle the valid truncated section case
summary_section = summary_section.rsplit(" ", 1)[0]
if summary_section[-1] in string.punctuation:
summary_section = summary_section[:-1]
summary_section += "..."
processed_summary.append(summary_section)
break
processed_summary.append(summary_section)
current_length += len(summary_section)
return processed_summary
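# Editorial trace (not part of the original file): with max_summary_length=10,
# _process_dynamic_summary("hello world<sep />foo bar", 10) truncates the first section at the
# last complete word, appends an ellipsis, and returns ["hello..."].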
def _vespa_hit_to_inference_chunk(
hit: dict[str, Any], null_score: bool = False
) -> InferenceChunkUncleaned:
fields = cast(dict[str, Any], hit["fields"])
# parse fields that are stored as strings, but are really json / datetime
metadata = json.loads(fields[METADATA]) if METADATA in fields else {}
updated_at = (
datetime.fromtimestamp(fields[DOC_UPDATED_AT], tz=timezone.utc)
if DOC_UPDATED_AT in fields
else None
)
match_highlights = _process_dynamic_summary(
# fallback to regular `content` if the `content_summary` field
# isn't present
dynamic_summary=hit["fields"].get(CONTENT_SUMMARY, hit["fields"][CONTENT]),
)
semantic_identifier = fields.get(SEMANTIC_IDENTIFIER, "")
if not semantic_identifier:
logger.error(
f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
source_links = fields.get(SOURCE_LINKS, {})
source_links_dict_unprocessed = (
json.loads(source_links) if isinstance(source_links, str) else source_links
)
source_links_dict = {
int(k): v
for k, v in cast(dict[str, str], source_links_dict_unprocessed).items()
}
return InferenceChunkUncleaned(
chunk_id=fields[CHUNK_ID],
blurb=fields.get(BLURB, ""), # Unused
content=fields[CONTENT], # Includes extra title prefix and metadata suffix
source_links=source_links_dict or {0: ""},
section_continuation=fields[SECTION_CONTINUATION],
document_id=fields[DOCUMENT_ID],
source_type=fields[SOURCE_TYPE],
title=fields.get(TITLE),
semantic_identifier=fields[SEMANTIC_IDENTIFIER],
boost=fields.get(BOOST, 1),
recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
score=None if null_score else hit.get("relevance", 0),
hidden=fields.get(HIDDEN, False),
primary_owners=fields.get(PRIMARY_OWNERS),
secondary_owners=fields.get(SECONDARY_OWNERS),
large_chunk_reference_ids=fields.get(LARGE_CHUNK_REFERENCE_IDS, []),
metadata=metadata,
metadata_suffix=fields.get(METADATA_SUFFIX),
match_highlights=match_highlights,
updated_at=updated_at,
)
def _get_chunks_via_visit_api(
chunk_request: VespaChunkRequest,
index_name: str,
filters: IndexFilters,
field_names: list[str] | None = None,
get_large_chunks: bool = False,
) -> list[dict]:
# Constructing the URL for the Visit API
# NOTE: visit API uses the same URL as the document API, but with different params
url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
# build the list of fields to retrieve
field_set_list = (
None
if not field_names
else [f"{index_name}:{field_name}" for field_name in field_names]
)
acl_fieldset_entry = f"{index_name}:{ACCESS_CONTROL_LIST}"
if (
field_set_list
and filters.access_control_list
and acl_fieldset_entry not in field_set_list
):
field_set_list.append(acl_fieldset_entry)
field_set = ",".join(field_set_list) if field_set_list else None
# build filters
selection = f"{index_name}.document_id=='{chunk_request.document_id}'"
if chunk_request.is_capped:
selection += f" and {index_name}.chunk_id>={chunk_request.min_chunk_ind or 0}"
selection += f" and {index_name}.chunk_id<={chunk_request.max_chunk_ind}"
if not get_large_chunks:
selection += f" and {index_name}.large_chunk_reference_ids == null"
# Setting up the selection criteria in the query parameters
params = {
# NOTE: Document Selector Language doesn't allow `contains`, so we can't check
# for the ACL in the selection. Instead, we have to check as a postfilter
"selection": selection,
"continuation": None,
"wantedDocumentCount": 1_000,
"fieldSet": field_set,
}
document_chunks: list[dict] = []
while True:
try:
filtered_params = {k: v for k, v in params.items() if v is not None}
with get_vespa_http_client() as http_client:
response = http_client.get(url, params=filtered_params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
)
raise httpx.HTTPError(error_base) from e
# Check if the response contains any documents
response_data = response.json()
if "documents" in response_data:
for document in response_data["documents"]:
if filters.access_control_list:
document_acl = document["fields"].get(ACCESS_CONTROL_LIST)
if not document_acl or not any(
user_acl_entry in document_acl
for user_acl_entry in filters.access_control_list
):
continue
document_chunks.append(document)
# Check for continuation token to handle pagination
if "continuation" in response_data and response_data["continuation"]:
params["continuation"] = response_data["continuation"]
else:
break # Exit loop if no continuation token
return document_chunks
@retry(tries=10, delay=1, backoff=2)
def get_all_vespa_ids_for_document_id(
document_id: str,
index_name: str,
filters: IndexFilters | None = None,
get_large_chunks: bool = False,
) -> list[str]:
document_chunks = _get_chunks_via_visit_api(
chunk_request=VespaChunkRequest(document_id=document_id),
index_name=index_name,
filters=filters or IndexFilters(access_control_list=None),
field_names=[DOCUMENT_ID],
get_large_chunks=get_large_chunks,
)
return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]
def parallel_visit_api_retrieval(
index_name: str,
chunk_requests: list[VespaChunkRequest],
filters: IndexFilters,
get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
functions_with_args: list[tuple[Callable, tuple]] = [
(
_get_chunks_via_visit_api,
(chunk_request, index_name, filters, get_large_chunks),
)
for chunk_request in chunk_requests
]
parallel_results = run_functions_tuples_in_parallel(
functions_with_args, allow_failures=True
)
# Any failures to retrieve would give a None, drop the Nones and empty lists
vespa_chunk_sets = [res for res in parallel_results if res]
flattened_vespa_chunks = []
for chunk_set in vespa_chunk_sets:
flattened_vespa_chunks.extend(chunk_set)
inference_chunks = [
_vespa_hit_to_inference_chunk(chunk, null_score=True)
for chunk in flattened_vespa_chunks
]
return inference_chunks
@retry(tries=3, delay=1, backoff=2)
def query_vespa(
query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
params = dict(
**query_params,
**{
"presentation.timing": True,
}
if LOG_VESPA_TIMING_INFORMATION
else {},
)
try:
with get_vespa_http_client() as http_client:
response = http_client.post(SEARCH_ENDPOINT, json=params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
)
raise httpx.HTTPError(error_base) from e
response_json: dict[str, Any] = response.json()
if LOG_VESPA_TIMING_INFORMATION:
logger.debug("Vespa timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
if not hits:
logger.warning(
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
)
logger.debug(f"Vespa Response: {response.text}")
for hit in hits:
if hit["fields"].get(CONTENT) is None:
identifier = hit["fields"].get("documentid") or hit["id"]
logger.error(
f"Vespa Index with Vespa ID {identifier} has no contents. "
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
f"fetch this document"
)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
# Good Debugging Spot
return inference_chunks
def _get_chunks_via_batch_search(
index_name: str,
chunk_requests: list[VespaChunkRequest],
filters: IndexFilters,
get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
if not chunk_requests:
return []
filters_str = build_vespa_filters(filters=filters, include_hidden=True)
yql = (
YQL_BASE.format(index_name=index_name)
+ filters_str
+ build_vespa_id_based_retrieval_yql(chunk_requests[0])
)
chunk_requests.pop(0)
for request in chunk_requests:
yql += " or " + build_vespa_id_based_retrieval_yql(request)
params: dict[str, str | int | float] = {
"yql": yql,
"hits": MAX_ID_SEARCH_QUERY_SIZE,
}
inference_chunks = query_vespa(params)
if not get_large_chunks:
inference_chunks = [
chunk for chunk in inference_chunks if not chunk.large_chunk_reference_ids
]
inference_chunks.sort(key=lambda chunk: chunk.chunk_id)
return inference_chunks
def batch_search_api_retrieval(
index_name: str,
chunk_requests: list[VespaChunkRequest],
filters: IndexFilters,
get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
retrieved_chunks: list[InferenceChunkUncleaned] = []
capped_requests: list[VespaChunkRequest] = []
uncapped_requests: list[VespaChunkRequest] = []
chunk_count = 0
for req_ind, request in enumerate(chunk_requests, start=1):
# All requests without a chunk range are uncapped
# Uncapped requests are retrieved using the Visit API
range = request.range
if range is None:
uncapped_requests.append(request)
continue
if (
chunk_count + range > MAX_ID_SEARCH_QUERY_SIZE
or req_ind % MAX_OR_CONDITIONS == 0
):
retrieved_chunks.extend(
_get_chunks_via_batch_search(
index_name=index_name,
chunk_requests=capped_requests,
filters=filters,
get_large_chunks=get_large_chunks,
)
)
capped_requests = []
chunk_count = 0
capped_requests.append(request)
chunk_count += range
if capped_requests:
retrieved_chunks.extend(
_get_chunks_via_batch_search(
index_name=index_name,
chunk_requests=capped_requests,
filters=filters,
get_large_chunks=get_large_chunks,
)
)
if uncapped_requests:
logger.debug(f"Retrieving {len(uncapped_requests)} uncapped requests")
retrieved_chunks.extend(
parallel_visit_api_retrieval(
index_name, uncapped_requests, filters, get_large_chunks
)
)
return retrieved_chunks

View File

@@ -0,0 +1,65 @@
import concurrent.futures
import httpx
from retry import retry
from onyx.document_index.vespa.chunk_retrieval import (
get_all_vespa_ids_for_document_id,
)
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import NUM_THREADS
from onyx.utils.logger import setup_logger
logger = setup_logger()
CONTENT_SUMMARY = "content_summary"
@retry(tries=3, delay=1, backoff=2)
def _delete_vespa_doc_chunks(
document_id: str, index_name: str, http_client: httpx.Client
) -> None:
doc_chunk_ids = get_all_vespa_ids_for_document_id(
document_id=document_id,
index_name=index_name,
get_large_chunks=True,
)
for chunk_id in doc_chunk_ids:
try:
res = http_client.delete(
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{chunk_id}"
)
res.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(f"Failed to delete chunk, details: {e.response.text}")
raise
def delete_vespa_docs(
document_ids: list[str],
index_name: str,
http_client: httpx.Client,
executor: concurrent.futures.ThreadPoolExecutor | None = None,
) -> None:
external_executor = True
if not executor:
external_executor = False
executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
try:
doc_deletion_future = {
executor.submit(
_delete_vespa_doc_chunks, doc_id, index_name, http_client
): doc_id
for doc_id in document_ids
}
for future in concurrent.futures.as_completed(doc_deletion_future):
# Will raise exception if the deletion raised an exception
future.result()
finally:
if not external_executor:
executor.shutdown(wait=True)

View File

@@ -0,0 +1,915 @@
import concurrent.futures
import io
import logging
import os
import re
import time
import urllib
import zipfile
from dataclasses import dataclass
from datetime import datetime
from datetime import timedelta
from typing import BinaryIO
from typing import cast
from typing import List
import httpx # type: ignore
import requests # type: ignore
from onyx.configs.app_configs import DOCUMENT_INDEX_NAME
from onyx.configs.chat_configs import DOC_TIME_DECAY
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
from onyx.configs.constants import KV_REINDEX_KEY
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentInsertionRecord
from onyx.document_index.interfaces import UpdateRequest
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
from onyx.document_index.vespa.chunk_retrieval import (
get_all_vespa_ids_for_document_id,
)
from onyx.document_index.vespa.chunk_retrieval import (
parallel_visit_api_retrieval,
)
from onyx.document_index.vespa.chunk_retrieval import query_vespa
from onyx.document_index.vespa.deletion import delete_vespa_docs
from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
from onyx.document_index.vespa.indexing_utils import (
get_existing_documents_from_chunks,
)
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
)
from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
build_vespa_filters,
)
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import BATCH_SIZE
from onyx.document_index.vespa_constants import BOOST
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
from onyx.document_index.vespa_constants import DANSWER_CHUNK_REPLACEMENT_PAT
from onyx.document_index.vespa_constants import DATE_REPLACEMENT
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import DOCUMENT_REPLACEMENT_PAT
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import NUM_THREADS
from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
from onyx.document_index.vespa_constants import TENANT_ID_PAT
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
from onyx.document_index.vespa_constants import VESPA_TIMEOUT
from onyx.document_index.vespa_constants import YQL_BASE
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.key_value_store.factory import get_kv_store
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
from shared_configs.model_server_models import Embedding
logger = setup_logger()
# Set the logging level to WARNING to ignore INFO and DEBUG logs
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)
@dataclass
class _VespaUpdateRequest:
document_id: str
url: str
update_request: dict[str, dict]
def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO:
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
for filename, content in file_contents.items():
zipf.writestr(filename, content)
zip_buffer.seek(0)
return zip_buffer
def _create_document_xml_lines(doc_names: list[str | None] | list[str]) -> str:
doc_lines = [
f'<document type="{doc_name}" mode="index" />'
for doc_name in doc_names
if doc_name
]
return "\n".join(doc_lines)
def add_ngrams_to_schema(schema_content: str) -> str:
# Add the match blocks containing gram and gram-size to title and content fields
schema_content = re.sub(
r"(field title type string \{[^}]*indexing: summary \| index \| attribute)",
r"\1\n match {\n gram\n gram-size: 3\n }",
schema_content,
)
schema_content = re.sub(
r"(field content type string \{[^}]*indexing: summary \| index)",
r"\1\n match {\n gram\n gram-size: 3\n }",
schema_content,
)
return schema_content
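# Editorial sketch (not part of the original file) of the transformation above: the regexes
# append a match block to the captured field declarations, e.g. the title field gains
#   match {
#   gram
#   gram-size: 3
#   }
# right after its indexing line, enabling trigram (3-gram) matching for partial keyword hits.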
class VespaIndex(DocumentIndex):
def __init__(
self,
index_name: str,
secondary_index_name: str | None,
multitenant: bool = False,
) -> None:
self.index_name = index_name
self.secondary_index_name = secondary_index_name
self.multitenant = multitenant
self.http_client = get_vespa_http_client()
def ensure_indices_exist(
self,
index_embedding_dim: int,
secondary_index_embedding_dim: int | None,
) -> None:
if MULTI_TENANT:
logger.info(
"Skipping Vespa index seup for multitenant (would wipe all indices)"
)
return None
deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate"
logger.notice(f"Deploying Vespa application package to {deploy_url}")
vespa_schema_path = os.path.join(
os.getcwd(), "onyx", "document_index", "vespa", "app_config"
)
schema_file = os.path.join(vespa_schema_path, "schemas", "danswer_chunk.sd")
services_file = os.path.join(vespa_schema_path, "services.xml")
overrides_file = os.path.join(vespa_schema_path, "validation-overrides.xml")
with open(services_file, "r") as services_f:
services_template = services_f.read()
schema_names = [self.index_name, self.secondary_index_name]
doc_lines = _create_document_xml_lines(schema_names)
services = services_template.replace(DOCUMENT_REPLACEMENT_PAT, doc_lines)
services = services.replace(
SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS)
)
kv_store = get_kv_store()
needs_reindexing = False
try:
needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY))
except Exception:
logger.debug("Could not load the reindexing flag. Using ngrams")
with open(overrides_file, "r") as overrides_f:
overrides_template = overrides_f.read()
# Vespa requires an override to erase data, including the indices we're no longer using
# It also caps the override at 30 days out, so we set it to 7 days dynamically
now = datetime.now()
date_in_7_days = now + timedelta(days=7)
formatted_date = date_in_7_days.strftime("%Y-%m-%d")
overrides = overrides_template.replace(DATE_REPLACEMENT, formatted_date)
zip_dict = {
"services.xml": services.encode("utf-8"),
"validation-overrides.xml": overrides.encode("utf-8"),
}
with open(schema_file, "r") as schema_f:
schema_template = schema_f.read()
schema_template = schema_template.replace(TENANT_ID_PAT, "")
schema = schema_template.replace(
DANSWER_CHUNK_REPLACEMENT_PAT, self.index_name
).replace(VESPA_DIM_REPLACEMENT_PAT, str(index_embedding_dim))
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
schema = schema.replace(TENANT_ID_PAT, "")
zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")
if self.secondary_index_name:
upcoming_schema = schema_template.replace(
DANSWER_CHUNK_REPLACEMENT_PAT, self.secondary_index_name
).replace(VESPA_DIM_REPLACEMENT_PAT, str(secondary_index_embedding_dim))
zip_dict[f"schemas/{schema_names[1]}.sd"] = upcoming_schema.encode("utf-8")
zip_file = in_memory_zip_from_file_bytes(zip_dict)
headers = {"Content-Type": "application/zip"}
response = requests.post(deploy_url, headers=headers, data=zip_file)
if response.status_code != 200:
raise RuntimeError(
f"Failed to prepare Vespa Onyx Index. Response: {response.text}"
)
@staticmethod
def register_multitenant_indices(
indices: list[str],
embedding_dims: list[int],
) -> None:
if not MULTI_TENANT:
raise ValueError("Multi-tenant is not enabled")
deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate"
logger.info(f"Deploying Vespa application package to {deploy_url}")
vespa_schema_path = os.path.join(
os.getcwd(), "onyx", "document_index", "vespa", "app_config"
)
schema_file = os.path.join(vespa_schema_path, "schemas", "danswer_chunk.sd")
services_file = os.path.join(vespa_schema_path, "services.xml")
overrides_file = os.path.join(vespa_schema_path, "validation-overrides.xml")
with open(services_file, "r") as services_f:
services_template = services_f.read()
# Generate schema names from index settings
schema_names = [index_name for index_name in indices]
full_schemas = schema_names
doc_lines = _create_document_xml_lines(full_schemas)
services = services_template.replace(DOCUMENT_REPLACEMENT_PAT, doc_lines)
services = services.replace(
SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS)
)
kv_store = get_kv_store()
needs_reindexing = False
try:
needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY))
except Exception:
logger.debug("Could not load the reindexing flag. Using ngrams")
with open(overrides_file, "r") as overrides_f:
overrides_template = overrides_f.read()
# Vespa requires an override to erase data, including the indices we're no longer using
# It also caps the override at 30 days out, so we set it to 7 days dynamically
now = datetime.now()
date_in_7_days = now + timedelta(days=7)
formatted_date = date_in_7_days.strftime("%Y-%m-%d")
overrides = overrides_template.replace(DATE_REPLACEMENT, formatted_date)
zip_dict = {
"services.xml": services.encode("utf-8"),
"validation-overrides.xml": overrides.encode("utf-8"),
}
with open(schema_file, "r") as schema_f:
schema_template = schema_f.read()
for i, index_name in enumerate(indices):
embedding_dim = embedding_dims[i]
logger.info(
f"Creating index: {index_name} with embedding dimension: {embedding_dim}"
)
schema = schema_template.replace(
DANSWER_CHUNK_REPLACEMENT_PAT, index_name
).replace(VESPA_DIM_REPLACEMENT_PAT, str(embedding_dim))
schema = schema.replace(
TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
)
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")
zip_file = in_memory_zip_from_file_bytes(zip_dict)
headers = {"Content-Type": "application/zip"}
response = requests.post(deploy_url, headers=headers, data=zip_file)
if response.status_code != 200:
raise RuntimeError(
f"Failed to prepare Vespa Onyx Indexes. Response: {response.text}"
)
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
fresh_index: bool = False,
) -> set[DocumentInsertionRecord]:
"""Receive a list of chunks from a batch of documents and index the chunks into Vespa along
with updating the associated permissions. Assumes that a document will not be split into
multiple chunk batches calling this function multiple times, otherwise only the last set of
chunks will be kept"""
# IMPORTANT: This must be done one index at a time, do not use secondary index here
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]
existing_docs: set[str] = set()
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
with (
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
get_vespa_http_client() as http_client,
):
if not fresh_index:
# Check for existing documents; existing documents need to have all of their chunks deleted
# prior to indexing as the document size (num chunks) may have shrunk
first_chunks = [
chunk for chunk in cleaned_chunks if chunk.chunk_id == 0
]
for chunk_batch in batch_generator(first_chunks, BATCH_SIZE):
existing_docs.update(
get_existing_documents_from_chunks(
chunks=chunk_batch,
index_name=self.index_name,
http_client=http_client,
executor=executor,
)
)
for doc_id_batch in batch_generator(existing_docs, BATCH_SIZE):
delete_vespa_docs(
document_ids=doc_id_batch,
index_name=self.index_name,
http_client=http_client,
executor=executor,
)
for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
batch_index_vespa_chunks(
chunks=chunk_batch,
index_name=self.index_name,
http_client=http_client,
multitenant=self.multitenant,
executor=executor,
)
all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
return {
DocumentInsertionRecord(
document_id=doc_id,
already_existed=doc_id in existing_docs,
)
for doc_id in all_doc_ids
}
@staticmethod
def _apply_updates_batched(
updates: list[_VespaUpdateRequest],
batch_size: int = BATCH_SIZE,
) -> None:
"""Runs a batch of updates in parallel via the ThreadPoolExecutor."""
def _update_chunk(
update: _VespaUpdateRequest, http_client: httpx.Client
) -> httpx.Response:
logger.debug(
f"Updating with request to {update.url} with body {update.update_request}"
)
return http_client.put(
update.url,
headers={"Content-Type": "application/json"},
json=update.update_request,
)
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
with (
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
get_vespa_http_client() as http_client,
):
for update_batch in batch_generator(updates, batch_size):
future_to_document_id = {
executor.submit(
_update_chunk,
update,
http_client,
): update.document_id
for update in update_batch
}
for future in concurrent.futures.as_completed(future_to_document_id):
res = future.result()
try:
res.raise_for_status()
except requests.HTTPError as e:
failure_msg = f"Failed to update document: {future_to_document_id[future]}"
raise requests.HTTPError(failure_msg) from e
def update(self, update_requests: list[UpdateRequest]) -> None:
logger.debug(f"Updating {len(update_requests)} documents in Vespa")
# Handle Vespa character limitations
# Mutating update_requests, but they're not used later anyway
for update_request in update_requests:
update_request.document_ids = [
replace_invalid_doc_id_characters(doc_id)
for doc_id in update_request.document_ids
]
update_start = time.monotonic()
processed_updates_requests: list[_VespaUpdateRequest] = []
all_doc_chunk_ids: dict[str, list[str]] = {}
# Fetch all chunks for each document ahead of time
index_names = [self.index_name]
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
chunk_id_start_time = time.monotonic()
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
future_to_doc_chunk_ids = {
executor.submit(
get_all_vespa_ids_for_document_id,
document_id=document_id,
index_name=index_name,
filters=None,
get_large_chunks=True,
): (document_id, index_name)
for index_name in index_names
for update_request in update_requests
for document_id in update_request.document_ids
}
for future in concurrent.futures.as_completed(future_to_doc_chunk_ids):
document_id, index_name = future_to_doc_chunk_ids[future]
try:
doc_chunk_ids = future.result()
if document_id not in all_doc_chunk_ids:
all_doc_chunk_ids[document_id] = []
all_doc_chunk_ids[document_id].extend(doc_chunk_ids)
except Exception as e:
logger.error(
f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}"
)
logger.debug(
f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs"
)
# Build the _VespaUpdateRequest objects
for update_request in update_requests:
update_dict: dict[str, dict] = {"fields": {}}
if update_request.boost is not None:
update_dict["fields"][BOOST] = {"assign": update_request.boost}
if update_request.document_sets is not None:
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {
document_set: 1 for document_set in update_request.document_sets
}
}
if update_request.access is not None:
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {
acl_entry: 1 for acl_entry in update_request.access.to_acl()
}
}
if update_request.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": update_request.hidden}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update")
continue
for document_id in update_request.document_ids:
for doc_chunk_id in all_doc_chunk_ids[document_id]:
processed_updates_requests.append(
_VespaUpdateRequest(
document_id=document_id,
url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}",
update_request=update_dict,
)
)
self._apply_updates_batched(processed_updates_requests)
logger.debug(
"Finished updating Vespa documents in %.2f seconds",
time.monotonic() - update_start,
)
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
"""Note: if the document id does not exist, the update will be a no-op and the
function will complete with no errors or exceptions.
Handle other exceptions if you wish to implement retry behavior
"""
total_chunks_updated = 0
# Handle Vespa character limitations
# The original doc_id is not mutated; a normalized copy is used below
normalized_doc_id = replace_invalid_doc_id_characters(doc_id)
# Build the _VespaUpdateRequest objects
update_dict: dict[str, dict] = {"fields": {}}
if fields.boost is not None:
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields.document_sets is not None:
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.access is not None:
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
if fields.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update")
return 0
index_names = [self.index_name]
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
with get_vespa_http_client() as http_client:
for index_name in index_names:
params = httpx.QueryParams(
{
"selection": f"{index_name}.document_id=='{normalized_doc_id}'",
"cluster": DOCUMENT_INDEX_NAME,
}
)
while True:
try:
resp = http_client.put(
f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}",
params=params,
headers={"Content-Type": "application/json"},
json=update_dict,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(
f"Failed to update chunks, details: {e.response.text}"
)
raise
resp_data = resp.json()
if "documentCount" in resp_data:
chunks_updated = resp_data["documentCount"]
total_chunks_updated += chunks_updated
# Check for continuation token to handle pagination
if "continuation" not in resp_data:
break # Exit loop if no continuation token
if not resp_data["continuation"]:
break # Exit loop if continuation token is empty
params = params.set("continuation", resp_data["continuation"])
logger.debug(
f"VespaIndex.update_single: "
f"index={index_name} "
f"doc={normalized_doc_id} "
f"chunks_updated={total_chunks_updated}"
)
return total_chunks_updated
def delete(self, doc_ids: list[str]) -> None:
logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
with get_vespa_http_client() as http_client:
index_names = [self.index_name]
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
for index_name in index_names:
delete_vespa_docs(
document_ids=doc_ids, index_name=index_name, http_client=http_client
)
return
def delete_single(self, doc_id: str) -> int:
"""Possibly faster overall than the delete method due to using a single
delete call with a selection query."""
total_chunks_deleted = 0
# Vespa deletion is poorly documented ... luckily we found this
# https://docs.vespa.ai/en/operations/batch-delete.html#example
doc_id = replace_invalid_doc_id_characters(doc_id)
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
# indexing / updates / deletes since we have to make a large volume of requests.
index_names = [self.index_name]
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
with get_vespa_http_client() as http_client:
for index_name in index_names:
params = httpx.QueryParams(
{
"selection": f"{index_name}.document_id=='{doc_id}'",
"cluster": DOCUMENT_INDEX_NAME,
}
)
while True:
try:
resp = http_client.delete(
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}",
params=params,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(
f"Failed to delete chunk, details: {e.response.text}"
)
raise
resp_data = resp.json()
if "documentCount" in resp_data:
chunks_deleted = resp_data["documentCount"]
total_chunks_deleted += chunks_deleted
# Check for continuation token to handle pagination
if "continuation" not in resp_data:
break # Exit loop if no continuation token
if not resp_data["continuation"]:
break # Exit loop if continuation token is empty
params = params.set("continuation", resp_data["continuation"])
logger.debug(
f"VespaIndex.delete_single: "
f"index={index_name} "
f"doc={doc_id} "
f"chunks_deleted={total_chunks_deleted}"
)
return total_chunks_deleted
def id_based_retrieval(
self,
chunk_requests: list[VespaChunkRequest],
filters: IndexFilters,
batch_retrieval: bool = False,
get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
if batch_retrieval:
return batch_search_api_retrieval(
index_name=self.index_name,
chunk_requests=chunk_requests,
filters=filters,
get_large_chunks=get_large_chunks,
)
return parallel_visit_api_retrieval(
index_name=self.index_name,
chunk_requests=chunk_requests,
filters=filters,
get_large_chunks=get_large_chunks,
)
def hybrid_retrieval(
self,
query: str,
query_embedding: Embedding,
final_keywords: list[str] | None,
filters: IndexFilters,
hybrid_alpha: float,
time_decay_multiplier: float,
num_to_retrieve: int,
offset: int = 0,
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
) -> list[InferenceChunkUncleaned]:
vespa_where_clauses = build_vespa_filters(filters)
# Needs to be at least as much as the value set in Vespa schema config
target_hits = max(10 * num_to_retrieve, 1000)
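# The YQL below combines two approximate nearest neighbor clauses (content and title
# embeddings) with lexical matching: a weakAnd over the default fields plus a match on
# the content summary field used for highlighting.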
yql = (
YQL_BASE.format(index_name=self.index_name)
+ vespa_where_clauses
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+ f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) "
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
)
final_query = " ".join(final_keywords) if final_keywords else query
logger.debug(f"Query YQL: {yql}")
params: dict[str, str | int | float] = {
"yql": yql,
"query": final_query,
"input.query(query_embedding)": str(query_embedding),
"input.query(decay_factor)": str(DOC_TIME_DECAY * time_decay_multiplier),
"input.query(alpha)": hybrid_alpha,
"input.query(title_content_ratio)": title_content_ratio
if title_content_ratio is not None
else TITLE_CONTENT_RATIO,
"hits": num_to_retrieve,
"offset": offset,
"ranking.profile": f"hybrid_search{len(query_embedding)}",
"timeout": VESPA_TIMEOUT,
}
return query_vespa(params)
def admin_retrieval(
self,
query: str,
filters: IndexFilters,
num_to_retrieve: int = NUM_RETURNED_HITS,
offset: int = 0,
) -> list[InferenceChunkUncleaned]:
vespa_where_clauses = build_vespa_filters(filters, include_hidden=True)
yql = (
YQL_BASE.format(index_name=self.index_name)
+ vespa_where_clauses
+ '({grammar: "weakAnd"}userInput(@query) '
# `({defaultIndex: "content_summary"}userInput(@query))` section is
# needed for highlighting while the N-gram highlighting is broken /
# not working as desired
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
)
params: dict[str, str | int] = {
"yql": yql,
"query": query,
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "admin_search",
"timeout": VESPA_TIMEOUT,
}
return query_vespa(params)
@classmethod
def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
"""
Deletes all entries in the specified index with the given tenant_id.
Parameters:
tenant_id (str): The tenant ID whose documents are to be deleted.
index_name (str): The name of the index from which to delete documents.
"""
logger.info(
f"Deleting entries with tenant_id: {tenant_id} from index: {index_name}"
)
# Step 1: Retrieve all document IDs with the given tenant_id
document_ids = cls._get_all_document_ids_by_tenant_id(tenant_id, index_name)
if not document_ids:
logger.info(
f"No documents found with tenant_id: {tenant_id} in index: {index_name}"
)
return
# Step 2: Delete documents in batches
delete_requests = [
_VespaDeleteRequest(document_id=doc_id, index_name=index_name)
for doc_id in document_ids
]
cls._apply_deletes_batched(delete_requests)
@classmethod
def _get_all_document_ids_by_tenant_id(
cls, tenant_id: str, index_name: str
) -> list[str]:
"""
Retrieves all document IDs with the specified tenant_id, handling pagination.
Parameters:
tenant_id (str): The tenant ID to search for.
index_name (str): The name of the index to search in.
Returns:
List[str]: A list of document IDs matching the tenant_id.
"""
offset = 0
limit = 1000 # Vespa's maximum hits per query
document_ids = []
logger.debug(
f"Starting document ID retrieval for tenant_id: {tenant_id} in index: {index_name}"
)
while True:
# Construct the query to fetch document IDs
query_params = {
"yql": f'select id from sources * where tenant_id contains "{tenant_id}";',
"offset": str(offset),
"hits": str(limit),
"timeout": "10s",
"format": "json",
"summary": "id",
}
url = f"{VESPA_APPLICATION_ENDPOINT}/search/"
logger.debug(
f"Querying for document IDs with tenant_id: {tenant_id}, offset: {offset}"
)
with get_vespa_http_client(no_timeout=True) as http_client:
response = http_client.get(url, params=query_params)
response.raise_for_status()
search_result = response.json()
hits = search_result.get("root", {}).get("children", [])
if not hits:
break
for hit in hits:
doc_id = hit.get("id")
if doc_id:
document_ids.append(doc_id)
offset += limit # Move to the next page
logger.debug(
f"Retrieved {len(document_ids)} document IDs for tenant_id: {tenant_id}"
)
return document_ids
@classmethod
def _apply_deletes_batched(
cls,
delete_requests: List["_VespaDeleteRequest"],
batch_size: int = BATCH_SIZE,
) -> None:
"""
Deletes documents in batches using multiple threads.
Parameters:
delete_requests (List[_VespaDeleteRequest]): The list of delete requests.
batch_size (int): The number of documents to delete in each batch.
"""
def _delete_document(
delete_request: "_VespaDeleteRequest", http_client: httpx.Client
) -> None:
logger.debug(f"Deleting document with ID {delete_request.document_id}")
response = http_client.delete(
delete_request.url,
headers={"Content-Type": "application/json"},
)
response.raise_for_status()
logger.debug(f"Starting batch deletion for {len(delete_requests)} documents")
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
with get_vespa_http_client(no_timeout=True) as http_client:
for batch_start in range(0, len(delete_requests), batch_size):
batch = delete_requests[batch_start : batch_start + batch_size]
future_to_document_id = {
executor.submit(
_delete_document,
delete_request,
http_client,
): delete_request.document_id
for delete_request in batch
}
for future in concurrent.futures.as_completed(
future_to_document_id
):
doc_id = future_to_document_id[future]
try:
future.result()
logger.debug(f"Successfully deleted document: {doc_id}")
except httpx.HTTPError as e:
logger.error(f"Failed to delete document {doc_id}: {e}")
# Optionally, implement retry logic or error handling here
logger.info("Batch deletion completed")
class _VespaDeleteRequest:
def __init__(self, document_id: str, index_name: str) -> None:
self.document_id = document_id
# Encode the document ID to ensure it's safe for use in the URL
encoded_doc_id = urllib.parse.quote_plus(self.document_id)
self.url = (
f"{VESPA_APPLICATION_ENDPOINT}/document/v1/"
f"{index_name}/{index_name}/docid/{encoded_doc_id}"
)
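# Illustrative example: a document id of "https://example.com/page" is quoted to
# "https%3A%2F%2Fexample.com%2Fpage" before being placed in the docid URL.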

View File

@ -0,0 +1,250 @@
import concurrent.futures
import json
from datetime import datetime
from datetime import timezone
from http import HTTPStatus
import httpx
from retry import retry
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from onyx.document_index.document_index_utils import get_uuid_from_chunk
from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
)
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import BLURB
from onyx.document_index.vespa_constants import BOOST
from onyx.document_index.vespa_constants import CHUNK_ID
from onyx.document_index.vespa_constants import CONTENT
from onyx.document_index.vespa_constants import CONTENT_SUMMARY
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import EMBEDDINGS
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import METADATA
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import METADATA_SUFFIX
from onyx.document_index.vespa_constants import NUM_THREADS
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
from onyx.document_index.vespa_constants import SECONDARY_OWNERS
from onyx.document_index.vespa_constants import SECTION_CONTINUATION
from onyx.document_index.vespa_constants import SEMANTIC_IDENTIFIER
from onyx.document_index.vespa_constants import SKIP_TITLE_EMBEDDING
from onyx.document_index.vespa_constants import SOURCE_LINKS
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.utils.logger import setup_logger
logger = setup_logger()
@retry(tries=3, delay=1, backoff=2)
def _does_document_exist(
doc_chunk_id: str,
index_name: str,
http_client: httpx.Client,
) -> bool:
"""Returns whether the document already exists and the users/group whitelists
Specifically in this case, document refers to a vespa document which is equivalent to a Onyx
chunk. This checks for whether the chunk exists already in the index"""
doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
doc_fetch_response = http_client.get(doc_url)
if doc_fetch_response.status_code == 404:
return False
if doc_fetch_response.status_code != 200:
logger.debug(f"Failed to check for document with URL {doc_url}")
raise RuntimeError(
f"Unexpected fetch document by ID value from Vespa "
f"with error {doc_fetch_response.status_code}"
f"Index name: {index_name}"
f"Doc chunk id: {doc_chunk_id}"
)
return True
def _vespa_get_updated_at_attribute(t: datetime | None) -> int | None:
if not t:
return None
if t.tzinfo != timezone.utc:
raise ValueError("Connectors must provide document update time in UTC")
return int(t.timestamp())
def get_existing_documents_from_chunks(
chunks: list[DocMetadataAwareIndexChunk],
index_name: str,
http_client: httpx.Client,
executor: concurrent.futures.ThreadPoolExecutor | None = None,
) -> set[str]:
external_executor = True
if not executor:
external_executor = False
executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
document_ids: set[str] = set()
try:
chunk_existence_future = {
executor.submit(
_does_document_exist,
str(get_uuid_from_chunk(chunk)),
index_name,
http_client,
): chunk
for chunk in chunks
}
for future in concurrent.futures.as_completed(chunk_existence_future):
chunk = chunk_existence_future[future]
chunk_already_existed = future.result()
if chunk_already_existed:
document_ids.add(chunk.source_document.id)
finally:
if not external_executor:
executor.shutdown(wait=True)
return document_ids
@retry(tries=5, delay=1, backoff=2)
def _index_vespa_chunk(
chunk: DocMetadataAwareIndexChunk,
index_name: str,
http_client: httpx.Client,
multitenant: bool,
) -> None:
json_header = {
"Content-Type": "application/json",
}
document = chunk.source_document
# No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
vespa_chunk_id = str(get_uuid_from_chunk(chunk))
embeddings = chunk.embeddings
embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
if embeddings.mini_chunk_embeddings:
for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings):
embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed
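# Resulting shape, e.g.: {"full_chunk": [...], "mini_chunk_0": [...], "mini_chunk_1": [...]}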
title = document.get_title_for_document_index()
vespa_document_fields = {
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
BLURB: remove_invalid_unicode_chars(chunk.blurb),
TITLE: remove_invalid_unicode_chars(title) if title else None,
SKIP_TITLE_EMBEDDING: not title,
# For the BM25 index, the keyword suffix is used; the vector is already generated from the more
# natural-language representation of the metadata section
CONTENT: remove_invalid_unicode_chars(
f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
),
# This duplication of `content` is needed for keyword highlighting
# Note that it's not exactly the same as the actual content
# which contains the title prefix and metadata suffix
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
SECTION_CONTINUATION: chunk.section_continuation,
LARGE_CHUNK_REFERENCE_IDS: chunk.large_chunk_reference_ids,
METADATA: json.dumps(document.metadata),
# Save as a list for efficient extraction as an Attribute
METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
METADATA_SUFFIX: chunk.metadata_suffix_keyword,
EMBEDDINGS: embeddings_name_vector_map,
TITLE_EMBEDDING: chunk.title_embedding,
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),
# the only `set` vespa has is `weightedset`, so we have to give each
# element an arbitrary weight
# rkuo: acl, docset and boost metadata are also updated through the metadata sync queue
# which only calls VespaIndex.update
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
BOOST: chunk.boost,
}
if multitenant:
if chunk.tenant_id:
vespa_document_fields[TENANT_ID] = chunk.tenant_id
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}"
logger.debug(f'Indexing to URL "{vespa_url}"')
res = http_client.post(
vespa_url, headers=json_header, json={"fields": vespa_document_fields}
)
try:
res.raise_for_status()
except Exception as e:
logger.exception(
f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
)
if isinstance(e, httpx.HTTPStatusError):
if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
logger.error(
"NOTE: HTTP Status 507 Insufficient Storage usually means "
"you need to allocate more memory or disk space to the "
"Vespa/index container."
)
raise e
def batch_index_vespa_chunks(
chunks: list[DocMetadataAwareIndexChunk],
index_name: str,
http_client: httpx.Client,
multitenant: bool,
executor: concurrent.futures.ThreadPoolExecutor | None = None,
) -> None:
external_executor = True
if not executor:
external_executor = False
executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
try:
chunk_index_future = {
executor.submit(
_index_vespa_chunk, chunk, index_name, http_client, multitenant
): chunk
for chunk in chunks
}
for future in concurrent.futures.as_completed(chunk_index_future):
# Will raise exception if any indexing raised an exception
future.result()
finally:
if not external_executor:
executor.shutdown(wait=True)
def clean_chunk_id_copy(
chunk: DocMetadataAwareIndexChunk,
) -> DocMetadataAwareIndexChunk:
clean_chunk = chunk.copy(
update={
"source_document": chunk.source_document.copy(
update={
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
}
)
}
)
return clean_chunk

View File

@ -0,0 +1,71 @@
import re
from typing import cast
import httpx
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
# See here for reference: https://docs.vespa.ai/en/documents.html
# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
# Define allowed ASCII characters
ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80
ALLOWED_ASCII_CHARS[0x9] = True # tab
ALLOWED_ASCII_CHARS[0xA] = True # newline
ALLOWED_ASCII_CHARS[0xD] = True # carriage return
for i in range(0x20, 0x7F):
ALLOWED_ASCII_CHARS[i] = True # printable ASCII chars
ALLOWED_ASCII_CHARS[0x7F] = True # del - discouraged, but allowed
def is_text_character(codepoint: int) -> bool:
"""Returns whether the given codepoint is a valid text character."""
if codepoint < 0x80:
return ALLOWED_ASCII_CHARS[codepoint]
if codepoint < 0xD800:
return True
if codepoint <= 0xDFFF:
return False
if codepoint < 0xFDD0:
return True
if codepoint <= 0xFDEF:
return False
if codepoint >= 0x10FFFE:
return False
return (codepoint & 0xFFFF) < 0xFFFE
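# e.g. is_text_character(0x41) is True (printable ASCII) while is_text_character(0xFFFE)
# is False (a Unicode non-character)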
def replace_invalid_doc_id_characters(text: str) -> str:
"""Replaces invalid document ID characters in text."""
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
# and users only seem to be running into this error with single quotes
return text.replace("'", "_")
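# e.g. replace_invalid_doc_id_characters("it's a doc") -> "it_s a doc"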
def remove_invalid_unicode_chars(text: str) -> str:
"""Vespa does not take in unicode chars that aren't valid for XML.
This removes them."""
_illegal_xml_chars_RE: re.Pattern = re.compile(
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
)
return _illegal_xml_chars_RE.sub("", text)
def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
"""
Configure and return an HTTP client for communicating with Vespa,
including authentication if needed.
"""
return httpx.Client(
cert=cast(tuple[str, str], (VESPA_CLOUD_CERT_PATH, VESPA_CLOUD_KEY_PATH))
if MANAGED_VESPA
else None,
verify=False if not MANAGED_VESPA else True,
timeout=None if no_timeout else VESPA_REQUEST_TIMEOUT,
http2=True,
)

View File

@ -0,0 +1,100 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from onyx.configs.constants import INDEX_SEPARATOR
from onyx.context.search.models import IndexFilters
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import CHUNK_ID
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.utils.logger import setup_logger
logger = setup_logger()
def build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> str:
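"""Builds the YQL `where` clause prefix for the given filters.
Every emitted clause ends with a trailing " and " so callers can append their own
conditions, e.g. a non-hidden, web-source query renders roughly as:
!(hidden=true) and (source_type contains "web") and
"""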
def _build_or_filters(key: str, vals: list[str] | None) -> str:
if vals is None:
return ""
valid_vals = [val for val in vals if val]
if not key or not valid_vals:
return ""
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
or_clause = " or ".join(eq_elems)
return f"({or_clause}) and "
def _build_time_filter(
cutoff: datetime | None,
# Slightly over 3 Months, approximately 1 fiscal quarter
untimed_doc_cutoff: timedelta = timedelta(days=92),
) -> str:
if not cutoff:
return ""
# Documents without an updated-at time are filtered out for queries asking for very
# recent documents (less than ~3 months old by default). For time decay purposes,
# such documents are treated as roughly 3 months old.
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
cutoff_secs = int(cutoff.timestamp())
if include_untimed:
# Documents without updated_at are assigned -1 as their date
return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
if filters.tenant_id:
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
if filters.access_control_list is not None:
filter_str += _build_or_filters(
ACCESS_CONTROL_LIST, filters.access_control_list
)
source_strs = (
[s.value for s in filters.source_type] if filters.source_type else None
)
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
tag_attributes = None
tags = filters.tags
if tags:
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
filter_str += _build_time_filter(filters.time_cutoff)
return filter_str
def build_vespa_id_based_retrieval_yql(
chunk_request: VespaChunkRequest,
) -> str:
id_based_retrieval_yql_section = (
f'({DOCUMENT_ID} contains "{chunk_request.document_id}"'
)
if chunk_request.is_capped:
id_based_retrieval_yql_section += (
f" and {CHUNK_ID} >= {chunk_request.min_chunk_ind or 0}"
)
id_based_retrieval_yql_section += (
f" and {CHUNK_ID} <= {chunk_request.max_chunk_ind}"
)
id_based_retrieval_yql_section += ")"
return id_based_retrieval_yql_section
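# For a capped request this renders, e.g., as:
# (document_id contains "some-doc-id" and chunk_id >= 0 and chunk_id <= 5)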

View File

@ -0,0 +1,104 @@
from onyx.configs.app_configs import VESPA_CLOUD_URL
from onyx.configs.app_configs import VESPA_CONFIG_SERVER_HOST
from onyx.configs.app_configs import VESPA_HOST
from onyx.configs.app_configs import VESPA_PORT
from onyx.configs.app_configs import VESPA_TENANT_PORT
from onyx.configs.constants import SOURCE_TYPE
VESPA_DIM_REPLACEMENT_PAT = "VARIABLE_DIM"
DANSWER_CHUNK_REPLACEMENT_PAT = "DANSWER_CHUNK_NAME"
DOCUMENT_REPLACEMENT_PAT = "DOCUMENT_REPLACEMENT"
SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER"
DATE_REPLACEMENT = "DATE_REPLACEMENT"
SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER"
TENANT_ID_PAT = "TENANT_ID_REPLACEMENT"
TENANT_ID_REPLACEMENT = """field tenant_id type string {
indexing: summary | attribute
rank: filter
attribute: fast-search
}"""
# config server
VESPA_CONFIG_SERVER_URL = (
VESPA_CLOUD_URL or f"http://{VESPA_CONFIG_SERVER_HOST}:{VESPA_TENANT_PORT}"
)
VESPA_APPLICATION_ENDPOINT = f"{VESPA_CONFIG_SERVER_URL}/application/v2"
# main search application
VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}"
# danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd
DOCUMENT_ID_ENDPOINT = (
f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
)
SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
NUM_THREADS = (
32 # since Vespa doesn't allow batching of inserts / updates, we use threads
)
MAX_ID_SEARCH_QUERY_SIZE = 400
# Suspect that adding too many "or" conditions will cause Vespa to timeout and return
# an empty list of hits (with no error status and coverage: 0 and degraded)
MAX_OR_CONDITIONS = 10
# up from 500ms for now, since we've seen quite a few timeouts
# in the long term, we are looking to improve the performance of Vespa
# so that we can bring this back to default
VESPA_TIMEOUT = "3s"
BATCH_SIZE = 128 # Specific to Vespa
TENANT_ID = "tenant_id"
DOCUMENT_ID = "document_id"
CHUNK_ID = "chunk_id"
BLURB = "blurb"
CONTENT = "content"
SOURCE_LINKS = "source_links"
SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SKIP_TITLE_EMBEDDING = "skip_title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
METADATA_SUFFIX = "metadata_suffix"
BOOST = "boost"
DOC_UPDATED_AT = "doc_updated_at" # Indexed as seconds since epoch
PRIMARY_OWNERS = "primary_owners"
SECONDARY_OWNERS = "secondary_owners"
RECENCY_BIAS = "recency_bias"
HIDDEN = "hidden"
# Specific to Vespa, needed for highlighting matching keywords / section
CONTENT_SUMMARY = "content_summary"
YQL_BASE = (
f"select "
f"documentid, "
f"{DOCUMENT_ID}, "
f"{CHUNK_ID}, "
f"{BLURB}, "
f"{CONTENT}, "
f"{SOURCE_TYPE}, "
f"{SOURCE_LINKS}, "
f"{SEMANTIC_IDENTIFIER}, "
f"{TITLE}, "
f"{SECTION_CONTINUATION}, "
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "
f"{PRIMARY_OWNERS}, "
f"{SECONDARY_OWNERS}, "
f"{LARGE_CHUNK_REFERENCE_IDS}, "
f"{METADATA}, "
f"{METADATA_SUFFIX}, "
f"{CONTENT_SUMMARY} "
f"from {{index_name}} where "
)
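# NOTE: {index_name} is left as a literal placeholder here and is filled in later via
# YQL_BASE.format(index_name=...) by the query-building methods.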