Fully remove visit API (#3621)

* v1

* update indexing logic

* update updates

* nit

* clean up args

* update for clarity + best practices

* nit + logs

* fix

* minor clean up

* remove logs

* quick nit
Author: pablonyx (committed by GitHub)
Date: 2025-01-08 13:49:01 -08:00
Parent: eac73a1bf1
Commit: d7bc32c0ec
15 changed files with 397 additions and 254 deletions

View File

@@ -28,13 +28,35 @@ class RetryDocumentIndex:
         wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
         stop=stop_after_delay(STOP_AFTER),
     )
-    def delete_single(self, doc_id: str) -> int:
-        return self.index.delete_single(doc_id)
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
+        return self.index.delete_single(
+            doc_id,
+            tenant_id=tenant_id,
+            chunk_count=chunk_count,
+        )

     @retry(
         retry=retry_if_exception_type(httpx.ReadTimeout),
         wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
         stop=stop_after_delay(STOP_AFTER),
     )
-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
-        return self.index.update_single(doc_id, fields)
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+        fields: VespaDocumentFields,
+    ) -> int:
+        return self.index.update_single(
+            doc_id,
+            tenant_id=tenant_id,
+            chunk_count=chunk_count,
+            fields=fields,
+        )
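
Editor's note: the @retry decoration above comes from the tenacity library already used by this class. A minimal, self-contained sketch of the wrapper pattern with the new keyword-only signature (the constant values and the wrapped `index` object are placeholders, not the project's real ones):

import httpx
from tenacity import retry, retry_if_exception_type, stop_after_delay, wait_random_exponential

MAX_WAIT = 30        # placeholder; the real constants live elsewhere in the module
STOP_AFTER = 5 * 60  # placeholder


class RetryDocumentIndexSketch:
    """Thin wrapper that retries Vespa calls when httpx raises a read timeout."""

    def __init__(self, index) -> None:
        self.index = index

    @retry(
        retry=retry_if_exception_type(httpx.ReadTimeout),
        wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
        stop=stop_after_delay(STOP_AFTER),
    )
    def delete_single(
        self, doc_id: str, *, tenant_id: str | None, chunk_count: int | None
    ) -> int:
        # Delegates to the underlying index; tenacity re-invokes this method on timeouts.
        return self.index.delete_single(
            doc_id, tenant_id=tenant_id, chunk_count=chunk_count
        )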

View File

@@ -12,6 +12,7 @@ from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocument
 from onyx.configs.constants import OnyxCeleryTask
 from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
 from onyx.db.document import delete_documents_complete__no_commit
+from onyx.db.document import fetch_chunk_count_for_document
 from onyx.db.document import get_document
 from onyx.db.document import get_document_connector_count
 from onyx.db.document import mark_document_as_modified
@@ -80,7 +81,13 @@ def document_by_cc_pair_cleanup_task(
             # delete it from vespa and the db
             action = "delete"

-            chunks_affected = retry_index.delete_single(document_id)
+            chunk_count = fetch_chunk_count_for_document(document_id, db_session)
+            chunks_affected = retry_index.delete_single(
+                document_id,
+                tenant_id=tenant_id,
+                chunk_count=chunk_count,
+            )

             delete_documents_complete__no_commit(
                 db_session=db_session,
                 document_ids=[document_id],
@@ -110,7 +117,12 @@ def document_by_cc_pair_cleanup_task(
             )

             # update Vespa. OK if doc doesn't exist. Raises exception otherwise.
-            chunks_affected = retry_index.update_single(document_id, fields=fields)
+            chunks_affected = retry_index.update_single(
+                document_id,
+                tenant_id=tenant_id,
+                chunk_count=doc.chunk_count,
+                fields=fields,
+            )

             # there are still other cc_pair references to the doc, so just resync to Vespa
             delete_document_by_connector_credential_pair__no_commit(

View File

@@ -992,7 +992,12 @@ def vespa_metadata_sync_task(
         )

         # update Vespa. OK if doc doesn't exist. Raises exception otherwise.
-        chunks_affected = retry_index.update_single(document_id, fields)
+        chunks_affected = retry_index.update_single(
+            document_id,
+            tenant_id=tenant_id,
+            chunk_count=doc.chunk_count,
+            fields=fields,
+        )

         # update db last. Worst case = we crash right before this and
         # the sync might repeat again later

View File

@@ -685,20 +685,27 @@ def get_document_sources(
 def fetch_chunk_counts_for_documents(
     document_ids: list[str],
     db_session: Session,
-) -> list[tuple[str, int | None]]:
+) -> list[tuple[str, int]]:
     """
     Return a list of (document_id, chunk_count) tuples.
-    Note: chunk_count might be None if not set in DB,
-    so we declare it as Optional[int].
+    If a document_id is not found in the database, it will be returned with a chunk_count of 0.
     """
     stmt = select(DbDocument.id, DbDocument.chunk_count).where(
         DbDocument.id.in_(document_ids)
     )

-    # results is a list of 'Row' objects, each containing two columns
     results = db_session.execute(stmt).all()

-    # If DbDocument.id is guaranteed to be a string, you can just do row.id;
-    # otherwise cast to str if you need to be sure it's a string:
-    return [(str(row[0]), row[1]) for row in results]
-    # or row.id, row.chunk_count if they are named attributes in your ORM model
+    # Create a dictionary of document_id to chunk_count
+    chunk_counts = {str(row.id): row.chunk_count or 0 for row in results}
+
+    # Return a list of tuples, using 0 for documents not found in the database
+    return [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in document_ids]


+def fetch_chunk_count_for_document(
+    document_id: str,
+    db_session: Session,
+) -> int | None:
+    stmt = select(DbDocument.chunk_count).where(DbDocument.id == document_id)
+    return db_session.execute(stmt).scalar_one_or_none()
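
Editor's note: the important behavioral change above is that missing documents now come back with a count of 0 instead of being dropped or reported as None. A dependency-free illustration of that lookup-then-fill pattern (the data here is made up):

# `rows` stands in for the SQLAlchemy result; each entry is (document_id, chunk_count or None).
rows = [("doc-a", 12), ("doc-b", None)]

requested_ids = ["doc-a", "doc-b", "doc-missing"]

# Normalize NULL counts to 0 and index by document id.
chunk_counts = {doc_id: count or 0 for doc_id, count in rows}

# Every requested id gets an entry, defaulting to 0 when the document is unknown.
result = [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in requested_ids]

assert result == [("doc-a", 12), ("doc-b", 0), ("doc-missing", 0)]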

View File

@@ -8,7 +8,7 @@ from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
 from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
 from onyx.indexing.models import DocMetadataAwareIndexChunk
+from shared_configs.configs import MULTI_TENANT

 DEFAULT_BATCH_SIZE = 30
 DEFAULT_INDEX_NAME = "danswer_chunk"
@@ -37,7 +37,10 @@ def translate_boost_count_to_multiplier(boost: int) -> float:
     return 2 / (1 + math.exp(-1 * boost / 3))


-def assemble_document_chunk_info(
+# Assembles a list of Vespa chunk IDs for a document
+# given the required context. This can be used to directly query
+# Vespa's Document API.
+def get_document_chunk_ids(
     enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
     tenant_id: str | None,
     large_chunks_enabled: bool,
@@ -110,10 +113,11 @@ def get_uuid_from_chunk_info(
         "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
     )
     unique_identifier_string = "_".join([doc_str, chunk_index])
-    if tenant_id:
+    if tenant_id and MULTI_TENANT:
         unique_identifier_string += "_" + tenant_id

-    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+    uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+    return uuid_value


 def get_uuid_from_chunk_info_old(
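
Editor's note: the property this whole PR relies on is that chunk IDs are deterministic UUIDv5 values, so they can be recomputed from (document id, chunk index, tenant) without ever visiting Vespa. A self-contained sketch of that derivation (the exact string layout mirrors the diff above but should be treated as illustrative, not a stable contract):

import uuid


def sketch_chunk_uuid(
    document_id: str,
    chunk_id: int,
    tenant_id: str | None = None,
    large_chunk_id: int | None = None,
) -> uuid.UUID:
    # Deterministic: the same inputs always yield the same UUID, so chunk IDs can be
    # re-derived later for updates/deletes instead of being discovered via the visit API.
    chunk_index = (
        "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
    )
    unique_identifier_string = "_".join([document_id, chunk_index])
    if tenant_id:
        unique_identifier_string += "_" + tenant_id
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)


# Example: enumerate the chunk UUIDs for a document known to have 4 chunks.
doc_chunk_ids = [sketch_chunk_uuid("doc-123", i, tenant_id="tenant-a") for i in range(4)]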

View File

@@ -109,7 +109,7 @@ class UpdateRequest:
     Does not update any of the None fields
     """

-    document_ids: list[str]
+    minimal_document_indexing_info: list[MinimalDocumentIndexingInfo]
     # all other fields except these 4 will always be left alone by the update request
     access: DocumentAccess | None = None
     document_sets: set[str] | None = None
@@ -136,7 +136,7 @@ class Verifiable(abc.ABC):
         index_name: str,
         secondary_index_name: str | None,
         *args: Any,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
         self.index_name = index_name
@@ -218,7 +218,13 @@ class Deletable(abc.ABC):
     """

     @abc.abstractmethod
-    def delete_single(self, doc_id: str) -> int:
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
         """
         Given a single document id, hard delete it from the document index
@@ -239,7 +245,14 @@ class Updatable(abc.ABC):
     """

     @abc.abstractmethod
-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+        fields: VespaDocumentFields,
+    ) -> int:
         """
         Updates all chunks for a document with the specified fields.
         None values mean that the field does not need an update.
@@ -257,7 +270,9 @@ class Updatable(abc.ABC):
         raise NotImplementedError

     @abc.abstractmethod
-    def update(self, update_requests: list[UpdateRequest]) -> None:
+    def update(
+        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
+    ) -> None:
         """
         Updates some set of chunks. The document and fields to update are specified in the update
         requests. Each update request in the list applies its changes to a list of document ids.
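
Editor's note: the bare `*` in these signatures makes tenant_id and chunk_count keyword-only, so call sites stay unambiguous as arguments are added. A tiny runnable illustration of that convention (the function body is a stub, not the real index method):

def delete_single(doc_id: str, *, tenant_id: str | None, chunk_count: int | None) -> int:
    # Stub standing in for the real index method; returns a fake chunk count.
    return chunk_count or 0


# Keyword-only arguments must be named at the call site:
assert delete_single("doc-1", tenant_id=None, chunk_count=12) == 12
# delete_single("doc-1", None, 12) would raise TypeError: positional use is rejected.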

View File

@@ -13,11 +13,11 @@ from datetime import timedelta
 from typing import BinaryIO
 from typing import cast
 from typing import List
+from uuid import UUID

 import httpx  # type: ignore
 import requests  # type: ignore

-from onyx.configs.app_configs import DOCUMENT_INDEX_NAME
 from onyx.configs.chat_configs import DOC_TIME_DECAY
 from onyx.configs.chat_configs import NUM_RETURNED_HITS
 from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -25,7 +25,8 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
 from onyx.configs.constants import KV_REINDEX_KEY
 from onyx.context.search.models import IndexFilters
 from onyx.context.search.models import InferenceChunkUncleaned
-from onyx.document_index.document_index_utils import assemble_document_chunk_info
+from onyx.db.engine import get_session_with_tenant
+from onyx.document_index.document_index_utils import get_document_chunk_ids
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentInsertionRecord
 from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
@@ -35,9 +36,6 @@ from onyx.document_index.interfaces import UpdateRequest
 from onyx.document_index.interfaces import VespaChunkRequest
 from onyx.document_index.interfaces import VespaDocumentFields
 from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
-from onyx.document_index.vespa.chunk_retrieval import (
-    get_all_vespa_ids_for_document_id,
-)
 from onyx.document_index.vespa.chunk_retrieval import (
     parallel_visit_api_retrieval,
 )
@@ -46,6 +44,9 @@ from onyx.document_index.vespa.deletion import delete_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
 from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
+from onyx.document_index.vespa.indexing_utils import (
+    get_multipass_config,
+)
 from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa.shared_utils.utils import (
     replace_invalid_doc_id_characters,
@@ -337,40 +338,25 @@ class VespaIndex(DocumentIndex):
         # documents that have `chunk_count` in the database, but not for
         # `old_version` documents.
-        enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = []
-        for document_id, doc_count in doc_id_to_previous_chunk_cnt.items():
-            last_indexed_chunk = doc_id_to_previous_chunk_cnt.get(document_id, None)
-            # If the document has no `chunk_count` in the database, we know that it
-            # has the old chunk ID system and we must check for the final chunk index
-            is_old_version = False
-            if last_indexed_chunk is None:
-                is_old_version = True
-                minimal_doc_info = MinimalDocumentIndexingInfo(
-                    doc_id=document_id,
-                    chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
-                )
-                last_indexed_chunk = check_for_final_chunk_existence(
-                    minimal_doc_info=minimal_doc_info,
-                    start_index=doc_id_to_new_chunk_cnt[document_id],
-                    index_name=self.index_name,
-                    http_client=http_client,
-                )
-
-            # If the document has previously indexed chunks, we know it previously existed
-            if doc_count or last_indexed_chunk:
-                existing_docs.add(document_id)
-
-            enriched_doc_info = EnrichedDocumentIndexingInfo(
-                doc_id=document_id,
-                chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
-                chunk_end_index=last_indexed_chunk,
-                old_version=is_old_version,
-            )
-            enriched_doc_infos.append(enriched_doc_info)
+        enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [
+            VespaIndex.enrich_basic_chunk_info(
+                index_name=self.index_name,
+                http_client=http_client,
+                document_id=doc_id,
+                previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id, 0),
+                new_chunk_count=doc_id_to_new_chunk_cnt.get(doc_id, 0),
+            )
+            for doc_id in doc_id_to_new_chunk_cnt.keys()
+        ]
+
+        for cleaned_doc_info in enriched_doc_infos:
+            # If the document has previously indexed chunks, we know it previously existed
+            if cleaned_doc_info.chunk_end_index:
+                existing_docs.add(cleaned_doc_info.doc_id)

         # Now, for each doc, we know exactly where to start and end our deletion
         # So let's generate the chunk IDs for each chunk to delete
-        chunks_to_delete = assemble_document_chunk_info(
+        chunks_to_delete = get_document_chunk_ids(
             enriched_document_info_list=enriched_doc_infos,
             tenant_id=tenant_id,
             large_chunks_enabled=large_chunks_enabled,
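
Editor's note: the idea replacing the visit API here is that, given a document's previous chunk count and its new chunk count, the stale chunk range is known up front and its IDs can be enumerated deterministically. A hedged, standalone sketch of that bookkeeping (the names and ID format are illustrative):

import uuid
from dataclasses import dataclass


@dataclass
class DocChunkRange:
    doc_id: str
    chunk_start_index: int  # first chunk index of the *new* version
    chunk_end_index: int    # chunk count of the *previous* version


def stale_chunk_ids(info: DocChunkRange, tenant_id: str | None = None) -> list[uuid.UUID]:
    # Chunks [chunk_start_index, chunk_end_index) existed before but are not being
    # re-indexed, so they must be deleted explicitly.
    ids = []
    for chunk_index in range(info.chunk_start_index, info.chunk_end_index):
        key = f"{info.doc_id}_{chunk_index}" + (f"_{tenant_id}" if tenant_id else "")
        ids.append(uuid.uuid5(uuid.NAMESPACE_X500, key))
    return ids


# A document that shrank from 10 chunks to 6: chunks 6..9 are stale.
print(len(stale_chunk_ids(DocChunkRange("doc-1", chunk_start_index=6, chunk_end_index=10))))  # 4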
@@ -447,21 +433,21 @@ class VespaIndex(DocumentIndex):
                 failure_msg = f"Failed to update document: {future_to_document_id[future]}"
                 raise requests.HTTPError(failure_msg) from e

-    def update(self, update_requests: list[UpdateRequest]) -> None:
+    def update(
+        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
+    ) -> None:
         logger.debug(f"Updating {len(update_requests)} documents in Vespa")

         # Handle Vespa character limitations
         # Mutating update_requests but it's not used later anyway
         for update_request in update_requests:
-            update_request.document_ids = [
-                replace_invalid_doc_id_characters(doc_id)
-                for doc_id in update_request.document_ids
-            ]
+            for doc_info in update_request.minimal_document_indexing_info:
+                doc_info.doc_id = replace_invalid_doc_id_characters(doc_info.doc_id)

         update_start = time.monotonic()

         processed_updates_requests: list[_VespaUpdateRequest] = []
-        all_doc_chunk_ids: dict[str, list[str]] = {}
+        all_doc_chunk_ids: dict[str, list[UUID]] = {}

         # Fetch all chunks for each document ahead of time
         index_names = [self.index_name]
@@ -469,30 +455,24 @@ class VespaIndex(DocumentIndex):
             index_names.append(self.secondary_index_name)

         chunk_id_start_time = time.monotonic()
-        with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
-            future_to_doc_chunk_ids = {
-                executor.submit(
-                    get_all_vespa_ids_for_document_id,
-                    document_id=document_id,
-                    index_name=index_name,
-                    filters=None,
-                    get_large_chunks=True,
-                ): (document_id, index_name)
-                for index_name in index_names
-                for update_request in update_requests
-                for document_id in update_request.document_ids
-            }
-            for future in concurrent.futures.as_completed(future_to_doc_chunk_ids):
-                document_id, index_name = future_to_doc_chunk_ids[future]
-                try:
-                    doc_chunk_ids = future.result()
-                    if document_id not in all_doc_chunk_ids:
-                        all_doc_chunk_ids[document_id] = []
-                    all_doc_chunk_ids[document_id].extend(doc_chunk_ids)
-                except Exception as e:
-                    logger.error(
-                        f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}"
-                    )
+        with get_vespa_http_client() as http_client:
+            for update_request in update_requests:
+                for doc_info in update_request.minimal_document_indexing_info:
+                    for index_name in index_names:
+                        doc_chunk_info = VespaIndex.enrich_basic_chunk_info(
+                            index_name=index_name,
+                            http_client=http_client,
+                            document_id=doc_info.doc_id,
+                            previous_chunk_count=doc_info.chunk_start_index,
+                            new_chunk_count=0,
+                        )
+                        doc_chunk_ids = get_document_chunk_ids(
+                            enriched_document_info_list=[doc_chunk_info],
+                            tenant_id=tenant_id,
+                            large_chunks_enabled=False,
+                        )
+                        all_doc_chunk_ids[doc_info.doc_id] = doc_chunk_ids

         logger.debug(
             f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs"
         )
@@ -521,11 +501,11 @@ class VespaIndex(DocumentIndex):
                 logger.error("Update request received but nothing to update")
                 continue

-            for document_id in update_request.document_ids:
-                for doc_chunk_id in all_doc_chunk_ids[document_id]:
+            for doc_info in update_request.minimal_document_indexing_info:
+                for doc_chunk_id in all_doc_chunk_ids[doc_info.doc_id]:
                     processed_updates_requests.append(
                         _VespaUpdateRequest(
-                            document_id=document_id,
+                            document_id=doc_info.doc_id,
                             url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}",
                             update_request=update_dict,
                         )
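
Editor's note: once the chunk UUIDs for each document are known, the bulk update path simply fans out one ID-addressed request per chunk instead of one selection/visit query per document. A small, self-contained sketch of that fan-out (the endpoint string is a placeholder, not Onyx's constant):

import uuid
from dataclasses import dataclass


@dataclass
class ChunkUpdateRequest:
    document_id: str
    url: str


# Placeholder for Vespa's ID-based document endpoint.
DOC_ENDPOINT = "http://localhost:8080/document/v1/default/danswer_chunk/docid"


def build_update_requests(all_doc_chunk_ids: dict[str, list[uuid.UUID]]) -> list[ChunkUpdateRequest]:
    # One partial-update request per chunk, addressed directly by its deterministic UUID.
    return [
        ChunkUpdateRequest(document_id=doc_id, url=f"{DOC_ENDPOINT}/{chunk_id}")
        for doc_id, chunk_ids in all_doc_chunk_ids.items()
        for chunk_id in chunk_ids
    ]


chunk_map = {"doc-1": [uuid.uuid5(uuid.NAMESPACE_X500, f"doc-1_{i}") for i in range(3)]}
print(len(build_update_requests(chunk_map)))  # 3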
@@ -537,36 +517,70 @@ class VespaIndex(DocumentIndex):
             time.monotonic() - update_start,
         )

-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
+    def update_single_chunk(
+        self,
+        doc_chunk_id: UUID,
+        index_name: str,
+        fields: VespaDocumentFields,
+        doc_id: str,
+    ) -> None:
+        """
+        Update a single "chunk" (document) in Vespa using its chunk ID.
+        """
+        update_dict: dict[str, dict] = {"fields": {}}
+
+        if fields.boost is not None:
+            update_dict["fields"][BOOST] = {"assign": fields.boost}
+
+        if fields.document_sets is not None:
+            # WeightedSet<string> needs a map { item: weight, ... }
+            update_dict["fields"][DOCUMENT_SETS] = {
+                "assign": {document_set: 1 for document_set in fields.document_sets}
+            }
+
+        if fields.access is not None:
+            # Similar to above
+            update_dict["fields"][ACCESS_CONTROL_LIST] = {
+                "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
+            }
+
+        if fields.hidden is not None:
+            update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
+
+        if not update_dict["fields"]:
+            logger.error("Update request received but nothing to update.")
+            return
+
+        vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
+
+        with get_vespa_http_client(http2=False) as http_client:
+            try:
+                resp = http_client.put(
+                    vespa_url,
+                    headers={"Content-Type": "application/json"},
+                    json=update_dict,
+                )
+                resp.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
+                logger.error(error_message)
+                raise
+
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        chunk_count: int | None,
+        tenant_id: str | None,
+        fields: VespaDocumentFields,
+    ) -> int:
         """Note: if the document id does not exist, the update will be a no-op and the
         function will complete with no errors or exceptions.
         Handle other exceptions if you wish to implement retry behavior
         """
-        total_chunks_updated = 0
-
-        # Handle Vespa character limitations
-        # Mutating update_request but it's not used later anyway
-        normalized_doc_id = replace_invalid_doc_id_characters(doc_id)
-
-        # Build the _VespaUpdateRequest objects
-        update_dict: dict[str, dict] = {"fields": {}}
-        if fields.boost is not None:
-            update_dict["fields"][BOOST] = {"assign": fields.boost}
-        if fields.document_sets is not None:
-            update_dict["fields"][DOCUMENT_SETS] = {
-                "assign": {document_set: 1 for document_set in fields.document_sets}
-            }
-        if fields.access is not None:
-            update_dict["fields"][ACCESS_CONTROL_LIST] = {
-                "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
-            }
-        if fields.hidden is not None:
-            update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
-
-        if not update_dict["fields"]:
-            logger.error("Update request received but nothing to update")
-            return 0
+        doc_chunk_count = 0

         index_names = [self.index_name]
         if self.secondary_index_name:
@@ -574,66 +588,51 @@ class VespaIndex(DocumentIndex):
         with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
-                params = httpx.QueryParams(
-                    {
-                        "selection": f"{index_name}.document_id=='{normalized_doc_id}'",
-                        "cluster": DOCUMENT_INDEX_NAME,
-                    }
-                )
-
-                while True:
-                    try:
-                        vespa_url = (
-                            f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}"
-                        )
-                        logger.debug(f'update_single PUT on URL "{vespa_url}"')
-                        resp = http_client.put(
-                            vespa_url,
-                            params=params,
-                            headers={"Content-Type": "application/json"},
-                            json=update_dict,
-                        )
-                        resp.raise_for_status()
-                    except httpx.HTTPStatusError as e:
-                        logger.error(
-                            f"Failed to update chunks, details: {e.response.text}"
-                        )
-                        raise
-
-                    resp_data = resp.json()
-
-                    if "documentCount" in resp_data:
-                        chunks_updated = resp_data["documentCount"]
-                        total_chunks_updated += chunks_updated
-
-                    # Check for continuation token to handle pagination
-                    if "continuation" not in resp_data:
-                        break  # Exit loop if no continuation token
-
-                    if not resp_data["continuation"]:
-                        break  # Exit loop if continuation token is empty
-
-                    params = params.set("continuation", resp_data["continuation"])
-
-                logger.debug(
-                    f"VespaIndex.update_single: "
-                    f"index={index_name} "
-                    f"doc={normalized_doc_id} "
-                    f"chunks_updated={total_chunks_updated}"
-                )
-
-        return total_chunks_updated
+                with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+                    multipass_config = get_multipass_config(
+                        db_session=db_session,
+                        primary_index=index_name == self.index_name,
+                    )
+                    large_chunks_enabled = multipass_config.enable_large_chunks
+
+                enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
+                    index_name=index_name,
+                    http_client=http_client,
+                    document_id=doc_id,
+                    previous_chunk_count=chunk_count,
+                    new_chunk_count=0,
+                )
+
+                doc_chunk_ids = get_document_chunk_ids(
+                    enriched_document_info_list=[enriched_doc_infos],
+                    tenant_id=tenant_id,
+                    large_chunks_enabled=large_chunks_enabled,
+                )
+
+                logger.info("UPDATING len(doc_chunk_ids)")
+                doc_chunk_count += len(doc_chunk_ids)
+
+                for doc_chunk_id in doc_chunk_ids:
+                    logger.info("UPDATING CHUNK")
+                    self.update_single_chunk(
+                        doc_chunk_id=doc_chunk_id,
+                        index_name=index_name,
+                        fields=fields,
+                        doc_id=doc_id,
+                    )
+
+        logger.info(f"UPDATED A TOTAL OF {doc_chunk_count} CHUNKS for {doc_id}")
+        return doc_chunk_count
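
Editor's note: for readers unfamiliar with Vespa's Document API, a partial update is a PUT against the ID-addressed document endpoint with an "assign" payload; "create=true" makes it an upsert. A hedged, standalone sketch of such a call (the endpoint and field names are placeholders, not Onyx's exact constants):

import httpx

# Placeholder layout: /document/v1/<namespace>/<doctype>/docid/<doc-chunk-uuid>
VESPA_DOC_URL = "http://localhost:8080/document/v1/default/danswer_chunk/docid"


def update_chunk_fields(doc_chunk_id: str, boost: float | None = None, hidden: bool | None = None) -> None:
    update_dict: dict[str, dict] = {"fields": {}}
    if boost is not None:
        update_dict["fields"]["boost"] = {"assign": boost}
    if hidden is not None:
        update_dict["fields"]["hidden"] = {"assign": hidden}
    if not update_dict["fields"]:
        return  # nothing to update

    with httpx.Client(http2=False) as client:
        # Partial update addressed directly by chunk ID; create=true upserts if missing.
        resp = client.put(
            f"{VESPA_DOC_URL}/{doc_chunk_id}?create=true",
            headers={"Content-Type": "application/json"},
            json=update_dict,
        )
        resp.raise_for_status()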
-    def delete_single(self, doc_id: str) -> int:
-        """Possibly faster overall than the delete method due to using a single
-        delete call with a selection query."""
-
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
         total_chunks_deleted = 0

-        # Vespa deletion is poorly documented ... luckily we found this
-        # https://docs.vespa.ai/en/operations/batch-delete.html#example
         doc_id = replace_invalid_doc_id_characters(doc_id)

         # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@@ -642,53 +641,41 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)

-        with get_vespa_http_client(http2=False) as http_client:
+        with get_vespa_http_client(
+            http2=False
+        ) as http_client, concurrent.futures.ThreadPoolExecutor(
+            max_workers=NUM_THREADS
+        ) as executor:
             for index_name in index_names:
-                params = httpx.QueryParams(
-                    {
-                        "selection": f"{index_name}.document_id=='{doc_id}'",
-                        "cluster": DOCUMENT_INDEX_NAME,
-                    }
-                )
-
-                while True:
-                    try:
-                        vespa_url = (
-                            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}"
-                        )
-                        logger.debug(f'delete_single DELETE on URL "{vespa_url}"')
-                        resp = http_client.delete(
-                            vespa_url,
-                            params=params,
-                        )
-                        resp.raise_for_status()
-                    except httpx.HTTPStatusError as e:
-                        logger.error(
-                            f"Failed to delete chunk, details: {e.response.text}"
-                        )
-                        raise
-
-                    resp_data = resp.json()
-
-                    if "documentCount" in resp_data:
-                        chunks_deleted = resp_data["documentCount"]
-                        total_chunks_deleted += chunks_deleted
-
-                    # Check for continuation token to handle pagination
-                    if "continuation" not in resp_data:
-                        break  # Exit loop if no continuation token
-
-                    if not resp_data["continuation"]:
-                        break  # Exit loop if continuation token is empty
-
-                    params = params.set("continuation", resp_data["continuation"])
-
-                logger.debug(
-                    f"VespaIndex.delete_single: "
-                    f"index={index_name} "
-                    f"doc={doc_id} "
-                    f"chunks_deleted={total_chunks_deleted}"
-                )
+                with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+                    multipass_config = get_multipass_config(
+                        db_session=db_session,
+                        primary_index=index_name == self.index_name,
+                    )
+                    large_chunks_enabled = multipass_config.enable_large_chunks
+
+                enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
+                    index_name=index_name,
+                    http_client=http_client,
+                    document_id=doc_id,
+                    previous_chunk_count=chunk_count,
+                    new_chunk_count=0,
+                )
+                chunks_to_delete = get_document_chunk_ids(
+                    enriched_document_info_list=[enriched_doc_infos],
+                    tenant_id=tenant_id,
+                    large_chunks_enabled=large_chunks_enabled,
+                )
+
+                for doc_chunk_ids_batch in batch_generator(
+                    chunks_to_delete, BATCH_SIZE
+                ):
+                    total_chunks_deleted += len(doc_chunk_ids_batch)
+                    delete_vespa_chunks(
+                        doc_chunk_ids=doc_chunk_ids_batch,
+                        index_name=index_name,
+                        http_client=http_client,
+                        executor=executor,
+                    )

         return total_chunks_deleted
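
Editor's note: deletion now enumerates the chunk IDs up front and issues direct, batched deletes rather than a selection query with continuation tokens. A hedged, standalone sketch of that pattern (endpoint, batch size, and thread count are illustrative):

import concurrent.futures
from collections.abc import Iterator

import httpx

VESPA_DOC_URL = "http://localhost:8080/document/v1/default/danswer_chunk/docid"  # placeholder
BATCH_SIZE = 100
NUM_THREADS = 8


def batch_generator(items: list, batch_size: int) -> Iterator[list]:
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]


def delete_chunks(chunk_ids: list[str]) -> None:
    # Each chunk is deleted directly by ID; no selection query or visit pagination needed.
    with httpx.Client(http2=False) as client, concurrent.futures.ThreadPoolExecutor(
        max_workers=NUM_THREADS
    ) as executor:
        for batch in batch_generator(chunk_ids, BATCH_SIZE):
            futures = [
                executor.submit(client.delete, f"{VESPA_DOC_URL}/{cid}") for cid in batch
            ]
            for future in concurrent.futures.as_completed(futures):
                future.result().raise_for_status()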
@@ -787,8 +774,51 @@ class VespaIndex(DocumentIndex):

         return query_vespa(params)

+    # Retrieves chunk information for a document:
+    # - Determines the last indexed chunk
+    # - Identifies if the document uses the old or new chunk ID system
+    # This data is crucial for Vespa document updates without relying on the visit API.
     @classmethod
-    def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
+    def enrich_basic_chunk_info(
+        cls,
+        index_name: str,
+        http_client: httpx.Client,
+        document_id: str,
+        previous_chunk_count: int | None = None,
+        new_chunk_count: int = 0,
+    ) -> EnrichedDocumentIndexingInfo:
+        last_indexed_chunk = previous_chunk_count
+
+        # If the document has no `chunk_count` in the database, we know that it
+        # has the old chunk ID system and we must check for the final chunk index
+        is_old_version = False
+        if last_indexed_chunk is None:
+            is_old_version = True
+            minimal_doc_info = MinimalDocumentIndexingInfo(
+                doc_id=document_id, chunk_start_index=new_chunk_count
+            )
+            last_indexed_chunk = check_for_final_chunk_existence(
+                minimal_doc_info=minimal_doc_info,
+                start_index=new_chunk_count,
+                index_name=index_name,
+                http_client=http_client,
+            )
+
+        enriched_doc_info = EnrichedDocumentIndexingInfo(
+            doc_id=document_id,
+            chunk_start_index=new_chunk_count,
+            chunk_end_index=last_indexed_chunk,
+            old_version=is_old_version,
+        )
+        return enriched_doc_info
+
+    @classmethod
+    def delete_entries_by_tenant_id(
+        cls,
+        *,
+        tenant_id: str,
+        index_name: str,
+    ) -> None:
         """
         Deletes all entries in the specified index with the given tenant_id.
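
Editor's note: for legacy documents with no stored chunk_count, the end of the chunk range is discovered by probing successive deterministic chunk IDs until one is missing. A standalone sketch of that probe (the URL layout and ID derivation are assumptions for illustration):

import uuid

import httpx

VESPA_DOC_URL = "http://localhost:8080/document/v1/default/danswer_chunk/docid"  # placeholder


def find_last_chunk_index(doc_id: str, start_index: int, client: httpx.Client) -> int:
    # Probe sequential chunk IDs until one does not exist; that index is the
    # (exclusive) end of the previously indexed chunk range.
    index = start_index
    while True:
        chunk_uuid = uuid.uuid5(uuid.NAMESPACE_X500, f"{doc_id}_{index}")
        resp = client.get(f"{VESPA_DOC_URL}/{chunk_uuid}")
        if resp.status_code == 404:
            return index
        resp.raise_for_status()
        index += 1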

View File

@@ -7,10 +7,15 @@ from http import HTTPStatus
 import httpx
 from retry import retry
+from sqlalchemy.orm import Session

+from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
 from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
     get_experts_stores_representations,
 )
+from onyx.db.models import SearchSettings
+from onyx.db.search_settings import get_current_search_settings
+from onyx.db.search_settings import get_secondary_search_settings
 from onyx.document_index.document_index_utils import get_uuid_from_chunk
 from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
 from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
@@ -45,6 +50,8 @@ from onyx.document_index.vespa_constants import TENANT_ID
 from onyx.document_index.vespa_constants import TITLE
 from onyx.document_index.vespa_constants import TITLE_EMBEDDING
 from onyx.indexing.models import DocMetadataAwareIndexChunk
+from onyx.indexing.models import EmbeddingProvider
+from onyx.indexing.models import MultipassConfig
 from onyx.utils.logger import setup_logger

 logger = setup_logger()
@@ -129,7 +136,9 @@ def _index_vespa_chunk(
     document = chunk.source_document

     # No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
     vespa_chunk_id = str(get_uuid_from_chunk(chunk))
+
     embeddings = chunk.embeddings
+
     embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
@@ -263,5 +272,49 @@ def check_for_final_chunk_existence(
         )
         if not _does_doc_chunk_exist(doc_chunk_id, index_name, http_client):
             return index

         index += 1
+
+
+def should_use_multipass(search_settings: SearchSettings | None) -> bool:
+    """
+    Determines whether multipass should be used based on the search settings
+    or the default config if settings are unavailable.
+    """
+    if search_settings is not None:
+        return search_settings.multipass_indexing
+    return ENABLE_MULTIPASS_INDEXING
+
+
+def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool:
+    """
+    Given multipass usage and an embedder, decides whether large chunks are allowed
+    based on model/provider constraints.
+    """
+    # Only local models that support a larger context are from Nomic
+    # Cohere does not support larger contexts (they recommend not going above ~512 tokens)
+    return (
+        multipass
+        and search_settings.model_name.startswith("nomic-ai")
+        and search_settings.provider_type != EmbeddingProvider.COHERE
+    )
+
+
+def get_multipass_config(
+    db_session: Session, primary_index: bool = True
+) -> MultipassConfig:
+    """
+    Determines whether to enable multipass and large chunks by examining
+    the current search settings and the embedder configuration.
+    """
+    search_settings = (
+        get_current_search_settings(db_session)
+        if primary_index
+        else get_secondary_search_settings(db_session)
+    )
+    multipass = should_use_multipass(search_settings)
+    if not search_settings:
+        return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)
+
+    enable_large_chunks = can_use_large_chunks(multipass, search_settings)
+    return MultipassConfig(
+        multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
+    )
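
Editor's note: the multipass/large-chunk decision above reduces to two booleans derived from the active search settings. A dependency-free sketch that mirrors that logic with fake types (the settings class, default, and provider string are stand-ins, not the real models):

from dataclasses import dataclass


@dataclass
class FakeSearchSettings:
    multipass_indexing: bool
    model_name: str
    provider_type: str | None


def decide(settings: FakeSearchSettings | None, default_multipass: bool = False) -> tuple[bool, bool]:
    # multipass: from settings when present, otherwise the deployment default
    multipass = settings.multipass_indexing if settings is not None else default_multipass
    # large chunks: only for multipass with a Nomic local model and a non-Cohere provider
    large_chunks = (
        multipass
        and settings is not None
        and settings.model_name.startswith("nomic-ai")
        and settings.provider_type != "cohere"
    )
    return multipass, large_chunks


# A Nomic local model with multipass enabled gets large chunks; Cohere never does.
assert decide(FakeSearchSettings(True, "nomic-ai/nomic-embed-text-v1", None)) == (True, True)
assert decide(FakeSearchSettings(True, "embed-english-v3.0", "cohere")) == (True, False)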

View File

@@ -11,7 +11,6 @@ from sqlalchemy.orm import Session

 from onyx.access.access import get_access_for_documents
 from onyx.access.models import DocumentAccess
-from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
 from onyx.configs.app_configs import INDEXING_EXCEPTION_LIMIT
 from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
 from onyx.configs.constants import DEFAULT_BOOST
@@ -31,12 +30,14 @@ from onyx.db.document import upsert_documents
 from onyx.db.document_set import fetch_document_sets_for_documents
 from onyx.db.index_attempt import create_index_attempt_error
 from onyx.db.models import Document as DBDocument
-from onyx.db.search_settings import get_current_search_settings
 from onyx.db.tag import create_or_add_document_tag
 from onyx.db.tag import create_or_add_document_tag_list
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentMetadata
 from onyx.document_index.interfaces import IndexBatchParams
+from onyx.document_index.vespa.indexing_utils import (
+    get_multipass_config,
+)
 from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import IndexingEmbedder
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
@@ -44,7 +45,6 @@ from onyx.indexing.models import DocAwareChunk
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.utils.logger import setup_logger
 from onyx.utils.timing import log_function_time
-from shared_configs.enums import EmbeddingProvider

 logger = setup_logger()
@@ -479,28 +479,6 @@ def index_doc_batch(
     return result


-def check_enable_large_chunks_and_multipass(
-    embedder: IndexingEmbedder, db_session: Session
-) -> tuple[bool, bool]:
-    search_settings = get_current_search_settings(db_session)
-    multipass = (
-        search_settings.multipass_indexing
-        if search_settings
-        else ENABLE_MULTIPASS_INDEXING
-    )
-    enable_large_chunks = (
-        multipass
-        and
-        # Only local models that supports larger context are from Nomic
-        (embedder.model_name.startswith("nomic-ai"))
-        and
-        # Cohere does not support larger context they recommend not going above 512 tokens
-        embedder.provider_type != EmbeddingProvider.COHERE
-    )
-    return multipass, enable_large_chunks
-
-
 def build_indexing_pipeline(
     *,
     embedder: IndexingEmbedder,
@@ -513,14 +491,12 @@ def build_indexing_pipeline(
     callback: IndexingHeartbeatInterface | None = None,
 ) -> IndexingPipelineProtocol:
     """Builds a pipeline which takes in a list (batch) of docs and indexes them."""
-    multipass, enable_large_chunks = check_enable_large_chunks_and_multipass(
-        embedder, db_session
-    )
+    multipass_config = get_multipass_config(db_session, primary_index=True)

     chunker = chunker or Chunker(
         tokenizer=embedder.embedding_model.tokenizer,
-        enable_multipass=multipass,
-        enable_large_chunks=enable_large_chunks,
+        enable_multipass=multipass_config.multipass_indexing,
+        enable_large_chunks=multipass_config.enable_large_chunks,
         # after every doc, update status in case there are a bunch of really long docs
         callback=callback,
     )

View File

@@ -154,3 +154,8 @@ class IndexingSetting(EmbeddingModelDetail):
             index_name=search_settings.index_name,
             multipass_indexing=search_settings.multipass_indexing,
         )
+
+
+class MultipassConfig(BaseModel):
+    multipass_indexing: bool
+    enable_large_chunks: bool

View File

@@ -5,6 +5,7 @@ import sys
 from sqlalchemy import delete
 from sqlalchemy.orm import Session

+from onyx.db.document import delete_documents_complete__no_commit
 from onyx.db.enums import ConnectorCredentialPairStatus

 # Modify sys.path
@@ -38,7 +39,6 @@ from onyx.db.engine import get_session_context_manager
 from onyx.document_index.factory import get_default_document_index
 from onyx.file_store.file_store import get_default_file_store
 from onyx.document_index.document_index_utils import get_both_index_names
-from onyx.db.document import delete_documents_complete__no_commit

 # pylint: enable=E402
 # flake8: noqa: E402
@@ -71,13 +71,16 @@ def _unsafe_deletion(
         if not documents:
             break

-        document_ids = [document.id for document in documents]
-        for doc_id in document_ids:
-            document_index.delete_single(doc_id)
+        for document in documents:
+            document_index.delete_single(
+                doc_id=document.id,
+                tenant_id=None,
+                chunk_count=document.chunk_count,
+            )

         delete_documents_complete__no_commit(
             db_session=db_session,
-            document_ids=document_ids,
+            document_ids=[document.id for document in documents],
         )

         num_docs_deleted += len(documents)
@@ -216,6 +219,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "connector_id", type=int, help="The ID of the connector to delete"
     )
+
     args = parser.parse_args()

     with get_session_context_manager() as db_session:
         _delete_connector(args.connector_id, db_session)

View File

@@ -15,6 +15,7 @@ from onyx.db.engine import get_session_context_manager  # noqa: E402
 from onyx.db.document import delete_documents_complete__no_commit  # noqa: E402
 from onyx.db.search_settings import get_current_search_settings  # noqa: E402
 from onyx.document_index.vespa.index import VespaIndex  # noqa: E402
+from onyx.db.document import get_document  # noqa: E402

 BATCH_SIZE = 100
@@ -63,6 +64,9 @@ def main() -> None:
     with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:

         def process_doc(doc_id: str) -> str | None:
+            document = get_document(doc_id, db_session)
+            if not document:
+                return None
             # Check if document exists in Vespa first
             try:
                 chunks = vespa_index.id_based_retrieval(
@@ -83,7 +87,9 @@ def main() -> None:
             try:
                 print(f"Deleting document {doc_id} in Vespa")
-                chunks_deleted = vespa_index.delete_single(doc_id)
+                chunks_deleted = vespa_index.delete_single(
+                    doc_id, tenant_id=None, chunk_count=document.chunk_count
+                )
                 if chunks_deleted > 0:
                     print(
                         f"Deleted {chunks_deleted} chunks for document {doc_id}"

View File

@@ -21,8 +21,9 @@ def _verify_document_permissions(
     group_names: list[str] | None = None,
     doc_creating_user: DATestUser | None = None,
 ) -> None:
-    acl_keys = set(retrieved_doc["access_control_list"].keys())
+    acl_keys = set(retrieved_doc.get("access_control_list", {}).keys())
     print(f"ACL keys: {acl_keys}")
+
     if cc_pair.access_type == AccessType.PUBLIC:
         if "PUBLIC" not in acl_keys:
             raise ValueError(
@@ -42,8 +43,9 @@ def _verify_document_permissions(
         found_group_keys = {key for key in acl_keys if key.startswith("group:")}
         if found_group_keys != expected_group_keys:
             raise ValueError(
-                f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. Found: {found_group_keys}, \n"
-                f"Expected: {expected_group_keys}"
+                f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. "
+                f"Expected: {expected_group_keys} Found: {found_group_keys}\n"
+                f"All ACL keys: {acl_keys}"
             )

     if doc_set_names is not None:
@@ -153,9 +155,11 @@ class DocumentManager:
     ) -> None:
         doc_ids = [document.id for document in cc_pair.documents]
+
         retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
+
         retrieved_docs = {
             doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict
         }
         # Left this here for debugging purposes.
         # import json
         # for doc in retrieved_docs.values():