Fully remove visit API (#3621)

* v1

* update indexing logic

* update updates

* nit

* clean up args

* update for clarity + best practices

* nit + logs

* fix

* minor clean up

* remove logs

* quick nit
pablonyx 2025-01-08 13:49:01 -08:00 committed by GitHub
parent eac73a1bf1
commit d7bc32c0ec
15 changed files with 397 additions and 254 deletions

View File

@@ -28,13 +28,35 @@ class RetryDocumentIndex:
wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
stop=stop_after_delay(STOP_AFTER),
)
def delete_single(self, doc_id: str) -> int:
return self.index.delete_single(doc_id)
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
return self.index.delete_single(
doc_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
)
@retry(
retry=retry_if_exception_type(httpx.ReadTimeout),
wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
stop=stop_after_delay(STOP_AFTER),
)
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
return self.index.update_single(doc_id, fields)
def update_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:
return self.index.update_single(
doc_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
fields=fields,
)
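
For context, the retry behavior wrapped around these calls is the standard tenacity pattern shown in the hunk above. The sketch below is illustrative only: MAX_WAIT and STOP_AFTER stand in for the module's constants, and the wrapped index object is assumed to expose the new keyword-only signature.

import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

MAX_WAIT = 30  # assumed: cap (seconds) on the random exponential backoff
STOP_AFTER = 5 * 60  # assumed: give up entirely after five minutes


class RetryDocumentIndexSketch:
    """Retries only transient Vespa read timeouts; other errors surface immediately."""

    def __init__(self, index) -> None:
        self.index = index  # any object with the new delete_single signature

    @retry(
        retry=retry_if_exception_type(httpx.ReadTimeout),
        wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
        stop=stop_after_delay(STOP_AFTER),
    )
    def delete_single(
        self, doc_id: str, *, tenant_id: str | None, chunk_count: int | None
    ) -> int:
        return self.index.delete_single(
            doc_id, tenant_id=tenant_id, chunk_count=chunk_count
        )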

View File

@@ -12,6 +12,7 @@ from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocument
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.document import fetch_chunk_count_for_document
from onyx.db.document import get_document
from onyx.db.document import get_document_connector_count
from onyx.db.document import mark_document_as_modified
@@ -80,7 +81,13 @@ def document_by_cc_pair_cleanup_task(
# delete it from vespa and the db
action = "delete"
chunks_affected = retry_index.delete_single(document_id)
chunk_count = fetch_chunk_count_for_document(document_id, db_session)
chunks_affected = retry_index.delete_single(
document_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=[document_id],
@@ -110,7 +117,12 @@ def document_by_cc_pair_cleanup_task(
)
# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
chunks_affected = retry_index.update_single(document_id, fields=fields)
chunks_affected = retry_index.update_single(
document_id,
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
)
# there are still other cc_pair references to the doc, so just resync to Vespa
delete_document_by_connector_credential_pair__no_commit(

View File

@@ -992,7 +992,12 @@ def vespa_metadata_sync_task(
)
# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
chunks_affected = retry_index.update_single(document_id, fields)
chunks_affected = retry_index.update_single(
document_id,
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
)
# update db last. Worst case = we crash right before this and
# the sync might repeat again later
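
For readers following the fields=fields plumbing above: the shape below is an assumed approximation of VespaDocumentFields, where every attribute defaults to None and None means "leave that Vespa field untouched".

from dataclasses import dataclass


@dataclass
class VespaDocumentFieldsSketch:
    boost: float | None = None
    document_sets: set[str] | None = None
    access: object | None = None  # an ACL-bearing object in the real code
    hidden: bool | None = None


# Hide the document without touching boost, document sets, or ACLs.
fields = VespaDocumentFieldsSketch(hidden=True)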

View File

@@ -685,20 +685,27 @@ def get_document_sources(
def fetch_chunk_counts_for_documents(
document_ids: list[str],
db_session: Session,
) -> list[tuple[str, int | None]]:
) -> list[tuple[str, int]]:
"""
Return a list of (document_id, chunk_count) tuples.
Note: chunk_count might be None if not set in DB,
so we declare it as Optional[int].
If a document_id is not found in the database, it will be returned with a chunk_count of 0.
"""
stmt = select(DbDocument.id, DbDocument.chunk_count).where(
DbDocument.id.in_(document_ids)
)
# results is a list of 'Row' objects, each containing two columns
results = db_session.execute(stmt).all()
# If DbDocument.id is guaranteed to be a string, you can just do row.id;
# otherwise cast to str if you need to be sure it's a string:
return [(str(row[0]), row[1]) for row in results]
# or row.id, row.chunk_count if they are named attributes in your ORM model
# Create a dictionary of document_id to chunk_count
chunk_counts = {str(row.id): row.chunk_count or 0 for row in results}
# Return a list of tuples, using 0 for documents not found in the database
return [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in document_ids]
def fetch_chunk_count_for_document(
document_id: str,
db_session: Session,
) -> int | None:
stmt = select(DbDocument.chunk_count).where(DbDocument.id == document_id)
return db_session.execute(stmt).scalar_one_or_none()
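
A minimal, dependency-free restatement of the new contract: ids missing from the documents table, and rows whose chunk_count is NULL, both come back as 0. The helper below fakes the query result rather than touching the database.

def chunk_counts_with_default(
    document_ids: list[str],
    rows: list[tuple[str, int | None]],
) -> list[tuple[str, int]]:
    # NULL chunk_count and ids absent from the result both collapse to 0.
    found = {doc_id: (count or 0) for doc_id, count in rows}
    return [(doc_id, found.get(doc_id, 0)) for doc_id in document_ids]


assert chunk_counts_with_default(
    ["a", "b", "c"], rows=[("a", 12), ("b", None)]
) == [("a", 12), ("b", 0), ("c", 0)]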

View File

@@ -8,7 +8,7 @@ from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.configs import MULTI_TENANT
DEFAULT_BATCH_SIZE = 30
DEFAULT_INDEX_NAME = "danswer_chunk"
@@ -37,7 +37,10 @@ def translate_boost_count_to_multiplier(boost: int) -> float:
return 2 / (1 + math.exp(-1 * boost / 3))
def assemble_document_chunk_info(
# Assembles a list of Vespa chunk IDs for a document
# given the required context. This can be used to directly query
# Vespa's Document API.
def get_document_chunk_ids(
enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
tenant_id: str | None,
large_chunks_enabled: bool,
@@ -110,10 +113,11 @@ def get_uuid_from_chunk_info(
"large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
)
unique_identifier_string = "_".join([doc_str, chunk_index])
if tenant_id:
if tenant_id and MULTI_TENANT:
unique_identifier_string += "_" + tenant_id
return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
return uuid_value
def get_uuid_from_chunk_info_old(
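
The reason the visit API can be dropped is that chunk IDs are deterministic. A hedged sketch of the scheme implied by the hunk above, with a local MULTI_TENANT flag standing in for shared_configs.configs.MULTI_TENANT:

import uuid

MULTI_TENANT = False  # stand-in for shared_configs.configs.MULTI_TENANT


def chunk_uuid(
    doc_id: str,
    chunk_id: int,
    tenant_id: str | None,
    large_chunk_id: int | None = None,
) -> uuid.UUID:
    chunk_index = (
        "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
    )
    unique_identifier_string = "_".join([doc_id, chunk_index])
    if tenant_id and MULTI_TENANT:
        unique_identifier_string += "_" + tenant_id
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)


# Identical inputs always yield the same ID, so chunk IDs can be recomputed
# from (doc_id, chunk_count) instead of being discovered via the visit API.
assert chunk_uuid("doc-1", 0, None) == chunk_uuid("doc-1", 0, None)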

View File

@@ -109,7 +109,7 @@ class UpdateRequest:
Does not update any of the None fields
"""
document_ids: list[str]
minimal_document_indexing_info: list[MinimalDocumentIndexingInfo]
# all other fields except these 4 will always be left alone by the update request
access: DocumentAccess | None = None
document_sets: set[str] | None = None
@@ -136,7 +136,7 @@ class Verifiable(abc.ABC):
index_name: str,
secondary_index_name: str | None,
*args: Any,
**kwargs: Any
**kwargs: Any,
) -> None:
super().__init__(*args, **kwargs)
self.index_name = index_name
@@ -218,7 +218,13 @@ class Deletable(abc.ABC):
"""
@abc.abstractmethod
def delete_single(self, doc_id: str) -> int:
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
"""
Given a single document id, hard delete it from the document index
@@ -239,7 +245,14 @@ class Updatable(abc.ABC):
"""
@abc.abstractmethod
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
def update_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:
"""
Updates all chunks for a document with the specified fields.
None values mean that the field does not need an update.
@@ -257,7 +270,9 @@ class Updatable(abc.ABC):
raise NotImplementedError
@abc.abstractmethod
def update(self, update_requests: list[UpdateRequest]) -> None:
def update(
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
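
Since UpdateRequest now carries minimal_document_indexing_info instead of bare document ids, a rough sketch of the shapes involved may help. The field names mirror this diff, but the classes here are illustrative stand-ins, not the project's models.

from dataclasses import dataclass


@dataclass
class MinimalDocumentIndexingInfoSketch:
    doc_id: str
    chunk_start_index: int  # first chunk index of the newly indexed version


@dataclass
class EnrichedDocumentIndexingInfoSketch(MinimalDocumentIndexingInfoSketch):
    chunk_end_index: int  # last known chunk index of the previous version
    old_version: bool  # True when chunk_count was unknown and had to be probed


# Re-indexing a document that previously had 8 chunks with a 5-chunk version:
info = EnrichedDocumentIndexingInfoSketch(
    doc_id="doc-1", chunk_start_index=5, chunk_end_index=8, old_version=False
)
# Chunks 5..7 of the old version are stale and can be deleted by ID.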

View File

@@ -13,11 +13,11 @@ from datetime import timedelta
from typing import BinaryIO
from typing import cast
from typing import List
from uuid import UUID
import httpx # type: ignore
import requests # type: ignore
from onyx.configs.app_configs import DOCUMENT_INDEX_NAME
from onyx.configs.chat_configs import DOC_TIME_DECAY
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -25,7 +25,8 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
from onyx.configs.constants import KV_REINDEX_KEY
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.document_index.document_index_utils import assemble_document_chunk_info
from onyx.db.engine import get_session_with_tenant
from onyx.document_index.document_index_utils import get_document_chunk_ids
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentInsertionRecord
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
@@ -35,9 +36,6 @@ from onyx.document_index.interfaces import UpdateRequest
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
from onyx.document_index.vespa.chunk_retrieval import (
get_all_vespa_ids_for_document_id,
)
from onyx.document_index.vespa.chunk_retrieval import (
parallel_visit_api_retrieval,
)
@@ -46,6 +44,9 @@ from onyx.document_index.vespa.deletion import delete_vespa_chunks
from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
from onyx.document_index.vespa.indexing_utils import (
get_multipass_config,
)
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
@@ -337,40 +338,25 @@ class VespaIndex(DocumentIndex):
# documents that have `chunk_count` in the database, but not for
# `old_version` documents.
enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = []
for document_id, doc_count in doc_id_to_previous_chunk_cnt.items():
last_indexed_chunk = doc_id_to_previous_chunk_cnt.get(document_id, None)
# If the document has no `chunk_count` in the database, we know that it
# has the old chunk ID system and we must check for the final chunk index
is_old_version = False
if last_indexed_chunk is None:
is_old_version = True
minimal_doc_info = MinimalDocumentIndexingInfo(
doc_id=document_id,
chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
)
last_indexed_chunk = check_for_final_chunk_existence(
minimal_doc_info=minimal_doc_info,
start_index=doc_id_to_new_chunk_cnt[document_id],
index_name=self.index_name,
http_client=http_client,
)
# If the document has previously indexed chunks, we know it previously existed
if doc_count or last_indexed_chunk:
existing_docs.add(document_id)
enriched_doc_info = EnrichedDocumentIndexingInfo(
doc_id=document_id,
chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
chunk_end_index=last_indexed_chunk,
old_version=is_old_version,
enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [
VespaIndex.enrich_basic_chunk_info(
index_name=self.index_name,
http_client=http_client,
document_id=doc_id,
previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id, 0),
new_chunk_count=doc_id_to_new_chunk_cnt.get(doc_id, 0),
)
enriched_doc_infos.append(enriched_doc_info)
for doc_id in doc_id_to_new_chunk_cnt.keys()
]
for cleaned_doc_info in enriched_doc_infos:
# If the document has previously indexed chunks, we know it previously existed
if cleaned_doc_info.chunk_end_index:
existing_docs.add(cleaned_doc_info.doc_id)
# Now, for each doc, we know exactly where to start and end our deletion
# So let's generate the chunk IDs for each chunk to delete
chunks_to_delete = assemble_document_chunk_info(
chunks_to_delete = get_document_chunk_ids(
enriched_document_info_list=enriched_doc_infos,
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
@@ -447,21 +433,21 @@ class VespaIndex(DocumentIndex):
failure_msg = f"Failed to update document: {future_to_document_id[future]}"
raise requests.HTTPError(failure_msg) from e
def update(self, update_requests: list[UpdateRequest]) -> None:
def update(
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
logger.debug(f"Updating {len(update_requests)} documents in Vespa")
# Handle Vespa character limitations
# Mutating update_requests but it's not used later anyway
for update_request in update_requests:
update_request.document_ids = [
replace_invalid_doc_id_characters(doc_id)
for doc_id in update_request.document_ids
]
for doc_info in update_request.minimal_document_indexing_info:
doc_info.doc_id = replace_invalid_doc_id_characters(doc_info.doc_id)
update_start = time.monotonic()
processed_updates_requests: list[_VespaUpdateRequest] = []
all_doc_chunk_ids: dict[str, list[str]] = {}
all_doc_chunk_ids: dict[str, list[UUID]] = {}
# Fetch all chunks for each document ahead of time
index_names = [self.index_name]
@@ -469,30 +455,24 @@
index_names.append(self.secondary_index_name)
chunk_id_start_time = time.monotonic()
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
future_to_doc_chunk_ids = {
executor.submit(
get_all_vespa_ids_for_document_id,
document_id=document_id,
index_name=index_name,
filters=None,
get_large_chunks=True,
): (document_id, index_name)
for index_name in index_names
for update_request in update_requests
for document_id in update_request.document_ids
}
for future in concurrent.futures.as_completed(future_to_doc_chunk_ids):
document_id, index_name = future_to_doc_chunk_ids[future]
try:
doc_chunk_ids = future.result()
if document_id not in all_doc_chunk_ids:
all_doc_chunk_ids[document_id] = []
all_doc_chunk_ids[document_id].extend(doc_chunk_ids)
except Exception as e:
logger.error(
f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}"
)
with get_vespa_http_client() as http_client:
for update_request in update_requests:
for doc_info in update_request.minimal_document_indexing_info:
for index_name in index_names:
doc_chunk_info = VespaIndex.enrich_basic_chunk_info(
index_name=index_name,
http_client=http_client,
document_id=doc_info.doc_id,
previous_chunk_count=doc_info.chunk_start_index,
new_chunk_count=0,
)
doc_chunk_ids = get_document_chunk_ids(
enriched_document_info_list=[doc_chunk_info],
tenant_id=tenant_id,
large_chunks_enabled=False,
)
all_doc_chunk_ids[doc_info.doc_id] = doc_chunk_ids
logger.debug(
f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs"
)
@@ -521,11 +501,11 @@
logger.error("Update request received but nothing to update")
continue
for document_id in update_request.document_ids:
for doc_chunk_id in all_doc_chunk_ids[document_id]:
for doc_info in update_request.minimal_document_indexing_info:
for doc_chunk_id in all_doc_chunk_ids[doc_info.doc_id]:
processed_updates_requests.append(
_VespaUpdateRequest(
document_id=document_id,
document_id=doc_info.doc_id,
url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}",
update_request=update_dict,
)
@@ -537,36 +517,70 @@
time.monotonic() - update_start,
)
def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
def update_single_chunk(
self,
doc_chunk_id: UUID,
index_name: str,
fields: VespaDocumentFields,
doc_id: str,
) -> None:
"""
Update a single "chunk" (document) in Vespa using its chunk ID.
"""
update_dict: dict[str, dict] = {"fields": {}}
if fields.boost is not None:
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields.document_sets is not None:
# WeightedSet<string> needs a map { item: weight, ... }
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.access is not None:
# Similar to above
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
if fields.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update.")
return
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
with get_vespa_http_client(http2=False) as http_client:
try:
resp = http_client.put(
vespa_url,
headers={"Content-Type": "application/json"},
json=update_dict,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
logger.error(error_message)
raise
def update_single(
self,
doc_id: str,
*,
chunk_count: int | None,
tenant_id: str | None,
fields: VespaDocumentFields,
) -> int:
"""Note: if the document id does not exist, the update will be a no-op and the
function will complete with no errors or exceptions.
Handle other exceptions if you wish to implement retry behavior
"""
total_chunks_updated = 0
# Handle Vespa character limitations
# Mutating update_request but it's not used later anyway
normalized_doc_id = replace_invalid_doc_id_characters(doc_id)
# Build the _VespaUpdateRequest objects
update_dict: dict[str, dict] = {"fields": {}}
if fields.boost is not None:
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields.document_sets is not None:
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.access is not None:
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
if fields.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update")
return 0
doc_chunk_count = 0
index_names = [self.index_name]
if self.secondary_index_name:
@@ -574,66 +588,51 @@ class VespaIndex(DocumentIndex):
with get_vespa_http_client(http2=False) as http_client:
for index_name in index_names:
params = httpx.QueryParams(
{
"selection": f"{index_name}.document_id=='{normalized_doc_id}'",
"cluster": DOCUMENT_INDEX_NAME,
}
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
multipass_config = get_multipass_config(
db_session=db_session,
primary_index=index_name == self.index_name,
)
large_chunks_enabled = multipass_config.enable_large_chunks
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
index_name=index_name,
http_client=http_client,
document_id=doc_id,
previous_chunk_count=chunk_count,
new_chunk_count=0,
)
while True:
try:
vespa_url = (
f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}"
)
logger.debug(f'update_single PUT on URL "{vespa_url}"')
resp = http_client.put(
vespa_url,
params=params,
headers={"Content-Type": "application/json"},
json=update_dict,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(
f"Failed to update chunks, details: {e.response.text}"
)
raise
resp_data = resp.json()
if "documentCount" in resp_data:
chunks_updated = resp_data["documentCount"]
total_chunks_updated += chunks_updated
# Check for continuation token to handle pagination
if "continuation" not in resp_data:
break # Exit loop if no continuation token
if not resp_data["continuation"]:
break # Exit loop if continuation token is empty
params = params.set("continuation", resp_data["continuation"])
logger.debug(
f"VespaIndex.update_single: "
f"index={index_name} "
f"doc={normalized_doc_id} "
f"chunks_updated={total_chunks_updated}"
doc_chunk_ids = get_document_chunk_ids(
enriched_document_info_list=[enriched_doc_infos],
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
)
return total_chunks_updated
logger.info("UPDATING len(doc_chunk_ids)")
def delete_single(self, doc_id: str) -> int:
"""Possibly faster overall than the delete method due to using a single
delete call with a selection query."""
doc_chunk_count += len(doc_chunk_ids)
for doc_chunk_id in doc_chunk_ids:
logger.info("UPDATING CHUNK")
self.update_single_chunk(
doc_chunk_id=doc_chunk_id,
index_name=index_name,
fields=fields,
doc_id=doc_id,
)
logger.info(f"UPDATED A TOTAL OF {doc_chunk_count} CHUNKS for {doc_id}")
return doc_chunk_count
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
total_chunks_deleted = 0
# Vespa deletion is poorly documented ... luckily we found this
# https://docs.vespa.ai/en/operations/batch-delete.html#example
doc_id = replace_invalid_doc_id_characters(doc_id)
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@@ -642,53 +641,41 @@ class VespaIndex(DocumentIndex):
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
with get_vespa_http_client(http2=False) as http_client:
with get_vespa_http_client(
http2=False
) as http_client, concurrent.futures.ThreadPoolExecutor(
max_workers=NUM_THREADS
) as executor:
for index_name in index_names:
params = httpx.QueryParams(
{
"selection": f"{index_name}.document_id=='{doc_id}'",
"cluster": DOCUMENT_INDEX_NAME,
}
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
multipass_config = get_multipass_config(
db_session=db_session,
primary_index=index_name == self.index_name,
)
large_chunks_enabled = multipass_config.enable_large_chunks
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
index_name=index_name,
http_client=http_client,
document_id=doc_id,
previous_chunk_count=chunk_count,
new_chunk_count=0,
)
while True:
try:
vespa_url = (
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}"
)
logger.debug(f'delete_single DELETE on URL "{vespa_url}"')
resp = http_client.delete(
vespa_url,
params=params,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
logger.error(
f"Failed to delete chunk, details: {e.response.text}"
)
raise
resp_data = resp.json()
if "documentCount" in resp_data:
chunks_deleted = resp_data["documentCount"]
total_chunks_deleted += chunks_deleted
# Check for continuation token to handle pagination
if "continuation" not in resp_data:
break # Exit loop if no continuation token
if not resp_data["continuation"]:
break # Exit loop if continuation token is empty
params = params.set("continuation", resp_data["continuation"])
logger.debug(
f"VespaIndex.delete_single: "
f"index={index_name} "
f"doc={doc_id} "
f"chunks_deleted={total_chunks_deleted}"
chunks_to_delete = get_document_chunk_ids(
enriched_document_info_list=[enriched_doc_infos],
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
)
for doc_chunk_ids_batch in batch_generator(
chunks_to_delete, BATCH_SIZE
):
total_chunks_deleted += len(doc_chunk_ids_batch)
delete_vespa_chunks(
doc_chunk_ids=doc_chunk_ids_batch,
index_name=index_name,
http_client=http_client,
executor=executor,
)
return total_chunks_deleted
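
The net effect of the change above: instead of a selection query paged through continuation tokens, each chunk is addressed directly through Vespa's /document/v1 API. A hedged sketch follows; the project's DOCUMENT_ID_ENDPOINT template and namespace may differ from the generic path used here.

import httpx

VESPA_APP = "http://localhost:8080"  # assumed local Vespa endpoint


def delete_chunk(index_name: str, doc_chunk_id: str, client: httpx.Client) -> None:
    # Vespa's generic Document API path: /document/v1/<namespace>/<doctype>/docid/<id>
    url = f"{VESPA_APP}/document/v1/default/{index_name}/docid/{doc_chunk_id}"
    client.delete(url).raise_for_status()


def update_chunk_fields(
    index_name: str, doc_chunk_id: str, fields: dict, client: httpx.Client
) -> None:
    # Partial update: every field uses Vespa's {"assign": ...} update syntax.
    url = f"{VESPA_APP}/document/v1/default/{index_name}/docid/{doc_chunk_id}"
    body = {"fields": {name: {"assign": value} for name, value in fields.items()}}
    client.put(url, json=body).raise_for_status()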
@@ -787,8 +774,51 @@ class VespaIndex(DocumentIndex):
return query_vespa(params)
# Retrieves chunk information for a document:
# - Determines the last indexed chunk
# - Identifies if the document uses the old or new chunk ID system
# This data is crucial for Vespa document updates without relying on the visit API.
@classmethod
def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
def enrich_basic_chunk_info(
cls,
index_name: str,
http_client: httpx.Client,
document_id: str,
previous_chunk_count: int | None = None,
new_chunk_count: int = 0,
) -> EnrichedDocumentIndexingInfo:
last_indexed_chunk = previous_chunk_count
# If the document has no `chunk_count` in the database, we know that it
# has the old chunk ID system and we must check for the final chunk index
is_old_version = False
if last_indexed_chunk is None:
is_old_version = True
minimal_doc_info = MinimalDocumentIndexingInfo(
doc_id=document_id, chunk_start_index=new_chunk_count
)
last_indexed_chunk = check_for_final_chunk_existence(
minimal_doc_info=minimal_doc_info,
start_index=new_chunk_count,
index_name=index_name,
http_client=http_client,
)
enriched_doc_info = EnrichedDocumentIndexingInfo(
doc_id=document_id,
chunk_start_index=new_chunk_count,
chunk_end_index=last_indexed_chunk,
old_version=is_old_version,
)
return enriched_doc_info
@classmethod
def delete_entries_by_tenant_id(
cls,
*,
tenant_id: str,
index_name: str,
) -> None:
"""
Deletes all entries in the specified index with the given tenant_id.

View File

@@ -7,10 +7,15 @@ from http import HTTPStatus
import httpx
from retry import retry
from sqlalchemy.orm import Session
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.document_index_utils import get_uuid_from_chunk
from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
@@ -45,6 +50,8 @@ from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import EmbeddingProvider
from onyx.indexing.models import MultipassConfig
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -129,7 +136,9 @@ def _index_vespa_chunk(
document = chunk.source_document
# No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
vespa_chunk_id = str(get_uuid_from_chunk(chunk))
embeddings = chunk.embeddings
embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
@@ -263,5 +272,49 @@ def check_for_final_chunk_existence(
)
if not _does_doc_chunk_exist(doc_chunk_id, index_name, http_client):
return index
index += 1
def should_use_multipass(search_settings: SearchSettings | None) -> bool:
"""
Determines whether multipass should be used based on the search settings
or the default config if settings are unavailable.
"""
if search_settings is not None:
return search_settings.multipass_indexing
return ENABLE_MULTIPASS_INDEXING
def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool:
"""
Given multipass usage and an embedder, decides whether large chunks are allowed
based on model/provider constraints.
"""
# Only local models that support a larger context are from Nomic
# Cohere does not support larger contexts (they recommend not going above ~512 tokens)
return (
multipass
and search_settings.model_name.startswith("nomic-ai")
and search_settings.provider_type != EmbeddingProvider.COHERE
)
def get_multipass_config(
db_session: Session, primary_index: bool = True
) -> MultipassConfig:
"""
Determines whether to enable multipass and large chunks by examining
the current search settings and the embedder configuration.
"""
search_settings = (
get_current_search_settings(db_session)
if primary_index
else get_secondary_search_settings(db_session)
)
multipass = should_use_multipass(search_settings)
if not search_settings:
return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)
enable_large_chunks = can_use_large_chunks(multipass, search_settings)
return MultipassConfig(
multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
)
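
A small, dependency-free restatement of the decision implemented above: multipass comes from the search settings (or the ENABLE_MULTIPASS_INDEXING default), and large chunks additionally require a Nomic model that is not served through Cohere. SimpleSettings is a stand-in for the ORM model, and the provider strings are illustrative.

from dataclasses import dataclass


@dataclass
class SimpleSettings:
    multipass_indexing: bool
    model_name: str
    provider_type: str | None  # e.g. "cohere", "openai", or None for a local model


def large_chunks_allowed(settings: SimpleSettings) -> bool:
    return (
        settings.multipass_indexing
        and settings.model_name.startswith("nomic-ai")
        and settings.provider_type != "cohere"
    )


assert large_chunks_allowed(SimpleSettings(True, "nomic-ai/nomic-embed-text-v1", None))
assert not large_chunks_allowed(SimpleSettings(True, "text-embedding-3-small", "openai"))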

View File

@@ -11,7 +11,6 @@ from sqlalchemy.orm import Session
from onyx.access.access import get_access_for_documents
from onyx.access.models import DocumentAccess
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
from onyx.configs.app_configs import INDEXING_EXCEPTION_LIMIT
from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
from onyx.configs.constants import DEFAULT_BOOST
@@ -31,12 +30,14 @@ from onyx.db.document import upsert_documents
from onyx.db.document_set import fetch_document_sets_for_documents
from onyx.db.index_attempt import create_index_attempt_error
from onyx.db.models import Document as DBDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.db.tag import create_or_add_document_tag
from onyx.db.tag import create_or_add_document_tag_list
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentMetadata
from onyx.document_index.interfaces import IndexBatchParams
from onyx.document_index.vespa.indexing_utils import (
get_multipass_config,
)
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import IndexingEmbedder
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
@@ -44,7 +45,6 @@ from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.utils.logger import setup_logger
from onyx.utils.timing import log_function_time
from shared_configs.enums import EmbeddingProvider
logger = setup_logger()
@@ -479,28 +479,6 @@ def index_doc_batch(
return result
def check_enable_large_chunks_and_multipass(
embedder: IndexingEmbedder, db_session: Session
) -> tuple[bool, bool]:
search_settings = get_current_search_settings(db_session)
multipass = (
search_settings.multipass_indexing
if search_settings
else ENABLE_MULTIPASS_INDEXING
)
enable_large_chunks = (
multipass
and
# Only local models that supports larger context are from Nomic
(embedder.model_name.startswith("nomic-ai"))
and
# Cohere does not support larger context they recommend not going above 512 tokens
embedder.provider_type != EmbeddingProvider.COHERE
)
return multipass, enable_large_chunks
def build_indexing_pipeline(
*,
embedder: IndexingEmbedder,
@@ -513,14 +491,12 @@ def build_indexing_pipeline(
callback: IndexingHeartbeatInterface | None = None,
) -> IndexingPipelineProtocol:
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
multipass, enable_large_chunks = check_enable_large_chunks_and_multipass(
embedder, db_session
)
multipass_config = get_multipass_config(db_session, primary_index=True)
chunker = chunker or Chunker(
tokenizer=embedder.embedding_model.tokenizer,
enable_multipass=multipass,
enable_large_chunks=enable_large_chunks,
enable_multipass=multipass_config.multipass_indexing,
enable_large_chunks=multipass_config.enable_large_chunks,
# after every doc, update status in case there are a bunch of really long docs
callback=callback,
)

View File

@@ -154,3 +154,8 @@ class IndexingSetting(EmbeddingModelDetail):
index_name=search_settings.index_name,
multipass_indexing=search_settings.multipass_indexing,
)
class MultipassConfig(BaseModel):
multipass_indexing: bool
enable_large_chunks: bool
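
For completeness, the new model is a plain pydantic container with two required flags. A minimal usage sketch:

from pydantic import BaseModel


class MultipassConfig(BaseModel):
    multipass_indexing: bool
    enable_large_chunks: bool


config = MultipassConfig(multipass_indexing=True, enable_large_chunks=False)
assert config.multipass_indexing and not config.enable_large_chunks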

View File

@@ -5,6 +5,7 @@ import sys
from sqlalchemy import delete
from sqlalchemy.orm import Session
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus
# Modify sys.path
@@ -38,7 +39,6 @@ from onyx.db.engine import get_session_context_manager
from onyx.document_index.factory import get_default_document_index
from onyx.file_store.file_store import get_default_file_store
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.db.document import delete_documents_complete__no_commit
# pylint: enable=E402
# flake8: noqa: E402
@@ -71,13 +71,16 @@ def _unsafe_deletion(
if not documents:
break
document_ids = [document.id for document in documents]
for doc_id in document_ids:
document_index.delete_single(doc_id)
for document in documents:
document_index.delete_single(
doc_id=document.id,
tenant_id=None,
chunk_count=document.chunk_count,
)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=document_ids,
document_ids=[document.id for document in documents],
)
num_docs_deleted += len(documents)
@@ -216,6 +219,7 @@ if __name__ == "__main__":
parser.add_argument(
"connector_id", type=int, help="The ID of the connector to delete"
)
args = parser.parse_args()
with get_session_context_manager() as db_session:
_delete_connector(args.connector_id, db_session)

View File

@@ -15,6 +15,7 @@ from onyx.db.engine import get_session_context_manager # noqa: E402
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
from onyx.db.search_settings import get_current_search_settings # noqa: E402
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
from onyx.db.document import get_document # noqa: E402
BATCH_SIZE = 100
@@ -63,6 +64,9 @@ def main() -> None:
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
def process_doc(doc_id: str) -> str | None:
document = get_document(doc_id, db_session)
if not document:
return None
# Check if document exists in Vespa first
try:
chunks = vespa_index.id_based_retrieval(
@@ -83,7 +87,9 @@ def main() -> None:
try:
print(f"Deleting document {doc_id} in Vespa")
chunks_deleted = vespa_index.delete_single(doc_id)
chunks_deleted = vespa_index.delete_single(
doc_id, tenant_id=None, chunk_count=document.chunk_count
)
if chunks_deleted > 0:
print(
f"Deleted {chunks_deleted} chunks for document {doc_id}"

View File

@@ -21,8 +21,9 @@ def _verify_document_permissions(
group_names: list[str] | None = None,
doc_creating_user: DATestUser | None = None,
) -> None:
acl_keys = set(retrieved_doc["access_control_list"].keys())
acl_keys = set(retrieved_doc.get("access_control_list", {}).keys())
print(f"ACL keys: {acl_keys}")
if cc_pair.access_type == AccessType.PUBLIC:
if "PUBLIC" not in acl_keys:
raise ValueError(
@@ -42,8 +43,9 @@
found_group_keys = {key for key in acl_keys if key.startswith("group:")}
if found_group_keys != expected_group_keys:
raise ValueError(
f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. Found: {found_group_keys}, \n"
f"Expected: {expected_group_keys}"
f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. "
f"Expected: {expected_group_keys} Found: {found_group_keys}\n"
f"All ACL keys: {acl_keys}"
)
if doc_set_names is not None:
@@ -153,9 +155,11 @@
) -> None:
doc_ids = [document.id for document in cc_pair.documents]
retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
retrieved_docs = {
doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict
}
# Left this here for debugging purposes.
# import json
# for doc in retrieved_docs.values():