Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-09 20:39:29 +02:00)
Fully remove visit API (#3621)
* v1
* update indexing logic
* update updates
* nit
* clean up args
* update for clarity + best practices
* nit + logs
* fix
* minor clean up
* remove logs
* quick nit
Parent: eac73a1bf1
Commit: d7bc32c0ec
@@ -28,13 +28,35 @@ class RetryDocumentIndex:
         wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
         stop=stop_after_delay(STOP_AFTER),
     )
-    def delete_single(self, doc_id: str) -> int:
-        return self.index.delete_single(doc_id)
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
+        return self.index.delete_single(
+            doc_id,
+            tenant_id=tenant_id,
+            chunk_count=chunk_count,
+        )
 
     @retry(
         retry=retry_if_exception_type(httpx.ReadTimeout),
         wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
         stop=stop_after_delay(STOP_AFTER),
     )
-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
-        return self.index.update_single(doc_id, fields)
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+        fields: VespaDocumentFields,
+    ) -> int:
+        return self.index.update_single(
+            doc_id,
+            tenant_id=tenant_id,
+            chunk_count=chunk_count,
+            fields=fields,
+        )
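For context, the `@retry` decorators above are tenacity's: Vespa read timeouts are retried with jittered exponential backoff until a deadline. A minimal, self-contained sketch of the same pattern, where `MAX_WAIT`, `STOP_AFTER`, and the `index` object are illustrative stand-ins rather than the repo's definitions:

```python
import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

MAX_WAIT = 30     # stand-in values; the real constants are defined in the module
STOP_AFTER = 300


@retry(
    retry=retry_if_exception_type(httpx.ReadTimeout),  # only retry read timeouts
    wait=wait_random_exponential(multiplier=1, max=MAX_WAIT),
    stop=stop_after_delay(STOP_AFTER),
)
def delete_with_retry(index, doc_id: str, tenant_id: str | None, chunk_count: int | None) -> int:
    # An httpx.ReadTimeout raised here triggers a retry with jittered exponential
    # backoff; once STOP_AFTER seconds have elapsed, the exception propagates.
    return index.delete_single(doc_id, tenant_id=tenant_id, chunk_count=chunk_count)
```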
@@ -12,6 +12,7 @@ from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
 from onyx.configs.constants import OnyxCeleryTask
 from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
 from onyx.db.document import delete_documents_complete__no_commit
+from onyx.db.document import fetch_chunk_count_for_document
 from onyx.db.document import get_document
 from onyx.db.document import get_document_connector_count
 from onyx.db.document import mark_document_as_modified
@@ -80,7 +81,13 @@ def document_by_cc_pair_cleanup_task(
             # delete it from vespa and the db
             action = "delete"
 
-            chunks_affected = retry_index.delete_single(document_id)
+            chunk_count = fetch_chunk_count_for_document(document_id, db_session)
+
+            chunks_affected = retry_index.delete_single(
+                document_id,
+                tenant_id=tenant_id,
+                chunk_count=chunk_count,
+            )
             delete_documents_complete__no_commit(
                 db_session=db_session,
                 document_ids=[document_id],
@@ -110,7 +117,12 @@ def document_by_cc_pair_cleanup_task(
             )
 
             # update Vespa. OK if doc doesn't exist. Raises exception otherwise.
-            chunks_affected = retry_index.update_single(document_id, fields=fields)
+            chunks_affected = retry_index.update_single(
+                document_id,
+                tenant_id=tenant_id,
+                chunk_count=doc.chunk_count,
+                fields=fields,
+            )
 
             # there are still other cc_pair references to the doc, so just resync to Vespa
             delete_document_by_connector_credential_pair__no_commit(
@@ -992,7 +992,12 @@ def vespa_metadata_sync_task(
         )
 
         # update Vespa. OK if doc doesn't exist. Raises exception otherwise.
-        chunks_affected = retry_index.update_single(document_id, fields)
+        chunks_affected = retry_index.update_single(
+            document_id,
+            tenant_id=tenant_id,
+            chunk_count=doc.chunk_count,
+            fields=fields,
+        )
 
         # update db last. Worst case = we crash right before this and
         # the sync might repeat again later
@@ -685,20 +685,27 @@ def get_document_sources(
 def fetch_chunk_counts_for_documents(
     document_ids: list[str],
     db_session: Session,
-) -> list[tuple[str, int | None]]:
+) -> list[tuple[str, int]]:
     """
     Return a list of (document_id, chunk_count) tuples.
-    Note: chunk_count might be None if not set in DB,
-    so we declare it as Optional[int].
+    If a document_id is not found in the database, it will be returned with a chunk_count of 0.
     """
     stmt = select(DbDocument.id, DbDocument.chunk_count).where(
         DbDocument.id.in_(document_ids)
     )
 
     # results is a list of 'Row' objects, each containing two columns
     results = db_session.execute(stmt).all()
 
-    # If DbDocument.id is guaranteed to be a string, you can just do row.id;
-    # otherwise cast to str if you need to be sure it's a string:
-    return [(str(row[0]), row[1]) for row in results]
-    # or row.id, row.chunk_count if they are named attributes in your ORM model
+    # Create a dictionary of document_id to chunk_count
+    chunk_counts = {str(row.id): row.chunk_count or 0 for row in results}
+
+    # Return a list of tuples, using 0 for documents not found in the database
+    return [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in document_ids]
+
+
+def fetch_chunk_count_for_document(
+    document_id: str,
+    db_session: Session,
+) -> int | None:
+    stmt = select(DbDocument.chunk_count).where(DbDocument.id == document_id)
+    return db_session.execute(stmt).scalar_one_or_none()
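A small, self-contained illustration of the lookup behavior the new helper documents: rows whose chunk_count is NULL fall back to 0, and so do requested ids that are not in the database at all (the values here are made up):

```python
rows = [("doc-a", 12), ("doc-b", None)]      # pretend DB rows: (document_id, chunk_count)
requested = ["doc-a", "doc-b", "doc-c"]      # "doc-c" does not exist in the DB

chunk_counts = {doc_id: count or 0 for doc_id, count in rows}
result = [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in requested]
assert result == [("doc-a", 12), ("doc-b", 0), ("doc-c", 0)]
```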
@@ -8,7 +8,7 @@ from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
 from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
-from onyx.indexing.models import DocMetadataAwareIndexChunk
+from shared_configs.configs import MULTI_TENANT
 
 DEFAULT_BATCH_SIZE = 30
 DEFAULT_INDEX_NAME = "danswer_chunk"
@@ -37,7 +37,10 @@ def translate_boost_count_to_multiplier(boost: int) -> float:
     return 2 / (1 + math.exp(-1 * boost / 3))
 
 
-def assemble_document_chunk_info(
+# Assembles a list of Vespa chunk IDs for a document
+# given the required context. This can be used to directly query
+# Vespa's Document API.
+def get_document_chunk_ids(
     enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
     tenant_id: str | None,
     large_chunks_enabled: bool,
@@ -110,10 +113,11 @@ def get_uuid_from_chunk_info(
         "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
     )
     unique_identifier_string = "_".join([doc_str, chunk_index])
-    if tenant_id:
+    if tenant_id and MULTI_TENANT:
         unique_identifier_string += "_" + tenant_id
 
-    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+    uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+    return uuid_value
 
 
 def get_uuid_from_chunk_info_old(
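This deterministic naming is what makes the visit API removable: a chunk's Vespa document ID is a UUIDv5 derived from the document id, chunk index, and (in multi-tenant mode) the tenant id, so it can be recomputed client-side instead of being listed from Vespa. A minimal sketch of that derivation, following the scheme shown above (the function name and default arguments are illustrative, not the repo's helper):

```python
import uuid


def derive_chunk_uuid(
    doc_id: str,
    chunk_id: int,
    tenant_id: str | None = None,
    large_chunk_id: int | None = None,
    multi_tenant: bool = False,
) -> uuid.UUID:
    # Mirrors the scheme above: "<doc id>_<chunk index>", plus a tenant suffix
    # when multi-tenancy is on. Same inputs always yield the same UUID.
    chunk_index = (
        "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
    )
    unique_identifier_string = "_".join([doc_id, chunk_index])
    if tenant_id and multi_tenant:
        unique_identifier_string += "_" + tenant_id
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)


assert derive_chunk_uuid("doc-123", 0) == derive_chunk_uuid("doc-123", 0)
```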
@@ -109,7 +109,7 @@ class UpdateRequest:
     Does not update any of the None fields
     """
 
-    document_ids: list[str]
+    minimal_document_indexing_info: list[MinimalDocumentIndexingInfo]
    # all other fields except these 4 will always be left alone by the update request
     access: DocumentAccess | None = None
     document_sets: set[str] | None = None
@@ -136,7 +136,7 @@ class Verifiable(abc.ABC):
         index_name: str,
         secondary_index_name: str | None,
         *args: Any,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
         self.index_name = index_name
@@ -218,7 +218,13 @@ class Deletable(abc.ABC):
     """
 
     @abc.abstractmethod
-    def delete_single(self, doc_id: str) -> int:
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
         """
         Given a single document id, hard delete it from the document index
 
@@ -239,7 +245,14 @@ class Updatable(abc.ABC):
     """
 
     @abc.abstractmethod
-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+        fields: VespaDocumentFields,
+    ) -> int:
         """
         Updates all chunks for a document with the specified fields.
         None values mean that the field does not need an update.
@@ -257,7 +270,9 @@
         raise NotImplementedError
 
     @abc.abstractmethod
-    def update(self, update_requests: list[UpdateRequest]) -> None:
+    def update(
+        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
+    ) -> None:
         """
         Updates some set of chunks. The document and fields to update are specified in the update
         requests. Each update request in the list applies its changes to a list of document ids.
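The bare `*` in these new abstract signatures makes tenant_id and chunk_count keyword-only, so every call site has to spell them out explicitly. A tiny illustration of that contract with a toy implementation (not the repo's classes):

```python
import abc


class Deletable(abc.ABC):
    @abc.abstractmethod
    def delete_single(
        self,
        doc_id: str,
        *,
        tenant_id: str | None,
        chunk_count: int | None,
    ) -> int:
        ...


class InMemoryIndex(Deletable):
    """Toy implementation; a real index would derive chunk IDs from chunk_count."""

    def __init__(self) -> None:
        self._chunks = {"doc-1": 3}

    def delete_single(
        self, doc_id: str, *, tenant_id: str | None, chunk_count: int | None
    ) -> int:
        return self._chunks.pop(doc_id, 0)


index = InMemoryIndex()
assert index.delete_single("doc-1", tenant_id=None, chunk_count=3) == 3
# index.delete_single("doc-1", None, 3)  # TypeError: arguments are keyword-only
```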
@@ -13,11 +13,11 @@ from datetime import timedelta
 from typing import BinaryIO
 from typing import cast
-from typing import List
+from uuid import UUID
 
 import httpx  # type: ignore
 import requests  # type: ignore
 
 from onyx.configs.app_configs import DOCUMENT_INDEX_NAME
 from onyx.configs.chat_configs import DOC_TIME_DECAY
 from onyx.configs.chat_configs import NUM_RETURNED_HITS
 from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -25,7 +25,8 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
 from onyx.configs.constants import KV_REINDEX_KEY
 from onyx.context.search.models import IndexFilters
 from onyx.context.search.models import InferenceChunkUncleaned
-from onyx.document_index.document_index_utils import assemble_document_chunk_info
+from onyx.db.engine import get_session_with_tenant
+from onyx.document_index.document_index_utils import get_document_chunk_ids
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentInsertionRecord
 from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
@@ -35,9 +36,6 @@ from onyx.document_index.interfaces import UpdateRequest
 from onyx.document_index.interfaces import VespaChunkRequest
 from onyx.document_index.interfaces import VespaDocumentFields
 from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
-from onyx.document_index.vespa.chunk_retrieval import (
-    get_all_vespa_ids_for_document_id,
-)
 from onyx.document_index.vespa.chunk_retrieval import (
     parallel_visit_api_retrieval,
 )
@@ -46,6 +44,9 @@ from onyx.document_index.vespa.deletion import delete_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
 from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
+from onyx.document_index.vespa.indexing_utils import (
+    get_multipass_config,
+)
 from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa.shared_utils.utils import (
     replace_invalid_doc_id_characters,
@@ -337,40 +338,25 @@ class VespaIndex(DocumentIndex):
         # documents that have `chunk_count` in the database, but not for
         # `old_version` documents.
 
-        enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = []
-        for document_id, doc_count in doc_id_to_previous_chunk_cnt.items():
-            last_indexed_chunk = doc_id_to_previous_chunk_cnt.get(document_id, None)
-            # If the document has no `chunk_count` in the database, we know that it
-            # has the old chunk ID system and we must check for the final chunk index
-            is_old_version = False
-            if last_indexed_chunk is None:
-                is_old_version = True
-                minimal_doc_info = MinimalDocumentIndexingInfo(
-                    doc_id=document_id,
-                    chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
-                )
-                last_indexed_chunk = check_for_final_chunk_existence(
-                    minimal_doc_info=minimal_doc_info,
-                    start_index=doc_id_to_new_chunk_cnt[document_id],
-                    index_name=self.index_name,
-                    http_client=http_client,
-                )
-
-            # If the document has previously indexed chunks, we know it previously existed
-            if doc_count or last_indexed_chunk:
-                existing_docs.add(document_id)
-
-            enriched_doc_info = EnrichedDocumentIndexingInfo(
-                doc_id=document_id,
-                chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
-                chunk_end_index=last_indexed_chunk,
-                old_version=is_old_version,
+        enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [
+            VespaIndex.enrich_basic_chunk_info(
+                index_name=self.index_name,
+                http_client=http_client,
+                document_id=doc_id,
+                previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id, 0),
+                new_chunk_count=doc_id_to_new_chunk_cnt.get(doc_id, 0),
             )
-            enriched_doc_infos.append(enriched_doc_info)
+            for doc_id in doc_id_to_new_chunk_cnt.keys()
+        ]
+
+        for cleaned_doc_info in enriched_doc_infos:
+            # If the document has previously indexed chunks, we know it previously existed
+            if cleaned_doc_info.chunk_end_index:
+                existing_docs.add(cleaned_doc_info.doc_id)
 
         # Now, for each doc, we know exactly where to start and end our deletion
         # So let's generate the chunk IDs for each chunk to delete
-        chunks_to_delete = assemble_document_chunk_info(
+        chunks_to_delete = get_document_chunk_ids(
            enriched_document_info_list=enriched_doc_infos,
            tenant_id=tenant_id,
            large_chunks_enabled=large_chunks_enabled,
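Conceptually, once the start and end chunk indices for a document are known, every chunk UUID can be enumerated without asking Vespa, which is what `get_document_chunk_ids` replaces the visit API lookup with. A rough sketch of that enumeration under the same assumption (types and field names here are illustrative, not the repo's):

```python
import uuid
from dataclasses import dataclass


@dataclass
class EnrichedDocInfo:
    doc_id: str
    chunk_start_index: int
    chunk_end_index: int  # exclusive upper bound in this sketch


def chunk_ids_for_doc(info: EnrichedDocInfo, tenant_id: str | None = None) -> list[uuid.UUID]:
    ids: list[uuid.UUID] = []
    for chunk_index in range(info.chunk_start_index, info.chunk_end_index):
        key = f"{info.doc_id}_{chunk_index}"
        if tenant_id:
            key += f"_{tenant_id}"
        ids.append(uuid.uuid5(uuid.NAMESPACE_X500, key))
    return ids


# Every chunk ID for chunks [0, 4) of "doc-1" is derived locally.
print(chunk_ids_for_doc(EnrichedDocInfo("doc-1", 0, 4)))
```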
@@ -447,21 +433,21 @@ class VespaIndex(DocumentIndex):
                     failure_msg = f"Failed to update document: {future_to_document_id[future]}"
                     raise requests.HTTPError(failure_msg) from e
 
-    def update(self, update_requests: list[UpdateRequest]) -> None:
+    def update(
+        self, update_requests: list[UpdateRequest], *, tenant_id: str | None
+    ) -> None:
         logger.debug(f"Updating {len(update_requests)} documents in Vespa")
 
         # Handle Vespa character limitations
         # Mutating update_requests but it's not used later anyway
         for update_request in update_requests:
-            update_request.document_ids = [
-                replace_invalid_doc_id_characters(doc_id)
-                for doc_id in update_request.document_ids
-            ]
+            for doc_info in update_request.minimal_document_indexing_info:
+                doc_info.doc_id = replace_invalid_doc_id_characters(doc_info.doc_id)
 
         update_start = time.monotonic()
 
         processed_updates_requests: list[_VespaUpdateRequest] = []
-        all_doc_chunk_ids: dict[str, list[str]] = {}
+        all_doc_chunk_ids: dict[str, list[UUID]] = {}
 
         # Fetch all chunks for each document ahead of time
         index_names = [self.index_name]
@@ -469,30 +455,24 @@ class VespaIndex(DocumentIndex):
             index_names.append(self.secondary_index_name)
 
         chunk_id_start_time = time.monotonic()
-        with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
-            future_to_doc_chunk_ids = {
-                executor.submit(
-                    get_all_vespa_ids_for_document_id,
-                    document_id=document_id,
-                    index_name=index_name,
-                    filters=None,
-                    get_large_chunks=True,
-                ): (document_id, index_name)
-                for index_name in index_names
-                for update_request in update_requests
-                for document_id in update_request.document_ids
-            }
-            for future in concurrent.futures.as_completed(future_to_doc_chunk_ids):
-                document_id, index_name = future_to_doc_chunk_ids[future]
-                try:
-                    doc_chunk_ids = future.result()
-                    if document_id not in all_doc_chunk_ids:
-                        all_doc_chunk_ids[document_id] = []
-                    all_doc_chunk_ids[document_id].extend(doc_chunk_ids)
-                except Exception as e:
-                    logger.error(
-                        f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}"
-                    )
+        with get_vespa_http_client() as http_client:
+            for update_request in update_requests:
+                for doc_info in update_request.minimal_document_indexing_info:
+                    for index_name in index_names:
+                        doc_chunk_info = VespaIndex.enrich_basic_chunk_info(
+                            index_name=index_name,
+                            http_client=http_client,
+                            document_id=doc_info.doc_id,
+                            previous_chunk_count=doc_info.chunk_start_index,
+                            new_chunk_count=0,
+                        )
+                        doc_chunk_ids = get_document_chunk_ids(
+                            enriched_document_info_list=[doc_chunk_info],
+                            tenant_id=tenant_id,
+                            large_chunks_enabled=False,
+                        )
+                        all_doc_chunk_ids[doc_info.doc_id] = doc_chunk_ids
 
         logger.debug(
             f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs"
         )
@@ -521,11 +501,11 @@ class VespaIndex(DocumentIndex):
                 logger.error("Update request received but nothing to update")
                 continue
 
-            for document_id in update_request.document_ids:
-                for doc_chunk_id in all_doc_chunk_ids[document_id]:
+            for doc_info in update_request.minimal_document_indexing_info:
+                for doc_chunk_id in all_doc_chunk_ids[doc_info.doc_id]:
                     processed_updates_requests.append(
                         _VespaUpdateRequest(
-                            document_id=document_id,
+                            document_id=doc_info.doc_id,
                             url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}",
                             update_request=update_dict,
                         )
@@ -537,36 +517,70 @@ class VespaIndex(DocumentIndex):
             time.monotonic() - update_start,
         )
 
-    def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int:
+    def update_single_chunk(
+        self,
+        doc_chunk_id: UUID,
+        index_name: str,
+        fields: VespaDocumentFields,
+        doc_id: str,
+    ) -> None:
+        """
+        Update a single "chunk" (document) in Vespa using its chunk ID.
+        """
+
+        update_dict: dict[str, dict] = {"fields": {}}
+
+        if fields.boost is not None:
+            update_dict["fields"][BOOST] = {"assign": fields.boost}
+
+        if fields.document_sets is not None:
+            # WeightedSet<string> needs a map { item: weight, ... }
+            update_dict["fields"][DOCUMENT_SETS] = {
+                "assign": {document_set: 1 for document_set in fields.document_sets}
+            }
+
+        if fields.access is not None:
+            # Similar to above
+            update_dict["fields"][ACCESS_CONTROL_LIST] = {
+                "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
+            }
+
+        if fields.hidden is not None:
+            update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
+
+        if not update_dict["fields"]:
+            logger.error("Update request received but nothing to update.")
+            return
+
+        vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
+
+        with get_vespa_http_client(http2=False) as http_client:
+            try:
+                resp = http_client.put(
+                    vespa_url,
+                    headers={"Content-Type": "application/json"},
+                    json=update_dict,
+                )
+                resp.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
+                logger.error(error_message)
+                raise
+
+    def update_single(
+        self,
+        doc_id: str,
+        *,
+        chunk_count: int | None,
+        tenant_id: str | None,
+        fields: VespaDocumentFields,
+    ) -> int:
         """Note: if the document id does not exist, the update will be a no-op and the
         function will complete with no errors or exceptions.
         Handle other exceptions if you wish to implement retry behavior
         """
 
-        total_chunks_updated = 0
-
         # Handle Vespa character limitations
         # Mutating update_request but it's not used later anyway
         normalized_doc_id = replace_invalid_doc_id_characters(doc_id)
 
-        # Build the _VespaUpdateRequest objects
-        update_dict: dict[str, dict] = {"fields": {}}
-        if fields.boost is not None:
-            update_dict["fields"][BOOST] = {"assign": fields.boost}
-        if fields.document_sets is not None:
-            update_dict["fields"][DOCUMENT_SETS] = {
-                "assign": {document_set: 1 for document_set in fields.document_sets}
-            }
-        if fields.access is not None:
-            update_dict["fields"][ACCESS_CONTROL_LIST] = {
-                "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
-            }
-        if fields.hidden is not None:
-            update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
-
-        if not update_dict["fields"]:
-            logger.error("Update request received but nothing to update")
-            return 0
+        doc_chunk_count = 0
 
         index_names = [self.index_name]
         if self.secondary_index_name:
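For reference, the `{"assign": ...}` payloads above follow Vespa's Document V1 partial-update format: only the listed fields are overwritten, anything omitted is left untouched, and `?create=true` upserts a missing document. A standalone sketch of issuing such an update with httpx; the endpoint, field names, and function are illustrative, not taken from the repo:

```python
import httpx


def assign_fields(vespa_url: str, boost: float | None = None, hidden: bool | None = None) -> None:
    """PUT a partial update; only the fields included below are touched."""
    fields: dict[str, dict] = {}
    if boost is not None:
        fields["boost"] = {"assign": boost}
    if hidden is not None:
        fields["hidden"] = {"assign": hidden}
    if not fields:
        return  # nothing to update

    with httpx.Client(http2=False) as client:
        resp = client.put(
            vespa_url,
            headers={"Content-Type": "application/json"},
            json={"fields": fields},
        )
        resp.raise_for_status()  # non-2xx surfaces as httpx.HTTPStatusError


# Example call (hypothetical endpoint and chunk id):
# assign_fields(
#     "http://localhost:8080/document/v1/default/danswer_chunk/docid/<chunk-uuid>?create=true",
#     boost=1.5,
# )
```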
@@ -574,66 +588,51 @@ class VespaIndex(DocumentIndex):
 
         with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
-                params = httpx.QueryParams(
-                    {
-                        "selection": f"{index_name}.document_id=='{normalized_doc_id}'",
-                        "cluster": DOCUMENT_INDEX_NAME,
-                    }
-                )
-
-                while True:
-                    try:
-                        vespa_url = (
-                            f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}"
-                        )
-                        logger.debug(f'update_single PUT on URL "{vespa_url}"')
-                        resp = http_client.put(
-                            vespa_url,
-                            params=params,
-                            headers={"Content-Type": "application/json"},
-                            json=update_dict,
-                        )
-
-                        resp.raise_for_status()
-                    except httpx.HTTPStatusError as e:
-                        logger.error(
-                            f"Failed to update chunks, details: {e.response.text}"
-                        )
-                        raise
-
-                    resp_data = resp.json()
-
-                    if "documentCount" in resp_data:
-                        chunks_updated = resp_data["documentCount"]
-                        total_chunks_updated += chunks_updated
-
-                    # Check for continuation token to handle pagination
-                    if "continuation" not in resp_data:
-                        break  # Exit loop if no continuation token
-
-                    if not resp_data["continuation"]:
-                        break  # Exit loop if continuation token is empty
-
-                    params = params.set("continuation", resp_data["continuation"])
-
-                logger.debug(
-                    f"VespaIndex.update_single: "
-                    f"index={index_name} "
-                    f"doc={normalized_doc_id} "
-                    f"chunks_updated={total_chunks_updated}"
+                with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+                    multipass_config = get_multipass_config(
+                        db_session=db_session,
+                        primary_index=index_name == self.index_name,
+                    )
+                    large_chunks_enabled = multipass_config.enable_large_chunks
+
+                enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
+                    index_name=index_name,
+                    http_client=http_client,
+                    document_id=doc_id,
+                    previous_chunk_count=chunk_count,
+                    new_chunk_count=0,
                 )
 
-        return total_chunks_updated
+                doc_chunk_ids = get_document_chunk_ids(
+                    enriched_document_info_list=[enriched_doc_infos],
+                    tenant_id=tenant_id,
+                    large_chunks_enabled=large_chunks_enabled,
+                )
+
+                logger.info("UPDATING len(doc_chunk_ids)")
+
+                doc_chunk_count += len(doc_chunk_ids)
+
+                for doc_chunk_id in doc_chunk_ids:
+                    logger.info("UPDATING CHUNK")
+                    self.update_single_chunk(
+                        doc_chunk_id=doc_chunk_id,
+                        index_name=index_name,
+                        fields=fields,
+                        doc_id=doc_id,
+                    )
+
+        logger.info(f"UPDATED A TOTAL OF {doc_chunk_count} CHUNKS for {doc_id}")
+
+        return doc_chunk_count
 
-    def delete_single(self, doc_id: str) -> int:
-        """Possibly faster overall than the delete method due to using a single
-        delete call with a selection query."""
+    def delete_single(
+        self,
+        doc_id: str,
+        *,
+        tenant_id: str | None,
+        chunk_count: int | None,
+    ) -> int:
         total_chunks_deleted = 0
 
         # Vespa deletion is poorly documented ... luckily we found this
         # https://docs.vespa.ai/en/operations/batch-delete.html#example
 
         doc_id = replace_invalid_doc_id_characters(doc_id)
 
         # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@@ -642,53 +641,41 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)
 
-        with get_vespa_http_client(http2=False) as http_client:
+        with get_vespa_http_client(
+            http2=False
+        ) as http_client, concurrent.futures.ThreadPoolExecutor(
+            max_workers=NUM_THREADS
+        ) as executor:
             for index_name in index_names:
-                params = httpx.QueryParams(
-                    {
-                        "selection": f"{index_name}.document_id=='{doc_id}'",
-                        "cluster": DOCUMENT_INDEX_NAME,
-                    }
-                )
-
-                while True:
-                    try:
-                        vespa_url = (
-                            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}"
-                        )
-                        logger.debug(f'delete_single DELETE on URL "{vespa_url}"')
-                        resp = http_client.delete(
-                            vespa_url,
-                            params=params,
-                        )
-                        resp.raise_for_status()
-                    except httpx.HTTPStatusError as e:
-                        logger.error(
-                            f"Failed to delete chunk, details: {e.response.text}"
-                        )
-                        raise
-
-                    resp_data = resp.json()
-
-                    if "documentCount" in resp_data:
-                        chunks_deleted = resp_data["documentCount"]
-                        total_chunks_deleted += chunks_deleted
-
-                    # Check for continuation token to handle pagination
-                    if "continuation" not in resp_data:
-                        break  # Exit loop if no continuation token
-
-                    if not resp_data["continuation"]:
-                        break  # Exit loop if continuation token is empty
-
-                    params = params.set("continuation", resp_data["continuation"])
-
-                logger.debug(
-                    f"VespaIndex.delete_single: "
-                    f"index={index_name} "
-                    f"doc={doc_id} "
-                    f"chunks_deleted={total_chunks_deleted}"
+                with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+                    multipass_config = get_multipass_config(
+                        db_session=db_session,
+                        primary_index=index_name == self.index_name,
+                    )
+                    large_chunks_enabled = multipass_config.enable_large_chunks
+
+                enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
+                    index_name=index_name,
+                    http_client=http_client,
+                    document_id=doc_id,
+                    previous_chunk_count=chunk_count,
+                    new_chunk_count=0,
+                )
+                chunks_to_delete = get_document_chunk_ids(
+                    enriched_document_info_list=[enriched_doc_infos],
+                    tenant_id=tenant_id,
+                    large_chunks_enabled=large_chunks_enabled,
                 )
+                for doc_chunk_ids_batch in batch_generator(
+                    chunks_to_delete, BATCH_SIZE
+                ):
+                    total_chunks_deleted += len(doc_chunk_ids_batch)
+                    delete_vespa_chunks(
+                        doc_chunk_ids=doc_chunk_ids_batch,
+                        index_name=index_name,
+                        http_client=http_client,
+                        executor=executor,
+                    )
 
         return total_chunks_deleted
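The batched deletion above leans on a simple slicing helper (`batch_generator` in the repo). Assuming it behaves like the usual batching generator, a self-contained sketch of that pattern is:

```python
from collections.abc import Iterator
from typing import TypeVar

T = TypeVar("T")


def batch_generator(items: list[T], batch_size: int) -> Iterator[list[T]]:
    # Yield consecutive slices of at most batch_size items.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


# 7 chunk IDs deleted in batches of 3 -> batch sizes [3, 3, 1]
assert [len(b) for b in batch_generator(list(range(7)), 3)] == [3, 3, 1]
```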
@@ -787,8 +774,51 @@ class VespaIndex(DocumentIndex):
 
         return query_vespa(params)
 
+    # Retrieves chunk information for a document:
+    # - Determines the last indexed chunk
+    # - Identifies if the document uses the old or new chunk ID system
+    # This data is crucial for Vespa document updates without relying on the visit API.
     @classmethod
-    def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
+    def enrich_basic_chunk_info(
+        cls,
+        index_name: str,
+        http_client: httpx.Client,
+        document_id: str,
+        previous_chunk_count: int | None = None,
+        new_chunk_count: int = 0,
+    ) -> EnrichedDocumentIndexingInfo:
+        last_indexed_chunk = previous_chunk_count
+
+        # If the document has no `chunk_count` in the database, we know that it
+        # has the old chunk ID system and we must check for the final chunk index
+        is_old_version = False
+        if last_indexed_chunk is None:
+            is_old_version = True
+            minimal_doc_info = MinimalDocumentIndexingInfo(
+                doc_id=document_id, chunk_start_index=new_chunk_count
+            )
+            last_indexed_chunk = check_for_final_chunk_existence(
+                minimal_doc_info=minimal_doc_info,
+                start_index=new_chunk_count,
+                index_name=index_name,
+                http_client=http_client,
+            )
+
+        enriched_doc_info = EnrichedDocumentIndexingInfo(
+            doc_id=document_id,
+            chunk_start_index=new_chunk_count,
+            chunk_end_index=last_indexed_chunk,
+            old_version=is_old_version,
+        )
+        return enriched_doc_info
+
+    @classmethod
+    def delete_entries_by_tenant_id(
+        cls,
+        *,
+        tenant_id: str,
+        index_name: str,
+    ) -> None:
         """
         Deletes all entries in the specified index with the given tenant_id.
 
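When a document predates the `chunk_count` column, the code above falls back to probing Vespa for the first missing chunk index instead of consulting the database. A minimal sketch of that probe loop, where the existence check is a stand-in for the repo's `_does_doc_chunk_exist` HTTP call:

```python
from collections.abc import Callable


def find_first_missing_chunk(
    doc_id: str,
    start_index: int,
    chunk_exists: Callable[[str, int], bool],
) -> int:
    """Scan upward from start_index until a chunk ID no longer exists."""
    index = start_index
    while chunk_exists(doc_id, index):  # in the repo this is an HTTP existence check
        index += 1
    return index


# Pretend chunks 0..4 of "doc-1" are indexed: the probe stops at 5.
existing = {("doc-1", i) for i in range(5)}
assert find_first_missing_chunk("doc-1", 0, lambda d, i: (d, i) in existing) == 5
```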
@@ -7,10 +7,15 @@ from http import HTTPStatus
 
 import httpx
 from retry import retry
+from sqlalchemy.orm import Session
 
+from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
 from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
     get_experts_stores_representations,
 )
+from onyx.db.models import SearchSettings
+from onyx.db.search_settings import get_current_search_settings
+from onyx.db.search_settings import get_secondary_search_settings
 from onyx.document_index.document_index_utils import get_uuid_from_chunk
 from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
 from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
@@ -45,6 +50,8 @@ from onyx.document_index.vespa_constants import TENANT_ID
 from onyx.document_index.vespa_constants import TITLE
 from onyx.document_index.vespa_constants import TITLE_EMBEDDING
 from onyx.indexing.models import DocMetadataAwareIndexChunk
+from onyx.indexing.models import EmbeddingProvider
+from onyx.indexing.models import MultipassConfig
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
@@ -129,7 +136,9 @@ def _index_vespa_chunk(
     document = chunk.source_document
 
     # No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
 
     vespa_chunk_id = str(get_uuid_from_chunk(chunk))
 
     embeddings = chunk.embeddings
 
     embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
@@ -263,5 +272,49 @@ def check_for_final_chunk_existence(
         )
         if not _does_doc_chunk_exist(doc_chunk_id, index_name, http_client):
             return index
 
         index += 1
+
+
+def should_use_multipass(search_settings: SearchSettings | None) -> bool:
+    """
+    Determines whether multipass should be used based on the search settings
+    or the default config if settings are unavailable.
+    """
+    if search_settings is not None:
+        return search_settings.multipass_indexing
+    return ENABLE_MULTIPASS_INDEXING
+
+
+def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool:
+    """
+    Given multipass usage and an embedder, decides whether large chunks are allowed
+    based on model/provider constraints.
+    """
+    # Only local models that support a larger context are from Nomic
+    # Cohere does not support larger contexts (they recommend not going above ~512 tokens)
+    return (
+        multipass
+        and search_settings.model_name.startswith("nomic-ai")
+        and search_settings.provider_type != EmbeddingProvider.COHERE
+    )
+
+
+def get_multipass_config(
+    db_session: Session, primary_index: bool = True
+) -> MultipassConfig:
+    """
+    Determines whether to enable multipass and large chunks by examining
+    the current search settings and the embedder configuration.
+    """
+    search_settings = (
+        get_current_search_settings(db_session)
+        if primary_index
+        else get_secondary_search_settings(db_session)
+    )
+    multipass = should_use_multipass(search_settings)
+    if not search_settings:
+        return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)
+    enable_large_chunks = can_use_large_chunks(multipass, search_settings)
+    return MultipassConfig(
+        multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
+    )
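To make the decision rule above concrete: large chunks are only enabled when multipass indexing is on and the embedder is a Nomic model not served through Cohere. A toy check of that rule with a stand-in settings object (not the repo's SearchSettings, whose provider field is an EmbeddingProvider enum):

```python
from dataclasses import dataclass


@dataclass
class FakeSettings:
    multipass_indexing: bool
    model_name: str
    provider_type: str | None  # stand-in for the EmbeddingProvider enum


def large_chunks_allowed(s: FakeSettings) -> bool:
    return (
        s.multipass_indexing
        and s.model_name.startswith("nomic-ai")
        and s.provider_type != "cohere"
    )


assert large_chunks_allowed(FakeSettings(True, "nomic-ai/nomic-embed-text-v1", None))
assert not large_chunks_allowed(FakeSettings(True, "intfloat/e5-base-v2", None))
assert not large_chunks_allowed(FakeSettings(False, "nomic-ai/nomic-embed-text-v1", None))
```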
@@ -11,7 +11,6 @@ from sqlalchemy.orm import Session
 
 from onyx.access.access import get_access_for_documents
 from onyx.access.models import DocumentAccess
-from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
 from onyx.configs.app_configs import INDEXING_EXCEPTION_LIMIT
 from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
 from onyx.configs.constants import DEFAULT_BOOST
@@ -31,12 +30,14 @@ from onyx.db.document import upsert_documents
 from onyx.db.document_set import fetch_document_sets_for_documents
 from onyx.db.index_attempt import create_index_attempt_error
 from onyx.db.models import Document as DBDocument
-from onyx.db.search_settings import get_current_search_settings
 from onyx.db.tag import create_or_add_document_tag
 from onyx.db.tag import create_or_add_document_tag_list
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentMetadata
 from onyx.document_index.interfaces import IndexBatchParams
+from onyx.document_index.vespa.indexing_utils import (
+    get_multipass_config,
+)
 from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import IndexingEmbedder
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
@@ -44,7 +45,6 @@ from onyx.indexing.models import DocAwareChunk
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.utils.logger import setup_logger
 from onyx.utils.timing import log_function_time
-from shared_configs.enums import EmbeddingProvider
 
 logger = setup_logger()
 
@@ -479,28 +479,6 @@ def index_doc_batch(
     return result
 
 
-def check_enable_large_chunks_and_multipass(
-    embedder: IndexingEmbedder, db_session: Session
-) -> tuple[bool, bool]:
-    search_settings = get_current_search_settings(db_session)
-    multipass = (
-        search_settings.multipass_indexing
-        if search_settings
-        else ENABLE_MULTIPASS_INDEXING
-    )
-
-    enable_large_chunks = (
-        multipass
-        and
-        # Only local models that supports larger context are from Nomic
-        (embedder.model_name.startswith("nomic-ai"))
-        and
-        # Cohere does not support larger context they recommend not going above 512 tokens
-        embedder.provider_type != EmbeddingProvider.COHERE
-    )
-    return multipass, enable_large_chunks
-
-
 def build_indexing_pipeline(
     *,
     embedder: IndexingEmbedder,
@@ -513,14 +491,12 @@
     callback: IndexingHeartbeatInterface | None = None,
 ) -> IndexingPipelineProtocol:
     """Builds a pipeline which takes in a list (batch) of docs and indexes them."""
-    multipass, enable_large_chunks = check_enable_large_chunks_and_multipass(
-        embedder, db_session
-    )
+    multipass_config = get_multipass_config(db_session, primary_index=True)
 
     chunker = chunker or Chunker(
         tokenizer=embedder.embedding_model.tokenizer,
-        enable_multipass=multipass,
-        enable_large_chunks=enable_large_chunks,
+        enable_multipass=multipass_config.multipass_indexing,
+        enable_large_chunks=multipass_config.enable_large_chunks,
         # after every doc, update status in case there are a bunch of really long docs
         callback=callback,
     )
@@ -154,3 +154,8 @@ class IndexingSetting(EmbeddingModelDetail):
             index_name=search_settings.index_name,
             multipass_indexing=search_settings.multipass_indexing,
         )
+
+
+class MultipassConfig(BaseModel):
+    multipass_indexing: bool
+    enable_large_chunks: bool
@@ -5,6 +5,7 @@ import sys
 from sqlalchemy import delete
 from sqlalchemy.orm import Session
 
+from onyx.db.document import delete_documents_complete__no_commit
 from onyx.db.enums import ConnectorCredentialPairStatus
 
 # Modify sys.path
@@ -38,7 +39,6 @@ from onyx.db.engine import get_session_context_manager
 from onyx.document_index.factory import get_default_document_index
 from onyx.file_store.file_store import get_default_file_store
 from onyx.document_index.document_index_utils import get_both_index_names
-from onyx.db.document import delete_documents_complete__no_commit
 
 # pylint: enable=E402
 # flake8: noqa: E402
@@ -71,13 +71,16 @@ def _unsafe_deletion(
         if not documents:
             break
 
-        document_ids = [document.id for document in documents]
-        for doc_id in document_ids:
-            document_index.delete_single(doc_id)
+        for document in documents:
+            document_index.delete_single(
+                doc_id=document.id,
+                tenant_id=None,
+                chunk_count=document.chunk_count,
+            )
 
         delete_documents_complete__no_commit(
             db_session=db_session,
-            document_ids=document_ids,
+            document_ids=[document.id for document in documents],
         )
 
         num_docs_deleted += len(documents)
@@ -216,6 +219,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "connector_id", type=int, help="The ID of the connector to delete"
     )
+
     args = parser.parse_args()
     with get_session_context_manager() as db_session:
         _delete_connector(args.connector_id, db_session)
@@ -15,6 +15,7 @@ from onyx.db.engine import get_session_context_manager  # noqa: E402
 from onyx.db.document import delete_documents_complete__no_commit  # noqa: E402
 from onyx.db.search_settings import get_current_search_settings  # noqa: E402
 from onyx.document_index.vespa.index import VespaIndex  # noqa: E402
+from onyx.db.document import get_document  # noqa: E402
 
 BATCH_SIZE = 100
 
@@ -63,6 +64,9 @@ def main() -> None:
     with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
 
         def process_doc(doc_id: str) -> str | None:
+            document = get_document(doc_id, db_session)
+            if not document:
+                return None
             # Check if document exists in Vespa first
             try:
                 chunks = vespa_index.id_based_retrieval(
@@ -83,7 +87,9 @@ def main() -> None:
 
             try:
                 print(f"Deleting document {doc_id} in Vespa")
-                chunks_deleted = vespa_index.delete_single(doc_id)
+                chunks_deleted = vespa_index.delete_single(
+                    doc_id, tenant_id=None, chunk_count=document.chunk_count
+                )
                 if chunks_deleted > 0:
                     print(
                         f"Deleted {chunks_deleted} chunks for document {doc_id}"
New files:
backend/tests/__init__.py (new empty file)
backend/tests/integration/__init__.py (new empty file)
@@ -21,8 +21,9 @@ def _verify_document_permissions(
     group_names: list[str] | None = None,
     doc_creating_user: DATestUser | None = None,
 ) -> None:
-    acl_keys = set(retrieved_doc["access_control_list"].keys())
+    acl_keys = set(retrieved_doc.get("access_control_list", {}).keys())
+    print(f"ACL keys: {acl_keys}")
 
     if cc_pair.access_type == AccessType.PUBLIC:
         if "PUBLIC" not in acl_keys:
             raise ValueError(
@@ -42,8 +43,9 @@ def _verify_document_permissions(
         found_group_keys = {key for key in acl_keys if key.startswith("group:")}
         if found_group_keys != expected_group_keys:
             raise ValueError(
-                f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. Found: {found_group_keys}, \n"
-                f"Expected: {expected_group_keys}"
+                f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. "
+                f"Expected: {expected_group_keys} Found: {found_group_keys}\n"
+                f"All ACL keys: {acl_keys}"
             )
 
     if doc_set_names is not None:
@@ -153,9 +155,11 @@ class DocumentManager:
     ) -> None:
         doc_ids = [document.id for document in cc_pair.documents]
         retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
 
         retrieved_docs = {
             doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict
         }
 
         # Left this here for debugging purposes.
         # import json
         # for doc in retrieved_docs.values():