Fix issue causing file connector to fail

Weves authored on 2025-02-04 19:38:57 -08:00; committed by Chris Weaver
parent 4affc259a6 · commit 6889152d81
5 changed files with 47 additions and 29 deletions
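
What the diffs below suggest as the root cause: document IDs are sanitized (single quotes replaced) before chunks are written to Vespa, but several update/delete/retrieval paths were still using the raw IDs, so documents whose IDs contain a quote, such as uploaded files with apostrophes in their names, could never be found again after insertion. A minimal sketch of the pre-fix failure mode, using a made-up document ID:

    # hedged sketch of the pre-fix mismatch; the ID below is hypothetical
    def replace_invalid_doc_id_characters(text: str) -> str:
        return text.replace("'", "_")

    original_id = "Bob's notes.txt"  # ID as the rest of the system knows it
    stored_id = replace_invalid_doc_id_characters(original_id)  # ID as Vespa stores it

    # an update/delete that skips sanitization looks up the raw ID and finds nothing
    assert stored_id != original_id  # -> silent no-op on update/delete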

View File

@@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
     tenant_id: str | None,
     large_chunk_id: int | None = None,
 ) -> UUID:
+    """NOTE: be VERY careful about changing this function. If changed without a migration,
+    this can cause deletion/update/insertion to function incorrectly."""
     doc_str = document_id
     # Web parsing URL duplicate catching
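
For context on why the docstring warning matters: the chunk UUID is derived deterministically from the chunk's identifying fields, so the same inputs must always hash to the same UUID or later deletes and updates will target IDs that no longer exist. A rough sketch of such a derivation (the uuid5 namespace and join format here are assumptions, not the repo's exact code):

    import uuid

    def chunk_uuid_sketch(
        document_id: str, chunk_id: int, tenant_id: str | None
    ) -> uuid.UUID:
        # deterministic: identical inputs always yield the identical UUID
        parts = [document_id, str(chunk_id)]
        if tenant_id is not None:
            parts.append(tenant_id)
        return uuid.uuid5(uuid.NAMESPACE_X500, "_".join(parts))

    # changing the derivation without migrating existing rows would orphan them
    assert chunk_uuid_sketch("doc-1", 0, None) == chunk_uuid_sketch("doc-1", 0, None)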

View File

@@ -346,6 +346,14 @@ class VespaIndex(DocumentIndex):
         # IMPORTANT: This must be done one index at a time, do not use secondary index here
         cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

+        # needed so the final DocumentInsertionRecord returned can have the original document ID
+        new_document_id_to_original_document_id: dict[str, str] = {}
+        for ind, chunk in enumerate(cleaned_chunks):
+            old_chunk = chunks[ind]
+            new_document_id_to_original_document_id[
+                chunk.source_document.id
+            ] = old_chunk.source_document.id
+
         existing_docs: set[str] = set()

         # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@@ -401,14 +409,14 @@ class VespaIndex(DocumentIndex):
                 executor=executor,
             )

-        all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
+        all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}

         return {
             DocumentInsertionRecord(
-                document_id=doc_id,
-                already_existed=doc_id in existing_docs,
+                document_id=new_document_id_to_original_document_id[cleaned_doc_id],
+                already_existed=cleaned_doc_id in existing_docs,
             )
-            for doc_id in all_doc_ids
+            for cleaned_doc_id in all_cleaned_doc_ids
         }

     @classmethod
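
The reverse mapping built in the first hunk is consumed here: clean_chunk_id_copy rewrites document IDs before indexing, so the IDs flowing out of the indexing loop are the cleaned ones, while DocumentInsertionRecord must report the IDs the rest of the system knows. A toy illustration with made-up IDs:

    # made-up IDs showing why the cleaned -> original mapping is needed
    original_ids = ["O'Brien report.pdf", "plain.txt"]
    cleaned_ids = [i.replace("'", "_") for i in original_ids]  # what Vespa sees

    new_to_original = dict(zip(cleaned_ids, original_ids))

    # records must carry the original IDs so callers can match them to the DB
    records = [new_to_original[c] for c in cleaned_ids]
    assert records[0] == "O'Brien report.pdf"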
@@ -541,7 +549,7 @@ class VespaIndex(DocumentIndex):
             time.monotonic() - update_start,
         )

-    def update_single_chunk(
+    def _update_single_chunk(
         self,
         doc_chunk_id: UUID,
         index_name: str,
@@ -605,6 +613,8 @@ class VespaIndex(DocumentIndex):
         """
         doc_chunk_count = 0
+        doc_id = replace_invalid_doc_id_characters(doc_id)
+
         with self.httpx_client_context as httpx_client:
             for (
                 index_name,
@@ -627,7 +637,7 @@ class VespaIndex(DocumentIndex):
                 doc_chunk_count += len(doc_chunk_ids)

                 for doc_chunk_id in doc_chunk_ids:
-                    self.update_single_chunk(
+                    self._update_single_chunk(
                         doc_chunk_id, index_name, fields, doc_id, httpx_client
                     )
@@ -689,6 +699,18 @@ class VespaIndex(DocumentIndex):
         batch_retrieval: bool = False,
         get_large_chunks: bool = False,
     ) -> list[InferenceChunkUncleaned]:
+        # make sure to use the vespa-fied document IDs
+        chunk_requests = [
+            VespaChunkRequest(
+                document_id=replace_invalid_doc_id_characters(
+                    chunk_request.document_id
+                ),
+                min_chunk_ind=chunk_request.min_chunk_ind,
+                max_chunk_ind=chunk_request.max_chunk_ind,
+            )
+            for chunk_request in chunk_requests
+        ]
+
         if batch_retrieval:
             return batch_search_api_retrieval(
                 index_name=self.index_name,

View File

@@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
 def clean_chunk_id_copy(
     chunk: DocMetadataAwareIndexChunk,
 ) -> DocMetadataAwareIndexChunk:
-    clean_chunk = chunk.copy(
+    clean_chunk = chunk.model_copy(
         update={
-            "source_document": chunk.source_document.copy(
+            "source_document": chunk.source_document.model_copy(
                 update={
                     "id": replace_invalid_doc_id_characters(chunk.source_document.id)
                 }
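
The copy-to-model_copy rename tracks Pydantic v2, where BaseModel.copy() is deprecated in favor of model_copy() with the same update= semantics. A minimal standalone example:

    from pydantic import BaseModel

    class Doc(BaseModel):
        id: str

    doc = Doc(id="Bob's notes.txt")
    # Pydantic v2: model_copy() replaces the deprecated copy()
    clean = doc.model_copy(update={"id": doc.id.replace("'", "_")})
    assert clean.id == "Bob_s notes.txt" and doc.id == "Bob's notes.txt"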

View File

@@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:
 def replace_invalid_doc_id_characters(text: str) -> str:
-    """Replaces invalid document ID characters in text."""
+    """Replaces invalid document ID characters in text.
+    NOTE: this must be called at the start of every vespa-related operation or else we
+    risk discrepancies -> silent failures on deletion/update/insertion."""
     # There may be a more complete set of replacements that need to be made but Vespa docs are unclear
     # and users only seem to be running into this error with single quotes
     return text.replace("'", "_")
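
Worth noting: the replacement is one-way and lossy, so two distinct raw IDs can clean to the same Vespa ID, presumably an accepted trade-off given how narrow the replaced character set is. Applying the function on every path keeps reads and writes agreeing on the stored ID:

    def replace_invalid_doc_id_characters(text: str) -> str:
        return text.replace("'", "_")

    # sanitizing everywhere means write and read paths agree on the stored ID
    write_id = replace_invalid_doc_id_characters("Bob's notes.txt")
    read_id = replace_invalid_doc_id_characters("Bob's notes.txt")
    assert write_id == read_id == "Bob_s notes.txt"

    # lossy: a raw ID that already contains "_" in that spot would collide
    assert replace_invalid_doc_id_characters("Bob_s notes.txt") == write_id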

View File

@@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal";
 import * as Yup from "yup";

-// since the uploaded files are cleaned up after some period of time
-// re-indexing will not work for the file connector. Also, it would not
-// make sense to re-index, since the files will not have changed.
-const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File];
-
 // synchronize these validations with the SQLAlchemy connector class until we have a
 // centralized schema for both frontend and backend
 const RefreshFrequencySchema = Yup.object().shape({
@@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) {
         {ccPair.is_editable_for_current_user && (
           <div className="ml-auto flex gap-x-2">
-            {!CONNECTOR_TYPES_THAT_CANT_REINDEX.includes(
-              ccPair.connector.source
-            ) && (
-              <ReIndexButton
-                ccPairId={ccPair.id}
-                connectorId={ccPair.connector.id}
-                credentialId={ccPair.credential.id}
-                isDisabled={
-                  ccPair.indexing ||
-                  ccPair.status === ConnectorCredentialPairStatus.PAUSED
-                }
-                isIndexing={ccPair.indexing}
-                isDeleting={isDeleting}
-              />
-            )}
+            <ReIndexButton
+              ccPairId={ccPair.id}
+              connectorId={ccPair.connector.id}
+              credentialId={ccPair.credential.id}
+              isDisabled={
+                ccPair.indexing ||
+                ccPair.status === ConnectorCredentialPairStatus.PAUSED
+              }
+              isIndexing={ccPair.indexing}
+              isDeleting={isDeleting}
+            />
             {!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
           </div>
         )}