mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-26 17:51:54 +01:00
Fix issue causing file connector to fail
This commit is contained in:
parent
4affc259a6
commit
6889152d81
@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
|
||||
tenant_id: str | None,
|
||||
large_chunk_id: int | None = None,
|
||||
) -> UUID:
|
||||
"""NOTE: be VERY careful about changing this function. If changed without a migration,
|
||||
this can cause deletion/update/insertion to function incorrectly."""
|
||||
doc_str = document_id
|
||||
|
||||
# Web parsing URL duplicate catching
|
||||
|
@ -346,6 +346,14 @@ class VespaIndex(DocumentIndex):
|
||||
# IMPORTANT: This must be done one index at a time, do not use secondary index here
|
||||
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]
|
||||
|
||||
# needed so the final DocumentInsertionRecord returned can have the original document ID
|
||||
new_document_id_to_original_document_id: dict[str, str] = {}
|
||||
for ind, chunk in enumerate(cleaned_chunks):
|
||||
old_chunk = chunks[ind]
|
||||
new_document_id_to_original_document_id[
|
||||
chunk.source_document.id
|
||||
] = old_chunk.source_document.id
|
||||
|
||||
existing_docs: set[str] = set()
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
@ -401,14 +409,14 @@ class VespaIndex(DocumentIndex):
|
||||
executor=executor,
|
||||
)
|
||||
|
||||
all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
|
||||
all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
|
||||
|
||||
return {
|
||||
DocumentInsertionRecord(
|
||||
document_id=doc_id,
|
||||
already_existed=doc_id in existing_docs,
|
||||
document_id=new_document_id_to_original_document_id[cleaned_doc_id],
|
||||
already_existed=cleaned_doc_id in existing_docs,
|
||||
)
|
||||
for doc_id in all_doc_ids
|
||||
for cleaned_doc_id in all_cleaned_doc_ids
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@ -541,7 +549,7 @@ class VespaIndex(DocumentIndex):
|
||||
time.monotonic() - update_start,
|
||||
)
|
||||
|
||||
def update_single_chunk(
|
||||
def _update_single_chunk(
|
||||
self,
|
||||
doc_chunk_id: UUID,
|
||||
index_name: str,
|
||||
@ -605,6 +613,8 @@ class VespaIndex(DocumentIndex):
|
||||
"""
|
||||
doc_chunk_count = 0
|
||||
|
||||
doc_id = replace_invalid_doc_id_characters(doc_id)
|
||||
|
||||
with self.httpx_client_context as httpx_client:
|
||||
for (
|
||||
index_name,
|
||||
@ -627,7 +637,7 @@ class VespaIndex(DocumentIndex):
|
||||
doc_chunk_count += len(doc_chunk_ids)
|
||||
|
||||
for doc_chunk_id in doc_chunk_ids:
|
||||
self.update_single_chunk(
|
||||
self._update_single_chunk(
|
||||
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
||||
)
|
||||
|
||||
@ -689,6 +699,18 @@ class VespaIndex(DocumentIndex):
|
||||
batch_retrieval: bool = False,
|
||||
get_large_chunks: bool = False,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
# make sure to use the vespa-afied document IDs
|
||||
chunk_requests = [
|
||||
VespaChunkRequest(
|
||||
document_id=replace_invalid_doc_id_characters(
|
||||
chunk_request.document_id
|
||||
),
|
||||
min_chunk_ind=chunk_request.min_chunk_ind,
|
||||
max_chunk_ind=chunk_request.max_chunk_ind,
|
||||
)
|
||||
for chunk_request in chunk_requests
|
||||
]
|
||||
|
||||
if batch_retrieval:
|
||||
return batch_search_api_retrieval(
|
||||
index_name=self.index_name,
|
||||
|
@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
|
||||
def clean_chunk_id_copy(
|
||||
chunk: DocMetadataAwareIndexChunk,
|
||||
) -> DocMetadataAwareIndexChunk:
|
||||
clean_chunk = chunk.copy(
|
||||
clean_chunk = chunk.model_copy(
|
||||
update={
|
||||
"source_document": chunk.source_document.copy(
|
||||
"source_document": chunk.source_document.model_copy(
|
||||
update={
|
||||
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
|
||||
}
|
||||
|
@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:
|
||||
|
||||
|
||||
def replace_invalid_doc_id_characters(text: str) -> str:
|
||||
"""Replaces invalid document ID characters in text."""
|
||||
"""Replaces invalid document ID characters in text.
|
||||
NOTE: this must be called at the start of every vespa-related operation or else we
|
||||
risk discrepancies -> silent failures on deletion/update/insertion."""
|
||||
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
|
||||
# and users only seem to be running into this error with single quotes
|
||||
return text.replace("'", "_")
|
||||
|
@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal";
|
||||
|
||||
import * as Yup from "yup";
|
||||
|
||||
// since the uploaded files are cleaned up after some period of time
|
||||
// re-indexing will not work for the file connector. Also, it would not
|
||||
// make sense to re-index, since the files will not have changed.
|
||||
const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File];
|
||||
|
||||
// synchronize these validations with the SQLAlchemy connector class until we have a
|
||||
// centralized schema for both frontend and backend
|
||||
const RefreshFrequencySchema = Yup.object().shape({
|
||||
@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) {
|
||||
|
||||
{ccPair.is_editable_for_current_user && (
|
||||
<div className="ml-auto flex gap-x-2">
|
||||
{!CONNECTOR_TYPES_THAT_CANT_REINDEX.includes(
|
||||
ccPair.connector.source
|
||||
) && (
|
||||
<ReIndexButton
|
||||
ccPairId={ccPair.id}
|
||||
connectorId={ccPair.connector.id}
|
||||
credentialId={ccPair.credential.id}
|
||||
isDisabled={
|
||||
ccPair.indexing ||
|
||||
ccPair.status === ConnectorCredentialPairStatus.PAUSED
|
||||
}
|
||||
isIndexing={ccPair.indexing}
|
||||
isDeleting={isDeleting}
|
||||
/>
|
||||
)}
|
||||
<ReIndexButton
|
||||
ccPairId={ccPair.id}
|
||||
connectorId={ccPair.connector.id}
|
||||
credentialId={ccPair.credential.id}
|
||||
isDisabled={
|
||||
ccPair.indexing ||
|
||||
ccPair.status === ConnectorCredentialPairStatus.PAUSED
|
||||
}
|
||||
isIndexing={ccPair.indexing}
|
||||
isDeleting={isDeleting}
|
||||
/>
|
||||
|
||||
{!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
|
||||
</div>
|
||||
)}
|
||||
|
Loading…
x
Reference in New Issue
Block a user