mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-26 11:58:28 +02:00
Fix issue causing file connector to fail
This commit is contained in:
@@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
|
|||||||
tenant_id: str | None,
|
tenant_id: str | None,
|
||||||
large_chunk_id: int | None = None,
|
large_chunk_id: int | None = None,
|
||||||
) -> UUID:
|
) -> UUID:
|
||||||
|
"""NOTE: be VERY careful about changing this function. If changed without a migration,
|
||||||
|
this can cause deletion/update/insertion to function incorrectly."""
|
||||||
doc_str = document_id
|
doc_str = document_id
|
||||||
|
|
||||||
# Web parsing URL duplicate catching
|
# Web parsing URL duplicate catching
|
||||||
|
@@ -346,6 +346,14 @@ class VespaIndex(DocumentIndex):
|
|||||||
# IMPORTANT: This must be done one index at a time, do not use secondary index here
|
# IMPORTANT: This must be done one index at a time, do not use secondary index here
|
||||||
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]
|
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]
|
||||||
|
|
||||||
|
# needed so the final DocumentInsertionRecord returned can have the original document ID
|
||||||
|
new_document_id_to_original_document_id: dict[str, str] = {}
|
||||||
|
for ind, chunk in enumerate(cleaned_chunks):
|
||||||
|
old_chunk = chunks[ind]
|
||||||
|
new_document_id_to_original_document_id[
|
||||||
|
chunk.source_document.id
|
||||||
|
] = old_chunk.source_document.id
|
||||||
|
|
||||||
existing_docs: set[str] = set()
|
existing_docs: set[str] = set()
|
||||||
|
|
||||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||||
@@ -401,14 +409,14 @@ class VespaIndex(DocumentIndex):
|
|||||||
executor=executor,
|
executor=executor,
|
||||||
)
|
)
|
||||||
|
|
||||||
all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
|
all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
DocumentInsertionRecord(
|
DocumentInsertionRecord(
|
||||||
document_id=doc_id,
|
document_id=new_document_id_to_original_document_id[cleaned_doc_id],
|
||||||
already_existed=doc_id in existing_docs,
|
already_existed=cleaned_doc_id in existing_docs,
|
||||||
)
|
)
|
||||||
for doc_id in all_doc_ids
|
for cleaned_doc_id in all_cleaned_doc_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -541,7 +549,7 @@ class VespaIndex(DocumentIndex):
|
|||||||
time.monotonic() - update_start,
|
time.monotonic() - update_start,
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_single_chunk(
|
def _update_single_chunk(
|
||||||
self,
|
self,
|
||||||
doc_chunk_id: UUID,
|
doc_chunk_id: UUID,
|
||||||
index_name: str,
|
index_name: str,
|
||||||
@@ -605,6 +613,8 @@ class VespaIndex(DocumentIndex):
|
|||||||
"""
|
"""
|
||||||
doc_chunk_count = 0
|
doc_chunk_count = 0
|
||||||
|
|
||||||
|
doc_id = replace_invalid_doc_id_characters(doc_id)
|
||||||
|
|
||||||
with self.httpx_client_context as httpx_client:
|
with self.httpx_client_context as httpx_client:
|
||||||
for (
|
for (
|
||||||
index_name,
|
index_name,
|
||||||
@@ -627,7 +637,7 @@ class VespaIndex(DocumentIndex):
|
|||||||
doc_chunk_count += len(doc_chunk_ids)
|
doc_chunk_count += len(doc_chunk_ids)
|
||||||
|
|
||||||
for doc_chunk_id in doc_chunk_ids:
|
for doc_chunk_id in doc_chunk_ids:
|
||||||
self.update_single_chunk(
|
self._update_single_chunk(
|
||||||
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -689,6 +699,18 @@ class VespaIndex(DocumentIndex):
|
|||||||
batch_retrieval: bool = False,
|
batch_retrieval: bool = False,
|
||||||
get_large_chunks: bool = False,
|
get_large_chunks: bool = False,
|
||||||
) -> list[InferenceChunkUncleaned]:
|
) -> list[InferenceChunkUncleaned]:
|
||||||
|
# make sure to use the vespa-afied document IDs
|
||||||
|
chunk_requests = [
|
||||||
|
VespaChunkRequest(
|
||||||
|
document_id=replace_invalid_doc_id_characters(
|
||||||
|
chunk_request.document_id
|
||||||
|
),
|
||||||
|
min_chunk_ind=chunk_request.min_chunk_ind,
|
||||||
|
max_chunk_ind=chunk_request.max_chunk_ind,
|
||||||
|
)
|
||||||
|
for chunk_request in chunk_requests
|
||||||
|
]
|
||||||
|
|
||||||
if batch_retrieval:
|
if batch_retrieval:
|
||||||
return batch_search_api_retrieval(
|
return batch_search_api_retrieval(
|
||||||
index_name=self.index_name,
|
index_name=self.index_name,
|
||||||
|
@@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
|
|||||||
def clean_chunk_id_copy(
|
def clean_chunk_id_copy(
|
||||||
chunk: DocMetadataAwareIndexChunk,
|
chunk: DocMetadataAwareIndexChunk,
|
||||||
) -> DocMetadataAwareIndexChunk:
|
) -> DocMetadataAwareIndexChunk:
|
||||||
clean_chunk = chunk.copy(
|
clean_chunk = chunk.model_copy(
|
||||||
update={
|
update={
|
||||||
"source_document": chunk.source_document.copy(
|
"source_document": chunk.source_document.model_copy(
|
||||||
update={
|
update={
|
||||||
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
|
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
|
||||||
}
|
}
|
||||||
|
@@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def replace_invalid_doc_id_characters(text: str) -> str:
|
def replace_invalid_doc_id_characters(text: str) -> str:
|
||||||
"""Replaces invalid document ID characters in text."""
|
"""Replaces invalid document ID characters in text.
|
||||||
|
NOTE: this must be called at the start of every vespa-related operation or else we
|
||||||
|
risk discrepancies -> silent failures on deletion/update/insertion."""
|
||||||
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
|
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
|
||||||
# and users only seem to be running into this error with single quotes
|
# and users only seem to be running into this error with single quotes
|
||||||
return text.replace("'", "_")
|
return text.replace("'", "_")
|
||||||
|
@@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal";
|
|||||||
|
|
||||||
import * as Yup from "yup";
|
import * as Yup from "yup";
|
||||||
|
|
||||||
// since the uploaded files are cleaned up after some period of time
|
|
||||||
// re-indexing will not work for the file connector. Also, it would not
|
|
||||||
// make sense to re-index, since the files will not have changed.
|
|
||||||
const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File];
|
|
||||||
|
|
||||||
// synchronize these validations with the SQLAlchemy connector class until we have a
|
// synchronize these validations with the SQLAlchemy connector class until we have a
|
||||||
// centralized schema for both frontend and backend
|
// centralized schema for both frontend and backend
|
||||||
const RefreshFrequencySchema = Yup.object().shape({
|
const RefreshFrequencySchema = Yup.object().shape({
|
||||||
@@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) {
|
|||||||
|
|
||||||
{ccPair.is_editable_for_current_user && (
|
{ccPair.is_editable_for_current_user && (
|
||||||
<div className="ml-auto flex gap-x-2">
|
<div className="ml-auto flex gap-x-2">
|
||||||
{!CONNECTOR_TYPES_THAT_CANT_REINDEX.includes(
|
<ReIndexButton
|
||||||
ccPair.connector.source
|
ccPairId={ccPair.id}
|
||||||
) && (
|
connectorId={ccPair.connector.id}
|
||||||
<ReIndexButton
|
credentialId={ccPair.credential.id}
|
||||||
ccPairId={ccPair.id}
|
isDisabled={
|
||||||
connectorId={ccPair.connector.id}
|
ccPair.indexing ||
|
||||||
credentialId={ccPair.credential.id}
|
ccPair.status === ConnectorCredentialPairStatus.PAUSED
|
||||||
isDisabled={
|
}
|
||||||
ccPair.indexing ||
|
isIndexing={ccPair.indexing}
|
||||||
ccPair.status === ConnectorCredentialPairStatus.PAUSED
|
isDeleting={isDeleting}
|
||||||
}
|
/>
|
||||||
isIndexing={ccPair.indexing}
|
|
||||||
isDeleting={isDeleting}
|
|
||||||
/>
|
|
||||||
)}
|
|
||||||
{!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
|
{!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
Reference in New Issue
Block a user