Mirror of https://github.com/danswer-ai/danswer.git
Fix count of docs for connector failure

parent be318433e3
commit e0cbd087f7
@@ -8,10 +8,7 @@ from pydantic import BaseModel
 from danswer.chunking.models import EmbeddedIndexChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.constants import ALLOWED_GROUPS
 from danswer.configs.constants import ALLOWED_USERS
 from danswer.configs.constants import PUBLIC_DOC_PAT
 from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
@@ -55,7 +52,7 @@ T = TypeVar("T")


 def _add_if_not_exists(l: list[T], item: T) -> list[T]:
-    if item not in l:
+    if item in l:
         return l
     return l + [item]
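This one-line change fixes an inverted condition: the old helper returned the list unchanged exactly when the item was not yet present, so new items were never appended (and duplicates were). A minimal standalone sketch of the before/after behavior, not the repo's test code:

from typing import TypeVar

T = TypeVar("T")

def add_if_not_exists_old(l: list[T], item: T) -> list[T]:
    if item not in l:  # bug: bails out exactly when the item is new
        return l
    return l + [item]

def add_if_not_exists_fixed(l: list[T], item: T) -> list[T]:
    if item in l:  # only skip the append for duplicates
        return l
    return l + [item]

print(add_if_not_exists_old(["a"], "b"))    # ['a'] -- "b" silently dropped
print(add_if_not_exists_old(["a"], "a"))    # ['a', 'a'] -- duplicate appended
print(add_if_not_exists_fixed(["a"], "b"))  # ['a', 'b']
print(add_if_not_exists_fixed(["a"], "a"))  # ['a']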
@@ -58,6 +58,9 @@ def _get_net_new_documents(
     net_new_documents = 0
     seen_documents: set[str] = set()
     for insertion_record in insertion_records:
+        if insertion_record.already_existed:
+            continue
+
         if insertion_record.document_id not in seen_documents:
             net_new_documents += 1
             seen_documents.add(insertion_record.document_id)
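This hunk is the heart of the count fix: insertion records whose chunks already existed in the store no longer inflate the net-new document count. A self-contained sketch of the logic; the InsertionRecord dataclass here is an illustrative stand-in for the repo's ChunkInsertionRecord:

from dataclasses import dataclass

@dataclass
class InsertionRecord:  # illustrative stand-in, not the repo's model
    document_id: str
    already_existed: bool

def get_net_new_documents(insertion_records: list[InsertionRecord]) -> int:
    net_new_documents = 0
    seen_documents: set[str] = set()
    for record in insertion_records:
        if record.already_existed:
            continue  # re-indexed documents are not "net new"
        if record.document_id not in seen_documents:
            net_new_documents += 1
            seen_documents.add(record.document_id)
    return net_new_documents

records = [
    InsertionRecord("doc-1", already_existed=True),   # re-indexed document
    InsertionRecord("doc-2", already_existed=False),  # new document, chunk 1
    InsertionRecord("doc-2", already_existed=False),  # new document, chunk 2
]
print(get_net_new_documents(records))  # 1 -- without the already_existed check this would be 2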
@@ -81,6 +84,7 @@ def _indexing_pipeline(
     keyword_store_insertion_records = keyword_index.index(
         chunks=chunks, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Keyword store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=keyword_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
@@ -94,6 +98,7 @@ def _indexing_pipeline(
     vector_store_insertion_records = vector_index.index(
         chunks=chunks_with_embeddings, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Vector store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=vector_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
@@ -111,6 +111,8 @@ def index_qdrant_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -130,6 +132,7 @@ def index_qdrant_chunks(
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_qdrant_doc_chunks(document.id, collection, q_client)
+            already_existing_documents.add(document.id)

         for minichunk_ind, embedding in enumerate(chunk.embeddings):
             qdrant_id = str(get_uuid_from_chunk(chunk, minichunk_ind))
@@ -137,7 +140,7 @@ def index_qdrant_chunks(
             ChunkInsertionRecord(
                 document_id=document.id,
                 store_id=qdrant_id,
-                already_existed=should_delete_doc,
+                already_existed=document.id in already_existing_documents,
             )
         )
         point_structs.append(
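In both Qdrant and (below) Typesense, already_existed was previously set from should_delete_doc, which is only true while processing the first chunk of a pre-existing document; every later chunk of that document was therefore recorded as new, which is what broke the count. Tracking document ids in a set makes the flag consistent across all of a document's chunks. A minimal sketch of the pattern, using illustrative stand-in types rather than the repo's models:

from dataclasses import dataclass

@dataclass
class Chunk:  # stand-in for the repo's chunk model
    document_id: str
    doc_exists_in_store: bool

@dataclass
class ChunkInsertionRecord:
    document_id: str
    already_existed: bool

def record_chunks(chunks: list[Chunk]) -> list[ChunkInsertionRecord]:
    # document ids of documents that existed BEFORE this indexing
    already_existing_documents: set[str] = set()
    records: list[ChunkInsertionRecord] = []
    for chunk in chunks:
        # plays the role of should_delete_doc: true only for the FIRST
        # chunk of a document that already exists in the store
        first_chunk_of_existing_doc = (
            chunk.doc_exists_in_store
            and chunk.document_id not in already_existing_documents
        )
        if first_chunk_of_existing_doc:
            already_existing_documents.add(chunk.document_id)
        records.append(
            ChunkInsertionRecord(
                document_id=chunk.document_id,
                # old behavior: already_existed=first_chunk_of_existing_doc,
                # which is False for chunks 2..n of an existing doc
                already_existed=chunk.document_id in already_existing_documents,
            )
        )
    return records

chunks = [Chunk("doc-1", True), Chunk("doc-1", True)]
print([r.already_existed for r in record_chunks(chunks)])  # [True, True]; old code gave [True, False]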
@@ -128,6 +128,8 @@ def index_typesense_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -147,13 +149,14 @@ def index_typesense_chunks(
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_typesense_doc_chunks(document.id, collection, ts_client)
+            already_existing_documents.add(document.id)

         typesense_id = str(get_uuid_from_chunk(chunk))
         insertion_records.append(
             ChunkInsertionRecord(
                 document_id=document.id,
                 store_id=typesense_id,
-                already_existed=should_delete_doc,
+                already_existed=document.id in already_existing_documents,
             )
         )
         new_documents.append(