Fix count of docs for connector failure

Weves
2023-08-13 17:24:05 -07:00
committed by Chris Weaver
parent be318433e3
commit e0cbd087f7
4 changed files with 14 additions and 6 deletions

View File

@@ -8,10 +8,7 @@ from pydantic import BaseModel
 from danswer.chunking.models import EmbeddedIndexChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
-from danswer.configs.constants import ALLOWED_GROUPS
-from danswer.configs.constants import ALLOWED_USERS
 from danswer.configs.constants import PUBLIC_DOC_PAT
-from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
@@ -55,7 +52,7 @@ T = TypeVar("T")
 def _add_if_not_exists(l: list[T], item: T) -> list[T]:
-    if item not in l:
+    if item in l:
         return l
     return l + [item]
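
The original condition was inverted: the helper returned the list unchanged when the item was missing and appended a duplicate when it was already present. A small standalone sketch of the corrected helper and its behavior; the asserts are illustrative, not the project's tests:

    from typing import TypeVar

    T = TypeVar("T")

    def _add_if_not_exists(l: list[T], item: T) -> list[T]:
        # corrected logic: bail out when the item is already present
        if item in l:
            return l
        return l + [item]

    assert _add_if_not_exists(["a"], "a") == ["a"]       # no duplicate appended
    assert _add_if_not_exists(["a"], "b") == ["a", "b"]  # new item appended
    # the old `if item not in l: return l` branch returned ["a"] unchanged
    # for "b" and produced ["a", "a"] for "a"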

View File

@@ -58,6 +58,9 @@ def _get_net_new_documents(
     net_new_documents = 0
     seen_documents: set[str] = set()
     for insertion_record in insertion_records:
+        if insertion_record.already_existed:
+            continue
         if insertion_record.document_id not in seen_documents:
             net_new_documents += 1
             seen_documents.add(insertion_record.document_id)
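
With the new already_existed check, documents that were already in the store before this run no longer inflate the net-new count; only first-time insertions are tallied, and each document at most once. A minimal sketch of the counting logic, using a stand-in InsertionRecord named tuple rather than the project's actual record class:

    from typing import NamedTuple

    class InsertionRecord(NamedTuple):  # stand-in for the real insertion record model
        document_id: str
        already_existed: bool

    def count_net_new(insertion_records: list[InsertionRecord]) -> int:
        net_new_documents = 0
        seen_documents: set[str] = set()
        for insertion_record in insertion_records:
            if insertion_record.already_existed:
                continue  # re-indexed documents are not net new
            if insertion_record.document_id not in seen_documents:
                net_new_documents += 1
                seen_documents.add(insertion_record.document_id)
        return net_new_documents

    records = [
        InsertionRecord("doc-1", already_existed=True),   # existed before this run
        InsertionRecord("doc-2", already_existed=False),  # new document, chunk 1
        InsertionRecord("doc-2", already_existed=False),  # new document, chunk 2
    ]
    assert count_net_new(records) == 1  # doc-1 excluded, doc-2 counted once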
@@ -81,6 +84,7 @@ def _indexing_pipeline(
     keyword_store_insertion_records = keyword_index.index(
         chunks=chunks, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Keyword store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=keyword_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
@@ -94,6 +98,7 @@ def _indexing_pipeline(
     vector_store_insertion_records = vector_index.index(
         chunks=chunks_with_embeddings, index_attempt_metadata=index_attempt_metadata
     )
logger.debug(f"Vector store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=vector_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,

View File

@@ -111,6 +111,8 @@ def index_qdrant_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -130,6 +132,7 @@
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_qdrant_doc_chunks(document.id, collection, q_client)
+            already_existing_documents.add(document.id)

         for minichunk_ind, embedding in enumerate(chunk.embeddings):
             qdrant_id = str(get_uuid_from_chunk(chunk, minichunk_ind))
@@ -137,7 +140,7 @@
                 ChunkInsertionRecord(
                     document_id=document.id,
                     store_id=qdrant_id,
-                    already_existed=should_delete_doc,
+                    already_existed=document.id in already_existing_documents,
                 )
             )
             point_structs.append(
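
should_delete_doc is only true while the first chunk of an already-indexed document is being processed, so copying it into already_existed left every later chunk of that document marked as new and overstated the net-new count. Remembering the document id in already_existing_documents keeps the flag consistent across all of the document's chunks. A simplified illustration of that bookkeeping, with a hypothetical Chunk stand-in rather than the real qdrant chunk model:

    from dataclasses import dataclass

    @dataclass
    class Chunk:  # hypothetical stand-in for the real chunk model
        document_id: str
        is_first_chunk_of_existing_doc: bool  # plays the role of should_delete_doc

    def mark_already_existed(chunks: list[Chunk]) -> list[bool]:
        already_existing_documents: set[str] = set()
        flags: list[bool] = []
        for chunk in chunks:
            if chunk.is_first_chunk_of_existing_doc:
                already_existing_documents.add(chunk.document_id)
            # every chunk of a pre-existing document is flagged, not just the first
            flags.append(chunk.document_id in already_existing_documents)
        return flags

    chunks = [
        Chunk("doc-1", True),   # first chunk of an existing doc triggers the delete
        Chunk("doc-1", False),  # later chunk: should_delete_doc is already False here
    ]
    assert mark_already_existed(chunks) == [True, True]

The typesense change in the last file applies the same pattern.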

View File

@@ -128,6 +128,8 @@ def index_typesense_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -147,13 +149,14 @@ def index_typesense_chunks(
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_typesense_doc_chunks(document.id, collection, ts_client)
+            already_existing_documents.add(document.id)

         typesense_id = str(get_uuid_from_chunk(chunk))
         insertion_records.append(
             ChunkInsertionRecord(
                 document_id=document.id,
                 store_id=typesense_id,
-                already_existed=should_delete_doc,
+                already_existed=document.id in already_existing_documents,
             )
         )
         new_documents.append(