Fix count of docs for connector failure

Weves 2023-08-13 17:24:05 -07:00 committed by Chris Weaver
parent be318433e3
commit e0cbd087f7
4 changed files with 14 additions and 6 deletions

View File

@@ -8,10 +8,7 @@ from pydantic import BaseModel
 from danswer.chunking.models import EmbeddedIndexChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
-from danswer.configs.constants import ALLOWED_GROUPS
-from danswer.configs.constants import ALLOWED_USERS
-from danswer.configs.constants import PUBLIC_DOC_PAT
 from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
@@ -55,7 +52,7 @@ T = TypeVar("T")
 def _add_if_not_exists(l: list[T], item: T) -> list[T]:
-    if item not in l:
+    if item in l:
         return l
     return l + [item]
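
The hunk above inverts the guard in _add_if_not_exists: the old condition returned the list unchanged when the item was not yet present, so nothing was ever appended. A minimal standalone sketch of the corrected helper, with illustrative asserts that are not part of the commit:

    from typing import TypeVar

    T = TypeVar("T")

    def _add_if_not_exists(l: list[T], item: T) -> list[T]:
        # Item already present: return the list unchanged.
        if item in l:
            return l
        # Otherwise return a new list with the item appended.
        return l + [item]

    assert _add_if_not_exists(["a"], "b") == ["a", "b"]       # item appended
    assert _add_if_not_exists(["a", "b"], "b") == ["a", "b"]  # list unchanged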

View File

@@ -58,6 +58,9 @@ def _get_net_new_documents(
     net_new_documents = 0
     seen_documents: set[str] = set()
     for insertion_record in insertion_records:
+        if insertion_record.already_existed:
+            continue
+
         if insertion_record.document_id not in seen_documents:
             net_new_documents += 1
             seen_documents.add(insertion_record.document_id)
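
With the already_existed flag respected, documents that were in the index before this run no longer inflate the net-new count. A self-contained sketch of the counting logic in this hunk; InsertionRecord below is a simplified stand-in for the pipeline's actual record type, keeping only the two fields the function reads:

    from dataclasses import dataclass

    @dataclass
    class InsertionRecord:
        document_id: str
        already_existed: bool

    def get_net_new_documents(insertion_records: list[InsertionRecord]) -> int:
        net_new_documents = 0
        seen_documents: set[str] = set()
        for record in insertion_records:
            # Documents that existed before this indexing run are not net new.
            if record.already_existed:
                continue
            # Count each remaining document id once, even across many chunks.
            if record.document_id not in seen_documents:
                net_new_documents += 1
                seen_documents.add(record.document_id)
        return net_new_documents

    records = [
        InsertionRecord("doc-1", already_existed=False),
        InsertionRecord("doc-1", already_existed=False),  # second chunk of the same new doc
        InsertionRecord("doc-2", already_existed=True),   # re-indexed doc, not counted
    ]
    assert get_net_new_documents(records) == 1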
@@ -81,6 +84,7 @@ def _indexing_pipeline(
     keyword_store_insertion_records = keyword_index.index(
         chunks=chunks, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Keyword store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=keyword_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
@@ -94,6 +98,7 @@ def _indexing_pipeline(
     vector_store_insertion_records = vector_index.index(
         chunks=chunks_with_embeddings, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Vector store insertion records: {vector_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=vector_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,

View File

@@ -111,6 +111,8 @@ def index_qdrant_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -130,6 +132,7 @@
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_qdrant_doc_chunks(document.id, collection, q_client)
+            already_existing_documents.add(document.id)

         for minichunk_ind, embedding in enumerate(chunk.embeddings):
             qdrant_id = str(get_uuid_from_chunk(chunk, minichunk_ind))
@@ -137,7 +140,7 @@
                 ChunkInsertionRecord(
                     document_id=document.id,
                     store_id=qdrant_id,
-                    already_existed=should_delete_doc,
+                    already_existed=document.id in already_existing_documents,
                 )
             )
             point_structs.append(
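
Previously already_existed was taken straight from should_delete_doc, which (per the comment above) is True only while processing the first chunk of a document that already exists in the index; every later chunk of that same document was recorded as if it were new. Tracking the ids in already_existing_documents flags all chunks of a pre-existing document, which is what the corrected net-new count relies on. The Typesense store in the next file gets the same treatment. A small illustrative sketch of the difference, using made-up document ids:

    # "doc-1" existed before this run and produces three chunks.
    chunk_doc_ids = ["doc-1", "doc-1", "doc-1"]

    # Old bookkeeping: the flag mirrored should_delete_doc, which is True only
    # for the first chunk (the one that deletes the stale copies).
    old_flags = [True, False, False]

    # New bookkeeping: every chunk of a document recorded in
    # already_existing_documents is flagged, so a re-indexed document
    # can never look net new.
    already_existing_documents = {"doc-1"}
    new_flags = [doc_id in already_existing_documents for doc_id in chunk_doc_ids]
    assert new_flags == [True, True, True]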

View File

@@ -128,6 +128,8 @@ def index_typesense_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -147,13 +149,14 @@
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_typesense_doc_chunks(document.id, collection, ts_client)
+            already_existing_documents.add(document.id)

         typesense_id = str(get_uuid_from_chunk(chunk))
         insertion_records.append(
             ChunkInsertionRecord(
                 document_id=document.id,
                 store_id=typesense_id,
-                already_existed=should_delete_doc,
+                already_existed=document.id in already_existing_documents,
             )
         )
         new_documents.append(