diff --git a/backend/danswer/datastores/datastore_utils.py b/backend/danswer/datastores/datastore_utils.py
index 6121ad899..9359562ab 100644
--- a/backend/danswer/datastores/datastore_utils.py
+++ b/backend/danswer/datastores/datastore_utils.py
@@ -8,10 +8,7 @@
 from pydantic import BaseModel

 from danswer.chunking.models import EmbeddedIndexChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
-from danswer.configs.constants import ALLOWED_GROUPS
-from danswer.configs.constants import ALLOWED_USERS
 from danswer.configs.constants import PUBLIC_DOC_PAT
-from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
@@ -55,7 +52,7 @@
 T = TypeVar("T")


 def _add_if_not_exists(l: list[T], item: T) -> list[T]:
-    if item not in l:
+    if item in l:
         return l
     return l + [item]
diff --git a/backend/danswer/datastores/indexing_pipeline.py b/backend/danswer/datastores/indexing_pipeline.py
index d0bb79d83..c3a0be0d9 100644
--- a/backend/danswer/datastores/indexing_pipeline.py
+++ b/backend/danswer/datastores/indexing_pipeline.py
@@ -58,6 +58,9 @@ def _get_net_new_documents(
     net_new_documents = 0
     seen_documents: set[str] = set()
     for insertion_record in insertion_records:
+        if insertion_record.already_existed:
+            continue
+
         if insertion_record.document_id not in seen_documents:
             net_new_documents += 1
             seen_documents.add(insertion_record.document_id)
@@ -81,6 +84,7 @@ def _indexing_pipeline(
     keyword_store_insertion_records = keyword_index.index(
         chunks=chunks, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Keyword store insertion records: {keyword_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=keyword_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
@@ -94,6 +98,7 @@
     vector_store_insertion_records = vector_index.index(
         chunks=chunks_with_embeddings, index_attempt_metadata=index_attempt_metadata
     )
+    logger.debug(f"Vector store insertion records: {vector_store_insertion_records}")
     _upsert_insertion_records(
         insertion_records=vector_store_insertion_records,
         index_attempt_metadata=index_attempt_metadata,
diff --git a/backend/danswer/datastores/qdrant/indexing.py b/backend/danswer/datastores/qdrant/indexing.py
index 5e60d5650..5d0f445e1 100644
--- a/backend/danswer/datastores/qdrant/indexing.py
+++ b/backend/danswer/datastores/qdrant/indexing.py
@@ -111,6 +111,8 @@ def index_qdrant_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -130,6 +132,7 @@
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_qdrant_doc_chunks(document.id, collection, q_client)
+            already_existing_documents.add(document.id)

         for minichunk_ind, embedding in enumerate(chunk.embeddings):
             qdrant_id = str(get_uuid_from_chunk(chunk, minichunk_ind))
@@ -137,7 +140,7 @@
                 ChunkInsertionRecord(
                     document_id=document.id,
                     store_id=qdrant_id,
-                    already_existed=should_delete_doc,
+                    already_existed=document.id in already_existing_documents,
                 )
             )
             point_structs.append(
diff --git a/backend/danswer/datastores/typesense/store.py b/backend/danswer/datastores/typesense/store.py
index d47492e65..abd87d3df 100644
--- a/backend/danswer/datastores/typesense/store.py
+++ b/backend/danswer/datastores/typesense/store.py
@@ -128,6 +128,8 @@ def index_typesense_chunks(
     cross_connector_document_metadata_map: dict[
         str, CrossConnectorDocumentMetadata
     ] = {}
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
     for chunk in chunks:
         document = chunk.source_document
         (
@@ -147,13 +149,14 @@
         if should_delete_doc:
             # Processing the first chunk of the doc and the doc exists
             delete_typesense_doc_chunks(document.id, collection, ts_client)
+            already_existing_documents.add(document.id)

         typesense_id = str(get_uuid_from_chunk(chunk))
         insertion_records.append(
             ChunkInsertionRecord(
                 document_id=document.id,
                 store_id=typesense_id,
-                already_existed=should_delete_doc,
+                already_existed=document.id in already_existing_documents,
             )
         )
         new_documents.append(
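
Not part of the patch above: a minimal, self-contained Python sketch of the already_existed bookkeeping that the Qdrant and Typesense hunks introduce, using hypothetical stand-in types (Chunk, InsertionRecord) and a hypothetical existing_document_ids argument in place of the real Danswer classes and metadata lookups. The point of the change is that should_delete_doc is only true while the first chunk of a pre-existing document is being processed, so later chunks of the same document have to consult the already_existing_documents set to report already_existed correctly.

from dataclasses import dataclass


# Hypothetical, simplified stand-ins for the real chunk and record types.
@dataclass
class Chunk:
    document_id: str


@dataclass
class InsertionRecord:
    document_id: str
    already_existed: bool


def index_chunks(
    chunks: list[Chunk], existing_document_ids: set[str]
) -> list[InsertionRecord]:
    records: list[InsertionRecord] = []
    seen_documents: set[str] = set()  # docs whose first chunk has been handled this run
    already_existing_documents: set[str] = set()  # docs that existed BEFORE this run

    for chunk in chunks:
        doc_id = chunk.document_id
        first_chunk_of_doc = doc_id not in seen_documents
        seen_documents.add(doc_id)

        # Only the first chunk of a pre-existing doc triggers the delete of old chunks,
        # but every chunk of that doc should still report already_existed=True.
        should_delete_doc = first_chunk_of_doc and doc_id in existing_document_ids
        if should_delete_doc:
            already_existing_documents.add(doc_id)

        records.append(
            InsertionRecord(
                document_id=doc_id,
                already_existed=doc_id in already_existing_documents,
            )
        )
    return records


if __name__ == "__main__":
    chunks = [Chunk("a"), Chunk("a"), Chunk("b")]
    # Both "a" records report already_existed=True; the "b" record reports False.
    print(index_chunks(chunks, existing_document_ids={"a"}))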