From 02a6677e21db7958454444d2c6845ec01dd508bd Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 1 May 2023 18:11:16 -0700
Subject: [PATCH] DAN-1 Dedupe index (#6)

---
 backend/danswer/connectors/models.py          | 2 +-
 backend/danswer/datastores/qdrant/indexing.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py
index 137b32d21..1b1d562da 100644
--- a/backend/danswer/connectors/models.py
+++ b/backend/danswer/connectors/models.py
@@ -12,7 +12,7 @@ class Section:
 
 @dataclass
 class Document:
-    id: str
+    id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
     sections: list[Section]
     source: DocumentSource
     metadata: dict[str, Any]
diff --git a/backend/danswer/datastores/qdrant/indexing.py b/backend/danswer/datastores/qdrant/indexing.py
index b1d3753b0..25651a216 100644
--- a/backend/danswer/datastores/qdrant/indexing.py
+++ b/backend/danswer/datastores/qdrant/indexing.py
@@ -6,7 +6,6 @@ from danswer.configs.constants import ALLOWED_USERS
 from danswer.configs.constants import CHUNK_ID
 from danswer.configs.constants import CONTENT
 from danswer.configs.constants import DOCUMENT_ID
-from danswer.configs.constants import DocumentSource
 from danswer.configs.constants import SECTION_CONTINUATION
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
@@ -34,6 +33,11 @@ def recreate_collection(collection_name: str, embedding_dim: int = DOC_EMBEDDING
         raise RuntimeError("Could not create Qdrant collection")
 
 
+def get_uuid_from_chunk(chunk: EmbeddedIndexChunk) -> uuid.UUID:
+    unique_identifier_string = "_".join([chunk.source_document.id, str(chunk.chunk_id)])
+    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+
+
 def index_chunks(
     chunks: list[EmbeddedIndexChunk],
     collection: str,
@@ -48,7 +52,7 @@ def index_chunks(
         document = chunk.source_document
         point_structs.append(
             PointStruct(
-                id=str(uuid.uuid4()),
+                id=str(get_uuid_from_chunk(chunk)),
                 payload={
                     DOCUMENT_ID: document.id,
                     CHUNK_ID: chunk.chunk_id,