mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-07 19:38:19 +02:00
DAN-1 Dedupe index (#6)
This commit is contained in:
parent
213f29fde5
commit
02a6677e21
@ -12,7 +12,7 @@ class Section:
|
||||
|
||||
@dataclass
|
||||
class Document:
|
||||
id: str
|
||||
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
|
||||
sections: list[Section]
|
||||
source: DocumentSource
|
||||
metadata: dict[str, Any]
|
||||
|
@ -6,7 +6,6 @@ from danswer.configs.constants import ALLOWED_USERS
|
||||
from danswer.configs.constants import CHUNK_ID
|
||||
from danswer.configs.constants import CONTENT
|
||||
from danswer.configs.constants import DOCUMENT_ID
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import SECTION_CONTINUATION
|
||||
from danswer.configs.constants import SOURCE_LINKS
|
||||
from danswer.configs.constants import SOURCE_TYPE
|
||||
@ -34,6 +33,11 @@ def recreate_collection(collection_name: str, embedding_dim: int = DOC_EMBEDDING
|
||||
raise RuntimeError("Could not create Qdrant collection")
|
||||
|
||||
|
||||
def get_uuid_from_chunk(chunk: EmbeddedIndexChunk) -> uuid.UUID:
    """Return a deterministic UUID identifying ``chunk``.

    The UUID is a version-5 (name-based) UUID computed under
    ``uuid.NAMESPACE_X500`` from the string
    ``"<chunk.source_document.id>_<chunk.chunk_id>"``. The same document id
    and chunk id therefore always map to the same UUID.
    """
    # Build the name from the owning document's id plus the chunk's id.
    chunk_key = chunk.source_document.id + "_" + str(chunk.chunk_id)
    return uuid.uuid5(uuid.NAMESPACE_X500, chunk_key)
|
||||
|
||||
|
||||
def index_chunks(
|
||||
chunks: list[EmbeddedIndexChunk],
|
||||
collection: str,
|
||||
@ -48,7 +52,7 @@ def index_chunks(
|
||||
document = chunk.source_document
|
||||
point_structs.append(
|
||||
PointStruct(
|
||||
id=str(uuid.uuid4()),
|
||||
id=str(get_uuid_from_chunk(chunk)),
|
||||
payload={
|
||||
DOCUMENT_ID: document.id,
|
||||
CHUNK_ID: chunk.chunk_id,
|
||||
|
Loading…
x
Reference in New Issue
Block a user