mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-29 11:12:02 +01:00
Ingestion API now always updates regardless of document updated_at (#786)
This commit is contained in:
parent
9b7069a043
commit
006fd4c438
@ -69,6 +69,7 @@ def _indexing_pipeline(
|
||||
document_index: DocumentIndex,
|
||||
documents: list[Document],
|
||||
index_attempt_metadata: IndexAttemptMetadata,
|
||||
ignore_time_skip: bool = False,
|
||||
) -> tuple[int, int]:
|
||||
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
|
||||
Note that the documents should already be batched at this point so that it does not inflate the
|
||||
@ -87,14 +88,18 @@ def _indexing_pipeline(
|
||||
}
|
||||
|
||||
updatable_docs: list[Document] = []
|
||||
for doc in documents:
|
||||
if (
|
||||
doc.id in id_update_time_map
|
||||
and doc.doc_updated_at
|
||||
and doc.doc_updated_at <= id_update_time_map[doc.id]
|
||||
):
|
||||
continue
|
||||
updatable_docs.append(doc)
|
||||
if ignore_time_skip:
|
||||
updatable_docs = documents
|
||||
else:
|
||||
for doc in documents:
|
||||
if (
|
||||
doc.id in id_update_time_map
|
||||
and doc.doc_updated_at
|
||||
and doc.doc_updated_at <= id_update_time_map[doc.id]
|
||||
):
|
||||
continue
|
||||
updatable_docs.append(doc)
|
||||
|
||||
updatable_ids = [doc.id for doc in updatable_docs]
|
||||
|
||||
# Acquires a lock on the documents so that no other process can modify them
|
||||
@ -175,6 +180,7 @@ def build_indexing_pipeline(
|
||||
chunker: Chunker | None = None,
|
||||
embedder: Embedder | None = None,
|
||||
document_index: DocumentIndex | None = None,
|
||||
ignore_time_skip: bool = False,
|
||||
) -> IndexingPipelineProtocol:
|
||||
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
|
||||
chunker = chunker or DefaultChunker()
|
||||
@ -188,4 +194,5 @@ def build_indexing_pipeline(
|
||||
chunker=chunker,
|
||||
embedder=embedder,
|
||||
document_index=document_index,
|
||||
ignore_time_skip=ignore_time_skip,
|
||||
)
|
||||
|
@ -141,7 +141,7 @@ def document_ingestion(
|
||||
if document.source == DocumentSource.INGESTION_API:
|
||||
document.source = DocumentSource.FILE
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline()
|
||||
indexing_pipeline = build_indexing_pipeline(ignore_time_skip=True)
|
||||
|
||||
new_doc, chunks = indexing_pipeline(
|
||||
documents=[document],
|
||||
|
Loading…
x
Reference in New Issue
Block a user