mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
Skip Null Docs (#1917)
This commit is contained in:
@@ -124,6 +124,19 @@ def index_doc_batch(
|
|||||||
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
|
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
|
||||||
Note that the documents should already be batched at this point so that it does not inflate the
|
Note that the documents should already be batched at this point so that it does not inflate the
|
||||||
memory requirements"""
|
memory requirements"""
|
||||||
|
# Skip documents that have neither title nor content
|
||||||
|
documents_to_process = []
|
||||||
|
for document in documents:
|
||||||
|
if not document.title and not any(
|
||||||
|
section.text.strip() for section in document.sections
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"Skipping document with ID {document.id} as it has neither title nor content"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
documents_to_process.append(document)
|
||||||
|
documents = documents_to_process
|
||||||
|
|
||||||
document_ids = [document.id for document in documents]
|
document_ids = [document.id for document in documents]
|
||||||
db_docs = get_documents_by_ids(
|
db_docs = get_documents_by_ids(
|
||||||
document_ids=document_ids,
|
document_ids=document_ids,
|
||||||
|
Reference in New Issue
Block a user