Skip Null Docs (#1917)

This commit is contained in:
Yuhong Sun
2024-07-24 11:31:33 -07:00
committed by GitHub
parent 9651ea828b
commit 91be8e7bfb

View File

@@ -124,6 +124,19 @@ def index_doc_batch(
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents """Takes different pieces of the indexing pipeline and applies it to a batch of documents
Note that the documents should already be batched at this point so that it does not inflate the Note that the documents should already be batched at this point so that it does not inflate the
memory requirements""" memory requirements"""
# Skip documents that have neither title nor content
documents_to_process = []
for document in documents:
if not document.title and not any(
section.text.strip() for section in document.sections
):
logger.warning(
f"Skipping document with ID {document.id} as it has neither title nor content"
)
else:
documents_to_process.append(document)
documents = documents_to_process
document_ids = [document.id for document in documents] document_ids = [document.id for document in documents]
db_docs = get_documents_by_ids( db_docs = get_documents_by_ids(
document_ids=document_ids, document_ids=document_ids,