mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-09 20:55:06 +02:00
Skip Null Docs (#1917)
This commit is contained in:
@@ -124,6 +124,19 @@ def index_doc_batch(
|
||||
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
|
||||
Note that the documents should already be batched at this point so that it does not inflate the
|
||||
memory requirements"""
|
||||
# Skip documents that have neither title nor content
|
||||
documents_to_process = []
|
||||
for document in documents:
|
||||
if not document.title and not any(
|
||||
section.text.strip() for section in document.sections
|
||||
):
|
||||
logger.warning(
|
||||
f"Skipping document with ID {document.id} as it has neither title nor content"
|
||||
)
|
||||
else:
|
||||
documents_to_process.append(document)
|
||||
documents = documents_to_process
|
||||
|
||||
document_ids = [document.id for document in documents]
|
||||
db_docs = get_documents_by_ids(
|
||||
document_ids=document_ids,
|
||||
|
Reference in New Issue
Block a user