mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-08 21:50:12 +02:00
fix NUL character (#3540)
This commit is contained in:
@ -260,6 +260,21 @@ def index_doc_batch_prepare(
|
|||||||
def filter_documents(document_batch: list[Document]) -> list[Document]:
|
def filter_documents(document_batch: list[Document]) -> list[Document]:
|
||||||
documents: list[Document] = []
|
documents: list[Document] = []
|
||||||
for document in document_batch:
|
for document in document_batch:
|
||||||
|
# Remove any NUL characters from title/semantic_id
|
||||||
|
# This is a known issue with the Zendesk connector
|
||||||
|
# Postgres cannot handle NUL characters in text fields
|
||||||
|
if document.title:
|
||||||
|
document.title = document.title.replace("\x00", "")
|
||||||
|
if document.semantic_identifier:
|
||||||
|
document.semantic_identifier = document.semantic_identifier.replace(
|
||||||
|
"\x00", ""
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove NUL characters from all sections
|
||||||
|
for section in document.sections:
|
||||||
|
if section.text is not None:
|
||||||
|
section.text = section.text.replace("\x00", "")
|
||||||
|
|
||||||
empty_contents = not any(section.text.strip() for section in document.sections)
|
empty_contents = not any(section.text.strip() for section in document.sections)
|
||||||
if (
|
if (
|
||||||
(not document.title or not document.title.strip())
|
(not document.title or not document.title.strip())
|
||||||
|
Reference in New Issue
Block a user