mirror of https://github.com/danswer-ai/danswer.git
parent e2700b2bbd
commit f4806da653
@@ -14,6 +14,7 @@ from onyx.configs.app_configs import POLL_CONNECTOR_OFFSET
 from onyx.configs.constants import MilestoneRecordType
 from onyx.connectors.connector_runner import ConnectorRunner
 from onyx.connectors.factory import instantiate_connector
+from onyx.connectors.models import Document
 from onyx.connectors.models import IndexAttemptMetadata
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
 from onyx.db.connector_credential_pair import get_last_successful_attempt_time
@@ -90,6 +91,35 @@ def _get_connector_runner(
     )
 
 
+def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
+    cleaned_batch = []
+    for doc in doc_batch:
+        cleaned_doc = doc.model_copy()
+
+        if "\x00" in cleaned_doc.id:
+            logger.warning(f"NUL characters found in document ID: {cleaned_doc.id}")
+            cleaned_doc.id = cleaned_doc.id.replace("\x00", "")
+
+        if "\x00" in cleaned_doc.semantic_identifier:
+            logger.warning(
+                f"NUL characters found in document semantic identifier: {cleaned_doc.semantic_identifier}"
+            )
+            cleaned_doc.semantic_identifier = cleaned_doc.semantic_identifier.replace(
+                "\x00", ""
+            )
+
+        for section in cleaned_doc.sections:
+            if section.link and "\x00" in section.link:
+                logger.warning(
+                    f"NUL characters found in document link for document: {cleaned_doc.id}"
+                )
+                section.link = section.link.replace("\x00", "")
+
+        cleaned_batch.append(cleaned_doc)
+
+    return cleaned_batch
+
+
 class ConnectorStopSignal(Exception):
     """A custom exception used to signal a stop in processing."""
 
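Note: as a rough, standalone sketch of what the new strip_null_characters helper does, the snippet below re-creates the same NUL-stripping logic against simplified stand-in pydantic models; the real Onyx Document and Section classes carry more fields than shown here.

from pydantic import BaseModel


class Section(BaseModel):
    link: str | None = None
    text: str = ""


class Document(BaseModel):
    id: str
    semantic_identifier: str
    sections: list[Section] = []


def strip_nuls(doc: Document) -> Document:
    # Same idea as strip_null_characters: copy the document, then drop NUL bytes
    # from the id, the semantic identifier, and every section link.
    cleaned = doc.model_copy()
    cleaned.id = cleaned.id.replace("\x00", "")
    cleaned.semantic_identifier = cleaned.semantic_identifier.replace("\x00", "")
    for section in cleaned.sections:
        if section.link:
            section.link = section.link.replace("\x00", "")
    return cleaned


doc = Document(
    id="doc\x001",
    semantic_identifier="My\x00Doc",
    sections=[Section(link="https://example.com/\x00page")],
)
print(strip_nuls(doc).id)  # -> "doc1"
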
@@ -238,7 +268,9 @@ def _run_indexing(
             )
 
             batch_description = []
-            for doc in doc_batch:
+
+            doc_batch_cleaned = strip_null_characters(doc_batch)
+            for doc in doc_batch_cleaned:
                 batch_description.append(doc.to_short_descriptor())
 
                 doc_size = 0
@@ -258,15 +290,15 @@ def _run_indexing(
 
             # real work happens here!
             new_docs, total_batch_chunks = indexing_pipeline(
-                document_batch=doc_batch,
+                document_batch=doc_batch_cleaned,
                 index_attempt_metadata=index_attempt_md,
             )
 
             batch_num += 1
             net_doc_change += new_docs
             chunk_count += total_batch_chunks
-            document_count += len(doc_batch)
-            all_connector_doc_ids.update(doc.id for doc in doc_batch)
+            document_count += len(doc_batch_cleaned)
+            all_connector_doc_ids.update(doc.id for doc in doc_batch_cleaned)
 
             # commit transaction so that the `update` below begins
             # with a brand new transaction. Postgres uses the start
@@ -276,7 +308,7 @@ def _run_indexing(
             db_session.commit()
 
             if callback:
-                callback.progress("_run_indexing", len(doc_batch))
+                callback.progress("_run_indexing", len(doc_batch_cleaned))
 
             # This new value is updated every batch, so UI can refresh per batch update
             update_docs_indexed(

@@ -54,6 +54,7 @@ from onyx.db.enums import IndexingStatus
 from onyx.db.enums import IndexModelStatus
 from onyx.db.enums import TaskStatus
 from onyx.db.pydantic_type import PydanticType
+from onyx.utils.logger import setup_logger
 from onyx.utils.special_types import JSON_ro
 from onyx.file_store.models import FileDescriptor
 from onyx.llm.override_models import LLMOverride
@@ -65,6 +66,8 @@ from onyx.utils.headers import HeaderItemDict
 from shared_configs.enums import EmbeddingProvider
 from shared_configs.enums import RerankerProvider
 
+logger = setup_logger()
+
 
 class Base(DeclarativeBase):
     __abstract__ = True
@@ -72,6 +75,8 @@ class Base(DeclarativeBase):
 
 class EncryptedString(TypeDecorator):
     impl = LargeBinary
+    # This type's behavior is fully deterministic and doesn't depend on any external factors.
+    cache_ok = True
 
     def process_bind_param(self, value: str | None, dialect: Dialect) -> bytes | None:
         if value is not None:
@@ -86,6 +91,8 @@ class EncryptedString(TypeDecorator):
 
 class EncryptedJson(TypeDecorator):
     impl = LargeBinary
+    # This type's behavior is fully deterministic and doesn't depend on any external factors.
+    cache_ok = True
 
     def process_bind_param(self, value: dict | None, dialect: Dialect) -> bytes | None:
         if value is not None:
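Note: the cache_ok = True lines relate to SQLAlchemy 1.4+'s compiled-statement cache; a custom TypeDecorator that never declares cache_ok should trigger a warning and opt statements using it out of caching. A minimal illustration under that assumption (UncachedString and the item table are made up for this demo, not part of Onyx):

import warnings

from sqlalchemy import String, TypeDecorator, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class UncachedString(TypeDecorator):
    impl = String
    # cache_ok deliberately not set, to show the warning the commit avoids


class Base(DeclarativeBase):
    pass


class Item(Base):
    __tablename__ = "item"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(UncachedString)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with warnings.catch_warnings(record=True) as caught, Session(engine) as session:
    warnings.simplefilter("always")
    session.execute(select(Item).where(Item.name == "x")).all()
    print([str(w.message) for w in caught])  # should include a cache_ok-related SAWarning
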
@@ -102,6 +109,21 @@ class EncryptedJson(TypeDecorator):
         return value
 
 
+class NullFilteredString(TypeDecorator):
+    impl = String
+    # This type's behavior is fully deterministic and doesn't depend on any external factors.
+    cache_ok = True
+
+    def process_bind_param(self, value: str | None, dialect: Dialect) -> str | None:
+        if value is not None and "\x00" in value:
+            logger.warning(f"NUL characters found in value: {value}")
+            return value.replace("\x00", "")
+        return value
+
+    def process_result_value(self, value: str | None, dialect: Dialect) -> str | None:
+        return value
+
+
 """
 Auth/Authz (users, permissions, access) Tables
 """
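Note: a self-contained sketch of the NullFilteredString idea in action against an in-memory SQLite database; the Demo table is illustrative, not part of the Onyx schema (the real motivation is Postgres, which rejects NUL bytes in text values).

from sqlalchemy import String, TypeDecorator, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class NullFilteredString(TypeDecorator):
    impl = String
    cache_ok = True  # deterministic behavior, safe for SQLAlchemy's statement cache

    def process_bind_param(self, value, dialect):
        # Strip NUL bytes just before the value is bound into the SQL statement.
        if value is not None and "\x00" in value:
            return value.replace("\x00", "")
        return value


class Base(DeclarativeBase):
    pass


class Demo(Base):
    __tablename__ = "demo"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(NullFilteredString)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(Demo(name="bad\x00value"))
    session.commit()
    print(session.scalars(select(Demo.name)).one())  # -> "badvalue"
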
@@ -451,16 +473,16 @@ class Document(Base):
 
     # this should correspond to the ID of the document
     # (as is passed around in Onyx)
-    id: Mapped[str] = mapped_column(String, primary_key=True)
+    id: Mapped[str] = mapped_column(NullFilteredString, primary_key=True)
     from_ingestion_api: Mapped[bool] = mapped_column(
         Boolean, default=False, nullable=True
     )
     # 0 for neutral, positive for mostly endorse, negative for mostly reject
     boost: Mapped[int] = mapped_column(Integer, default=DEFAULT_BOOST)
     hidden: Mapped[bool] = mapped_column(Boolean, default=False)
-    semantic_id: Mapped[str] = mapped_column(String)
+    semantic_id: Mapped[str] = mapped_column(NullFilteredString)
     # First Section's link
-    link: Mapped[str | None] = mapped_column(String, nullable=True)
+    link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
 
     # The updated time is also used as a measure of the last successful state of the doc
     # pulled from the source (to help skip reindexing already updated docs in case of