Each section is now cleaned before being chunked (#3210)
* Each section is now cleaned before being chunked

* k

---------

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
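In effect, the chunker now runs each section's raw text through a cleaning pass before splitting it, and silently drops sections that end up empty, instead of cleaning per-provider at embedding time. A toy, self-contained sketch of that flow (ToySection, toy_clean and clean_then_chunk are invented stand-ins, not danswer APIs; the real cleaning regex is in the text_processing hunk further down):

import re
from dataclasses import dataclass

# Toy stand-in for the clean_text helper this commit adds: strip one emoji range.
def toy_clean(text: str) -> str:
    return re.sub("[\U0001F000-\U0001F9FF]+", "", text).strip()

@dataclass
class ToySection:
    text: str
    link: str | None = None

def clean_then_chunk(sections: list[ToySection]) -> list[str]:
    chunks: list[str] = []
    for section in sections:
        section_text = toy_clean(section.text)  # clean *before* chunking
        if not section_text:
            continue  # sections that are empty after cleaning are dropped
        chunks.append(section_text)  # the real code would split into token-limited chunks here
    return chunks

print(clean_then_chunk([ToySection("hello world"), ToySection("\U0001F600\U0001F600")]))
# ['hello world']  -- the emoji-only section is dropped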
@@ -14,6 +14,7 @@ from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT

@@ -220,9 +221,20 @@ class Chunker:
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )

-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue

             section_token_count = len(self.tokenizer.tokenize(section_text))
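The guard added above keeps an empty first section only when the document has a title (the title alone still gives the indexer something to work with); every other section that is blank after cleaning is skipped with a warning. A small self-contained sketch of just that predicate (has_useful_content is invented for illustration and is not part of the repo):

def has_useful_content(section_text: str, document_title: str | None, section_idx: int) -> bool:
    """Mirror of the guard in the diff: drop a section when its cleaned text is
    empty, unless it is the first section of a document that has a title."""
    if section_text:
        return True
    # Empty text: only the first section of a titled document survives,
    # because the title itself still provides indexable content.
    return bool(document_title) and section_idx == 0

assert has_useful_content("", "My Doc", 0) is True    # titled doc, empty first section kept
assert has_useful_content("", "My Doc", 3) is False   # later empty sections dropped
assert has_useful_content("", None, 0) is False       # untitled doc, empty section dropped
assert has_useful_content("some text", None, 5) is True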
@@ -238,31 +250,26 @@ class Chunker:
             split_texts = self.chunk_splitter.split_text(section_text)

             for i, split_text in enumerate(split_texts):
-                split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                if STRICT_CHUNK_TOKEN_LIMIT:
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-                    if split_token_count > content_token_limit:
-                        # Further split the oversized chunk
-                        smaller_chunks = self._split_oversized_chunk(
-                            split_text, content_token_limit
-                        )
-                        for i, small_chunk in enumerate(smaller_chunks):
-                            chunks.append(
-                                _create_chunk(
-                                    text=small_chunk,
-                                    links={0: section_link_text},
-                                    is_continuation=(i != 0),
-                                )
-                            )
-                    else:
+                if (
+                    STRICT_CHUNK_TOKEN_LIMIT
+                    and
+                    # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                    len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                ):
+                    # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                    # the token count of each split text to ensure it is
+                    # not larger than the content_token_limit
+                    smaller_chunks = self._split_oversized_chunk(
+                        split_text, content_token_limit
+                    )
+                    for i, small_chunk in enumerate(smaller_chunks):
                         chunks.append(
                             _create_chunk(
-                                text=split_text,
+                                text=small_chunk,
                                 links={0: section_link_text},
+                                is_continuation=(i != 0),
                             )
                         )
                 else:
                     chunks.append(
                         _create_chunk(
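The refactored branch above folds the strict-limit check into a single condition, so the tokenizer only runs when STRICT_CHUNK_TOKEN_LIMIT is enabled, and only oversized splits get broken down further. A rough, self-contained sketch of that control flow (toy_tokenize and toy_split_oversized are simplified stand-ins for BaseTokenizer.tokenize and Chunker._split_oversized_chunk, not the real implementations):

STRICT_CHUNK_TOKEN_LIMIT = True  # mirrors the shared_configs flag

def toy_tokenize(text: str) -> list[str]:
    # Stand-in for BaseTokenizer.tokenize; whitespace split is enough here.
    return text.split()

def toy_split_oversized(text: str, limit: int) -> list[str]:
    # Stand-in for _split_oversized_chunk: greedy split on token count.
    tokens = toy_tokenize(text)
    return [" ".join(tokens[i : i + limit]) for i in range(0, len(tokens), limit)]

def chunk_split_texts(split_texts: list[str], content_token_limit: int) -> list[tuple[str, bool]]:
    chunks: list[tuple[str, bool]] = []  # (text, is_continuation)
    for split_text in split_texts:
        if (
            STRICT_CHUNK_TOKEN_LIMIT
            # the tokenizer only runs when the strict limit is enabled
            and len(toy_tokenize(split_text)) > content_token_limit
        ):
            for i, small_chunk in enumerate(toy_split_oversized(split_text, content_token_limit)):
                chunks.append((small_chunk, i != 0))
        else:
            chunks.append((split_text, False))
    return chunks

print(chunk_split_texts(["one two three four five", "six"], content_token_limit=2))
# [('one two', False), ('three four', True), ('five', True), ('six', False)]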
@@ -354,6 +361,10 @@ class Chunker:
         return normal_chunks

     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
         final_chunks: list[DocAwareChunk] = []
         for document in documents:
             if self.callback:
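The new docstring states the contract: a list of documents in, a flat list of document-aware chunks out, with per-document metadata preserved on every chunk. A toy illustration of that shape (ToyDoc, ToyChunk and toy_chunk are invented for the sketch and are not the real Document/DocAwareChunk models):

from dataclasses import dataclass

@dataclass
class ToyDoc:
    doc_id: str
    sections: list[str]

@dataclass
class ToyChunk:
    source_document_id: str  # document metadata carried along for indexing
    chunk_id: int
    content: str

def toy_chunk(documents: list[ToyDoc]) -> list[ToyChunk]:
    final_chunks: list[ToyChunk] = []
    for document in documents:
        for idx, section in enumerate(document.sections):
            final_chunks.append(ToyChunk(document.doc_id, idx, section))
    return final_chunks

chunks = toy_chunk([ToyDoc("a", ["s1", "s2"]), ToyDoc("b", ["s3"])])
assert [c.source_document_id for c in chunks] == ["a", "a", "b"]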
@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")


-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ class EmbeddingModel:
             for text in texts
         ]

-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
             if self.provider_type
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text


+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
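Given the function body added above, a quick usage check of what clean_text drops versus keeps (the sample strings are illustrative):

# Quick check of the new helper's behavior.
from danswer.utils.text_processing import clean_text

assert clean_text("plain ascii") == "plain ascii"
assert clean_text("tabs\tand\nnewlines survive") == "tabs\tand\nnewlines survive"
assert clean_text("emoji gone \U0001F600!") == "emoji gone !"    # Emoticons range stripped
assert clean_text("arrow \u2192 removed") == "arrow  removed"    # Arrows range stripped
assert clean_text("bell\x07char dropped") == "bellchar dropped"  # control chars other than \n and \t removed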