Mirror of https://github.com/danswer-ai/danswer.git
Each section is now cleaned before being chunked (#3210)

* Each section is now cleaned before being chunked
* k

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
@@ -14,6 +14,7 @@ from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
@@ -220,9 +221,20 @@ class Chunker:
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )
 
-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue
 
             section_token_count = len(self.tokenizer.tokenize(section_text))
 
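The new guard only drops a section when its cleaned text is empty and there is nothing to fall back on (no document title, or it is not the first section). Below is a minimal sketch of that condition using plain strings as hypothetical stand-ins for danswer's Document/Section objects, not the repo's own code:

# Illustrative sketch only; `sections` and `title` are stand-ins, not danswer types.
def sections_to_keep(sections: list[str], title: str | None) -> list[str]:
    kept = []
    for section_idx, section_text in enumerate(sections):
        # Mirrors the guard above: skip sections that are empty after cleaning,
        # unless this is the first section of a document that still has a title.
        if not section_text and (not title or section_idx > 0):
            continue
        kept.append(section_text)
    return kept

assert sections_to_keep(["", "body"], title="Doc title") == ["", "body"]
assert sections_to_keep(["", "body"], title=None) == ["body"]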
@@ -238,31 +250,26 @@ class Chunker:
             split_texts = self.chunk_splitter.split_text(section_text)
 
             for i, split_text in enumerate(split_texts):
-                split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                if STRICT_CHUNK_TOKEN_LIMIT:
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-                    if split_token_count > content_token_limit:
-                        # Further split the oversized chunk
-                        smaller_chunks = self._split_oversized_chunk(
-                            split_text, content_token_limit
-                        )
-                        for i, small_chunk in enumerate(smaller_chunks):
-                            chunks.append(
-                                _create_chunk(
-                                    text=small_chunk,
-                                    links={0: section_link_text},
-                                    is_continuation=(i != 0),
-                                )
-                            )
-                    else:
-                        chunks.append(
-                            _create_chunk(
-                                text=split_text,
-                                links={0: section_link_text},
-                            )
-                        )
-
+                if (
+                    STRICT_CHUNK_TOKEN_LIMIT
+                    and
+                    # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                    len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                ):
+                    # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                    # the token count of each split text to ensure it is
+                    # not larger than the content_token_limit
+                    smaller_chunks = self._split_oversized_chunk(
+                        split_text, content_token_limit
+                    )
+                    for i, small_chunk in enumerate(smaller_chunks):
+                        chunks.append(
+                            _create_chunk(
+                                text=small_chunk,
+                                links={0: section_link_text},
+                                is_continuation=(i != 0),
+                            )
+                        )
                 else:
                     chunks.append(
                         _create_chunk(
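With the reordered condition above, the tokenizer only runs on a split when STRICT_CHUNK_TOKEN_LIMIT is enabled, and only oversized splits are broken down further. As a rough illustration of that kind of token-window splitting (this is not the repo's _split_oversized_chunk, which uses the real tokenizer; whitespace tokens stand in here):

# Illustrative sketch, not danswer's implementation: break text into pieces of at
# most `token_limit` tokens, using whitespace splitting as a stand-in tokenizer.
def split_oversized_text(text: str, token_limit: int) -> list[str]:
    tokens = text.split()
    return [
        " ".join(tokens[i : i + token_limit])
        for i in range(0, len(tokens), token_limit)
    ]

assert split_oversized_text("one two three four five", token_limit=2) == [
    "one two",
    "three four",
    "five",
]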
@@ -354,6 +361,10 @@ class Chunker:
         return normal_chunks
 
     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
        final_chunks: list[DocAwareChunk] = []
        for document in documents:
            if self.callback:
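A brief usage sketch of the documented entry point; `chunker` and `documents` are assumed to already exist in the surrounding indexing pipeline and are not constructed here:

# Assumes `chunker: Chunker` and `documents: list[Document]` come from upstream code.
doc_aware_chunks = chunker.chunk(documents)
# Each resulting DocAwareChunk carries the per-document metadata needed later in indexing.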
@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
 
 
-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ class EmbeddingModel:
             for text in texts
         ]
 
-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
             if self.provider_type
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text
 
 
+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
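A quick, hedged illustration of what the relocated clean_text should do, based only on the Unicode ranges and the control-character filter added above (a hypothetical input, not a test from the repo):

from danswer.utils.text_processing import clean_text

# "\U0001F600" falls inside the filtered \U0001F000-\U0001F9FF (Emoticons) range and
# "\x07" (BEL) is a control character other than newline/tab, so both are stripped,
# while the newline and tab are preserved.
sample = "Hello \U0001F600 world\x07\n\tdone"
print(repr(clean_text(sample)))  # expected: 'Hello  world\n\tdone'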