Each section is now cleaned before being chunked (#3210)

* Each section is now cleaned before being chunked

* k

---------

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
Authored by hagen-danswer on 2024-11-22 11:06:19 -08:00, committed by GitHub
parent 129c8f8faf
commit 5dc07d4178
3 changed files with 55 additions and 50 deletions


@@ -14,6 +14,7 @@ from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
@@ -220,9 +221,20 @@ class Chunker:
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )

-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue

             section_token_count = len(self.tokenizer.tokenize(section_text))
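
Note: to see how the new skip condition behaves, here is a minimal standalone sketch. The FakeSection/FakeDocument stand-ins and the sample data are illustrative assumptions; only the boolean check mirrors the diff (the real code cleans with clean_text rather than strip).

from dataclasses import dataclass

@dataclass
class FakeSection:  # illustrative stand-in for a connector Section
    text: str
    link: str | None = None

@dataclass
class FakeDocument:  # illustrative stand-in for a Document
    title: str | None
    sections: list[FakeSection]

def kept_section_texts(document: FakeDocument) -> list[str]:
    kept: list[str] = []
    for section_idx, section in enumerate(document.sections):
        # The real code uses clean_text(); strip() is a simplified stand-in here.
        section_text = section.text.strip()
        # Mirror of the new condition: drop a section that is empty after cleaning,
        # unless it is the first section of a document that has a title.
        if not section_text and (not document.title or section_idx > 0):
            continue
        kept.append(section_text)
    return kept

doc = FakeDocument(
    title="Release notes",
    sections=[FakeSection(""), FakeSection("v1.2 fixes"), FakeSection("   ")],
)
print(kept_section_texts(doc))  # ['', 'v1.2 fixes'] - empty first section kept, later empty one dropped

The empty first section of a titled document is kept, presumably so that the title alone still produces a chunk, which matches the "not even the title" comment in the diff.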
@@ -238,31 +250,26 @@ class Chunker:
                 split_texts = self.chunk_splitter.split_text(section_text)

                 for i, split_text in enumerate(split_texts):
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                    if STRICT_CHUNK_TOKEN_LIMIT:
-                        split_token_count = len(self.tokenizer.tokenize(split_text))
-                        if split_token_count > content_token_limit:
-                            # Further split the oversized chunk
-                            smaller_chunks = self._split_oversized_chunk(
-                                split_text, content_token_limit
-                            )
-                            for i, small_chunk in enumerate(smaller_chunks):
-                                chunks.append(
-                                    _create_chunk(
-                                        text=small_chunk,
-                                        links={0: section_link_text},
-                                        is_continuation=(i != 0),
-                                    )
-                                )
-                        else:
-                            chunks.append(
-                                _create_chunk(
-                                    text=split_text,
-                                    links={0: section_link_text},
-                                )
-                            )
+                    if (
+                        STRICT_CHUNK_TOKEN_LIMIT
+                        and
+                        # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                        len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                    ):
+                        # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                        # the token count of each split text to ensure it is
+                        # not larger than the content_token_limit
+                        smaller_chunks = self._split_oversized_chunk(
+                            split_text, content_token_limit
+                        )
+                        for i, small_chunk in enumerate(smaller_chunks):
+                            chunks.append(
+                                _create_chunk(
+                                    text=small_chunk,
+                                    links={0: section_link_text},
+                                    is_continuation=(i != 0),
+                                )
+                            )
                     else:
                         chunks.append(
                             _create_chunk(
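
Note: the rewritten condition leans on Python's short-circuit `and`, so the tokenizer only runs when STRICT_CHUNK_TOKEN_LIMIT is enabled, and the old inner if/else collapses into the single else branch. A small sketch of the short-circuit behaviour follows; the call counter and the expensive_token_count helper are made up for illustration and are not part of the codebase.

calls = 0

def expensive_token_count(text: str) -> int:
    # Stand-in for self.tokenizer.tokenize(); counts how often it is invoked.
    global calls
    calls += 1
    return len(text.split())

STRICT_CHUNK_TOKEN_LIMIT = False
content_token_limit = 4

for split_text in ["a few words here", "many more words in this split text"]:
    if (
        STRICT_CHUNK_TOKEN_LIMIT
        # the tokenizer stand-in only runs when the flag is enabled
        and expensive_token_count(split_text) > content_token_limit
    ):
        print("would re-split:", split_text)
    else:
        print("kept as-is:", split_text)

print("tokenizer calls:", calls)  # 0 when STRICT_CHUNK_TOKEN_LIMIT is False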
@@ -354,6 +361,10 @@ class Chunker:
         return normal_chunks

     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
         final_chunks: list[DocAwareChunk] = []
         for document in documents:
             if self.callback:


@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")


-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ class EmbeddingModel:
             for text in texts
         ]

-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
             if self.provider_type


@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text


+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
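
Note: for a quick sense of what clean_text removes, here is a standalone demonstration. The regex and the control-character pass are copied from the function added above; the sample string is made up.

import re

_INITIAL_FILTER = re.compile(
    "["
    "\U0000FFF0-\U0000FFFF"  # Specials
    "\U0001F000-\U0001F9FF"  # Emoticons
    "\U00002000-\U0000206F"  # General Punctuation
    "\U00002190-\U000021FF"  # Arrows
    "\U00002700-\U000027BF"  # Dingbats
    "]+",
    flags=re.UNICODE,
)

def clean_text(text: str) -> str:
    # Strip the Unicode ranges above, then drop control characters except newline and tab.
    cleaned = _INITIAL_FILTER.sub("", text)
    return "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")

print(repr(clean_text("Done \u2705 \x00see\u200b notes\n")))
# 'Done  see notes\n'  (the check mark, NUL byte and zero-width space are removed)

Since every section now passes through this function before chunking, the OpenAI-specific clean_openai_text workaround removed from the embedding path above is no longer needed.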