# danswer/backend/danswer/indexing/chunker.py

import abc
from collections.abc import Callable
from typing import Optional
from typing import TYPE_CHECKING
from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import ENABLE_MINI_CHUNK
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from danswer.connectors.models import Document
from danswer.indexing.embedder import IndexingEmbedder
from danswer.indexing.models import DocAwareChunk
from danswer.natural_language_processing.utils import get_tokenizer
from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import shared_precompare_cleanup

if TYPE_CHECKING:
    from llama_index.text_splitter import SentenceSplitter  # type:ignore

# Overlaps are not supported: we need a clean partition of the document into chunks, and it is
# unclear whether overlapping chunks actually help quality at all.
CHUNK_OVERLAP = 0

# Fairly arbitrary numbers, but the general idea is that the title/metadata should not overwhelm
# the actual contents of the chunk. For example, in a rare case the metadata could take 128 tokens
# of a 512-token chunk and the title prefix another 128 tokens, leaving only 256 tokens for the
# actual contents.
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256
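
# Illustrative arithmetic (assuming the default 512-token embedding context): the semantic
# metadata suffix is dropped once it reaches 512 * 0.25 = 128 tokens, and if the title prefix
# plus the metadata suffix leave 256 or fewer tokens for content, chunk_document drops both and
# uses the full 512 tokens for content instead.
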
logger = setup_logger()
ChunkFunc = Callable[[Document], list[DocAwareChunk]]


def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
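    """Return the first segment produced by the sentence splitter, used as the blurb."""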
texts = blurb_splitter.split_text(text)
if not texts:
return ""
return texts[0]


def chunk_large_section(
section_text: str,
section_link_text: str,
document: Document,
start_chunk_id: int,
blurb: str,
chunk_splitter: "SentenceSplitter",
mini_chunk_splitter: Optional["SentenceSplitter"],
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
) -> list[DocAwareChunk]:
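    """Split a section that exceeds the chunk size into multiple chunks, marking all but the
    first as section continuations.
    """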
split_texts = chunk_splitter.split_text(section_text)
chunks = [
DocAwareChunk(
source_document=document,
chunk_id=start_chunk_id + chunk_ind,
blurb=blurb,
content=chunk_text,
source_links={0: section_link_text},
section_continuation=(chunk_ind != 0),
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=mini_chunk_splitter.split_text(chunk_text)
if mini_chunk_splitter and chunk_text.strip()
else None,
)
for chunk_ind, chunk_text in enumerate(split_texts)
]
return chunks


def _get_metadata_suffix_for_document_index(
metadata: dict[str, str | list[str]], include_separator: bool = False
) -> tuple[str, str]:
"""
Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
and a string of all of the values for the keyword search
For example, if we have the following metadata:
{
"author": "John Doe",
"space": "Engineering"
}
The vector embedding string should include the relation between the key and value wheres as for keyword we only want John Doe
and Engineering. The keys are repeat and much more noisy.
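
    Illustrative return values for the example above (assuming neither key is in
    get_metadata_keys_to_ignore() and include_separator is False):
        semantic:
            Metadata:
                author - John Doe
                space - Engineering
        keyword: "John Doe Engineering"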
"""
if not metadata:
return "", ""
metadata_str = "Metadata:\n"
metadata_values = []
for key, value in metadata.items():
if key in get_metadata_keys_to_ignore():
continue
value_str = ", ".join(value) if isinstance(value, list) else value
if isinstance(value, list):
metadata_values.extend(value)
else:
metadata_values.append(value)
metadata_str += f"\t{key} - {value_str}\n"
metadata_semantic = metadata_str.strip()
metadata_keyword = " ".join(metadata_values)
if include_separator:
return RETURN_SEPARATOR + metadata_semantic, RETURN_SEPARATOR + metadata_keyword
return metadata_semantic, metadata_keyword


def chunk_document(
document: Document,
embedder: IndexingEmbedder,
chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
subsection_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE, # Used for both title and content
include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
mini_chunk_size: int = MINI_CHUNK_SIZE,
enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
) -> list[DocAwareChunk]:
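    """Split a Document's sections into DocAwareChunks sized to the embedding context window,
    prepending the title and appending the metadata suffix when the token budget allows.
    """
    # Imported lazily (mirroring the TYPE_CHECKING import at the top of the module), presumably
    # to avoid paying the llama_index import cost until chunking is actually performed.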
from llama_index.text_splitter import SentenceSplitter
tokenizer = get_tokenizer(
model_name=embedder.model_name,
provider_type=embedder.provider_type,
)
blurb_splitter = SentenceSplitter(
tokenizer=tokenizer.tokenize, chunk_size=blurb_size, chunk_overlap=0
)
chunk_splitter = SentenceSplitter(
tokenizer=tokenizer.tokenize,
chunk_size=chunk_tok_size,
chunk_overlap=subsection_overlap,
)
mini_chunk_splitter = SentenceSplitter(
tokenizer=tokenizer.tokenize,
chunk_size=mini_chunk_size,
chunk_overlap=0,
)
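    # Mini-chunks are finer-grained splits (MINI_CHUNK_SIZE tokens) of each chunk, built only
    # when enable_mini_chunk is set; presumably they give the embedding model smaller passages
    # to match against.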
title = extract_blurb(document.get_title_for_document_index() or "", blurb_splitter)
title_prefix = title + RETURN_SEPARATOR if title else ""
title_tokens = len(tokenizer.tokenize(title_prefix))
metadata_suffix_semantic = ""
metadata_suffix_keyword = ""
metadata_tokens = 0
if include_metadata:
(
metadata_suffix_semantic,
metadata_suffix_keyword,
) = _get_metadata_suffix_for_document_index(
document.metadata, include_separator=True
)
metadata_tokens = len(tokenizer.tokenize(metadata_suffix_semantic))
if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
            # Note: we can keep the keyword suffix even if the semantic suffix is too long to fit
            # in the model context; there is no length limit for the keyword component.
metadata_suffix_semantic = ""
metadata_tokens = 0
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens
    # If there is not enough context remaining, just index the chunk with no prefix/suffix
if content_token_limit <= CHUNK_MIN_CONTENT:
content_token_limit = chunk_tok_size
title_prefix = ""
metadata_suffix_semantic = ""
chunks: list[DocAwareChunk] = []
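    # Maps the character offset (within the cleaned chunk text) at which each section starts to
    # that section's link.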
link_offsets: dict[int, str] = {}
chunk_text = ""
for section in document.sections:
section_text = section.text
section_link_text = section.link or ""
section_tok_length = len(tokenizer.tokenize(section_text))
current_tok_length = len(tokenizer.tokenize(chunk_text))
curr_offset_len = len(shared_precompare_cleanup(chunk_text))
        # Large sections are considered self-contained/unique, so they start a new chunk and are
        # not combined with other sections at the end.
if section_tok_length > content_token_limit:
if chunk_text:
chunks.append(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_splitter),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=mini_chunk_splitter.split_text(chunk_text)
if enable_mini_chunk and chunk_text.strip()
else None,
)
)
link_offsets = {}
chunk_text = ""
large_section_chunks = chunk_large_section(
section_text=section_text,
section_link_text=section_link_text,
document=document,
start_chunk_id=len(chunks),
chunk_splitter=chunk_splitter,
                # chunk_text was flushed above, so gate mini-chunking on the section's own text
                mini_chunk_splitter=mini_chunk_splitter
                if enable_mini_chunk and section_text.strip()
                else None,
blurb=extract_blurb(section_text, blurb_splitter),
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
)
chunks.extend(large_section_chunks)
continue
        # If the whole section fits within a chunk, either add it to the current chunk or start a new one
if (
current_tok_length
+ len(tokenizer.tokenize(SECTION_SEPARATOR))
+ section_tok_length
<= content_token_limit
):
chunk_text += (
SECTION_SEPARATOR + section_text if chunk_text else section_text
)
link_offsets[curr_offset_len] = section_link_text
else:
chunks.append(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_splitter),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=mini_chunk_splitter.split_text(chunk_text)
if enable_mini_chunk and chunk_text.strip()
else None,
)
)
link_offsets = {0: section_link_text}
chunk_text = section_text
    # Once we hit the end, if we're still in the middle of building a chunk, add what we have. If
    # what's left is only whitespace, don't include it. If the document produced no chunks at all,
    # just create a single chunk with the title.
if chunk_text.strip() or not chunks:
chunks.append(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_splitter),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=mini_chunk_splitter.split_text(chunk_text)
if enable_mini_chunk and chunk_text.strip()
else None,
)
)
    # If the chunk does not have any usable content, it will not be indexed
return chunks


class Chunker(abc.ABC):
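    """Interface for splitting a Document into DocAwareChunks ready for embedding."""
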
@abc.abstractmethod
def chunk(
self,
document: Document,
embedder: IndexingEmbedder,
) -> list[DocAwareChunk]:
raise NotImplementedError


class DefaultChunker(Chunker):
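    """Chunker that simply delegates to chunk_document with the default settings."""
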
def chunk(
self,
document: Document,
embedder: IndexingEmbedder,
) -> list[DocAwareChunk]:
# Specifically for reproducing an issue with gmail
if document.source == DocumentSource.GMAIL:
logger.debug(f"Chunking {document.semantic_identifier}")
return chunk_document(document, embedder=embedder)
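

# Illustrative usage sketch (assumes an already-initialized IndexingEmbedder `embedder` and a
# connector-produced Document `doc`, neither of which is constructed here):
#
#     chunker = DefaultChunker()
#     chunks = chunker.chunk(doc, embedder=embedder)
#     for chunk in chunks:
#         logger.debug(f"{chunk.chunk_id}: {chunk.blurb}")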