Mirror of https://github.com/danswer-ai/danswer.git

Include Titles in Chunks (#1817)
parent 8c312482c1
commit e90c66c1b6
@@ -10,8 +10,8 @@ import sqlalchemy as sa

revision = "7aea705850d5"
down_revision = "4505fd7302e1"
branch_labels = None
depends_on = None
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
@@ -243,13 +243,15 @@ DISABLE_INDEX_UPDATE_ON_SWAP = (
# fairly large amount of memory in order to increase substantially, since
# each worker loads the embedding models into memory.
NUM_INDEXING_WORKERS = int(os.environ.get("NUM_INDEXING_WORKERS") or 1)
CHUNK_OVERLAP = 0
# More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
# Finer grained chunking for more detail retention
# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
MINI_CHUNK_SIZE = 150
# Include the document level metadata in each chunk. If the metadata is too long, then it is thrown out
# We don't want the metadata to overwhelm the actual contents of the chunk
SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
# Timeout to wait for job's last update before killing it, in hours
CLEANUP_INDEXING_JOBS_TIMEOUT = int(os.environ.get("CLEANUP_INDEXING_JOBS_TIMEOUT", 3))
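Illustrative sketch, not part of the commit: the flags above are plain string comparisons, so only the exact value "true" (any casing) turns a feature on; unset or any other value keeps the default behavior (mini-chunks off, metadata kept in chunks).

import os

# Hypothetical deployment values; the variable names come from the hunk above.
os.environ["ENABLE_MINI_CHUNK"] = "True"
os.environ["SKIP_METADATA_IN_CHUNK"] = "false"

assert os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"       # mini-chunk vectors enabled
assert os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() != "true"  # metadata still included in chunks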
@@ -19,6 +19,7 @@ DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
METADATA_SUFFIX = "metadata_suffix"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@@ -43,7 +44,8 @@ QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"

# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
MAX_CHUNK_TITLE_LEN = 1000
RETURN_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="
@@ -6,6 +6,7 @@ from typing import TypeVar

from dateutil.parser import parse

from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.text_processing import is_valid_email
@@ -57,3 +58,7 @@ def process_in_batches(
) -> Iterator[list[U]]:
    for i in range(0, len(objects), batch_size):
        yield [process_function(obj) for obj in objects[i : i + batch_size]]


def get_metadata_keys_to_ignore() -> list[str]:
    return [IGNORE_FOR_QA]
@@ -6,6 +6,7 @@ from pydantic import BaseModel

from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.utils.text_processing import make_url_compatible


@@ -117,7 +118,12 @@ class DocumentBase(BaseModel):
        # If title is explicitly empty, return a None here for embedding purposes
        if self.title == "":
            return None
        return self.semantic_identifier if self.title is None else self.title
        replace_chars = set(RETURN_SEPARATOR)
        title = self.semantic_identifier if self.title is None else self.title
        for char in replace_chars:
            title = title.replace(char, " ")
        title = title.strip()
        return title

    def get_metadata_str_attributes(self) -> list[str] | None:
        if not self.metadata:
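Illustrative sketch, not part of the commit: how the new get_title_for_document_index cleans a title. With RETURN_SEPARATOR = "\n\r\n", replace_chars is the set {"\n", "\r"}, so any newline or carriage return in the title is flattened to a space before the title is embedded and prefixed onto chunks.

RETURN_SEPARATOR = "\n\r\n"

def clean_title(title: str) -> str:
    # Replace each separator character with a space, then trim the result.
    for char in set(RETURN_SEPARATOR):
        title = title.replace(char, " ")
    return title.strip()

assert clean_title("Q3\nEngineering\rReport") == "Q3 Engineering Report"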
@@ -6,7 +6,7 @@ from typing import Any

from danswer.access.models import DocumentAccess
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned


@dataclass(frozen=True)
@@ -186,7 +186,7 @@ class IdRetrievalCapable(abc.ABC):
        min_chunk_ind: int | None,
        max_chunk_ind: int | None,
        user_access_control_list: list[str] | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Fetch chunk(s) based on document id
@@ -222,7 +222,7 @@ class KeywordCapable(abc.ABC):
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
        information required for query time purposes. For example, some details of the document
@@ -262,7 +262,7 @@ class VectorCapable(abc.ABC):
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run vector/semantic search and return a list of inference chunks.
@@ -298,7 +298,7 @@ class HybridCapable(abc.ABC):
        num_to_retrieve: int,
        offset: int = 0,
        hybrid_alpha: float | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.
@@ -348,7 +348,7 @@ class AdminCapable(abc.ABC):
        filters: IndexFilters,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run the special search for the admin document explorer page
@@ -91,6 +91,9 @@ schema DANSWER_CHUNK_NAME {
    field metadata type string {
        indexing: summary | attribute
    }
    field metadata_suffix type string {
        indexing: summary | attribute
    }
    field doc_updated_at type int {
        indexing: summary | attribute
    }
@@ -41,6 +41,7 @@ from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import METADATA_SUFFIX
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@@ -51,7 +52,6 @@ from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    get_experts_stores_representations,
@@ -64,7 +64,7 @@ from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.retrieval.search_runner import query_processing
from danswer.search.retrieval.search_runner import remove_stop_words_and_punctuation
from danswer.utils.batching import batch_generator
@@ -347,8 +347,10 @@ def _index_vespa_chunk(
        TITLE: remove_invalid_unicode_chars(title) if title else None,
        SKIP_TITLE_EMBEDDING: not title,
        CONTENT: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting :(
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting
        # Note that it's not exactly the same as the actual content
        # which contains the title prefix and metadata suffix
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
        SOURCE_TYPE: str(document.source.value),
        SOURCE_LINKS: json.dumps(chunk.source_links),
        SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@@ -356,6 +358,7 @@ def _index_vespa_chunk(
        METADATA: json.dumps(document.metadata),
        # Save as a list for efficient extraction as an Attribute
        METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
        METADATA_SUFFIX: chunk.metadata_suffix,
        EMBEDDINGS: embeddings_name_vector_map,
        TITLE_EMBEDDING: chunk.title_embedding,
        BOOST: chunk.boost,
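Illustrative sketch with made-up values, not part of the commit: after this change the two Vespa fields diverge. CONTENT is the full indexed text (title prefix + chunk body + metadata suffix), while CONTENT_SUMMARY holds only the raw chunk body, which is what keyword match highlighting runs against.

# Hypothetical example for one chunk of a document titled "Widget Install Guide"
vespa_fields = {
    "content": "Widget Install Guide\n\r\nStep 1: unpack the widget...\n\r\nMetadata:\n\ttags - hardware, setup",
    "content_summary": "Step 1: unpack the widget...",  # no title prefix / metadata suffix
}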
@@ -562,7 +565,7 @@ def _process_dynamic_summary(


def _vespa_hit_to_inference_chunk(
    hit: dict[str, Any], null_score: bool = False
) -> InferenceChunk:
) -> InferenceChunkUncleaned:
    fields = cast(dict[str, Any], hit["fields"])

    # parse fields that are stored as strings, but are really json / datetime
@@ -585,19 +588,6 @@ def _vespa_hit_to_inference_chunk(
            f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
        )

    # Remove the title from the first chunk as every chunk already included
    # its semantic identifier for LLM
    content = fields[CONTENT]
    if fields[CHUNK_ID] == 0:
        parts = content.split(TITLE_SEPARATOR, maxsplit=1)
        content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content

    # User ran into this, not sure why this could happen, error checking here
    blurb = fields.get(BLURB)
    if not blurb:
        logger.error(f"Chunk with id {fields.get(semantic_identifier)} ")
        blurb = ""

    source_links = fields.get(SOURCE_LINKS, {})
    source_links_dict_unprocessed = (
        json.loads(source_links) if isinstance(source_links, str) else source_links
@@ -607,14 +597,15 @@ def _vespa_hit_to_inference_chunk(
        for k, v in cast(dict[str, str], source_links_dict_unprocessed).items()
    }

    return InferenceChunk(
    return InferenceChunkUncleaned(
        chunk_id=fields[CHUNK_ID],
        blurb=blurb,
        content=content,
        blurb=fields.get(BLURB, ""),  # Unused
        content=fields[CONTENT],  # Includes extra title prefix and metadata suffix
        source_links=source_links_dict,
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
        title=fields[TITLE],
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
        recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
@@ -623,13 +614,16 @@ def _vespa_hit_to_inference_chunk(
        primary_owners=fields.get(PRIMARY_OWNERS),
        secondary_owners=fields.get(SECONDARY_OWNERS),
        metadata=metadata,
        metadata_suffix=fields.get(METADATA_SUFFIX) or "",
        match_highlights=match_highlights,
        updated_at=updated_at,
    )


@retry(tries=3, delay=1, backoff=2)
def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[InferenceChunk]:
def _query_vespa(
    query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
    if "query" in query_params and not cast(str, query_params["query"]).strip():
        raise ValueError("No/empty query received")

@@ -684,16 +678,6 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
    return inference_chunks


@retry(tries=3, delay=1, backoff=2)
def _inference_chunk_by_vespa_id(vespa_id: str, index_name: str) -> InferenceChunk:
    res = requests.get(
        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_id}"
    )
    res.raise_for_status()

    return _vespa_hit_to_inference_chunk(res.json())


def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO:
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
@@ -738,6 +722,7 @@ class VespaIndex(DocumentIndex):
        f"{SOURCE_TYPE}, "
        f"{SOURCE_LINKS}, "
        f"{SEMANTIC_IDENTIFIER}, "
        f"{TITLE}, "
        f"{SECTION_CONTINUATION}, "
        f"{BOOST}, "
        f"{HIDDEN}, "
@@ -745,6 +730,7 @@ class VespaIndex(DocumentIndex):
        f"{PRIMARY_OWNERS}, "
        f"{SECONDARY_OWNERS}, "
        f"{METADATA}, "
        f"{METADATA_SUFFIX}, "
        f"{CONTENT_SUMMARY} "
        f"from {{index_name}} where "
    )
@@ -980,7 +966,7 @@ class VespaIndex(DocumentIndex):
        min_chunk_ind: int | None,
        max_chunk_ind: int | None,
        user_access_control_list: list[str] | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        document_id = replace_invalid_doc_id_characters(document_id)

        vespa_chunks = _get_vespa_chunks_by_document_id(
@@ -1009,7 +995,7 @@ class VespaIndex(DocumentIndex):
        num_to_retrieve: int = NUM_RETURNED_HITS,
        offset: int = 0,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
@@ -1046,7 +1032,7 @@ class VespaIndex(DocumentIndex):
        offset: int = 0,
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
@@ -1090,7 +1076,7 @@ class VespaIndex(DocumentIndex):
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = _build_vespa_filters(filters)
        # Needs to be at least as much as the value set in Vespa schema config
        target_hits = max(10 * num_to_retrieve, 1000)
@@ -1134,7 +1120,7 @@ class VespaIndex(DocumentIndex):
        filters: IndexFilters,
        num_to_retrieve: int = NUM_RETURNED_HITS,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True)
        yql = (
            VespaIndex.yql_base.format(index_name=self.index_name)
@@ -3,12 +3,16 @@ from collections.abc import Callable
from typing import TYPE_CHECKING

from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    get_metadata_keys_to_ignore,
)
from danswer.connectors.models import Document
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
@@ -19,6 +23,14 @@ if TYPE_CHECKING:
    from transformers import AutoTokenizer  # type:ignore


# Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
# actually help quality at all
CHUNK_OVERLAP = 0
# Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
# overwhelm the actual contents of the chunk
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256

logger = setup_logger()

ChunkFunc = Callable[[Document], list[DocAwareChunk]]
@@ -44,6 +56,8 @@ def chunk_large_section(
    chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
    title_prefix: str = "",
    metadata_suffix: str = "",
) -> list[DocAwareChunk]:
    from llama_index.text_splitter import SentenceSplitter

@@ -60,30 +74,69 @@ def chunk_large_section(
            source_document=document,
            chunk_id=start_chunk_id + chunk_ind,
            blurb=blurb,
            content=chunk_str,
            content=f"{title_prefix}{chunk_str}{metadata_suffix}",
            content_summary=chunk_str,
            source_links={0: section_link_text},
            section_continuation=(chunk_ind != 0),
            metadata_suffix=metadata_suffix,
        )
        for chunk_ind, chunk_str in enumerate(split_texts)
    ]
    return chunks


def _get_metadata_suffix_for_document_index(
    metadata: dict[str, str | list[str]]
) -> str:
    if not metadata:
        return ""
    metadata_str = "Metadata:\n"
    for key, value in metadata.items():
        if key in get_metadata_keys_to_ignore():
            continue

        value_str = ", ".join(value) if isinstance(value, list) else value
        metadata_str += f"\t{key} - {value_str}\n"
    return metadata_str.strip()


def chunk_document(
    document: Document,
    chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    subsection_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
    include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
) -> list[DocAwareChunk]:
    title = document.get_title_for_document_index()
    title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
    tokenizer = get_default_tokenizer()

    title = document.get_title_for_document_index()
    title_prefix = f"{title}{RETURN_SEPARATOR}"[:MAX_CHUNK_TITLE_LEN] if title else ""
    title_tokens = len(tokenizer.tokenize(title_prefix))

    metadata_suffix = ""
    metadata_tokens = 0
    if include_metadata:
        metadata = _get_metadata_suffix_for_document_index(document.metadata)
        metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
        metadata_tokens = len(tokenizer.tokenize(metadata_suffix))

    if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
        metadata_suffix = ""
        metadata_tokens = 0

    content_token_limit = chunk_tok_size - title_tokens - metadata_tokens

    # If there is not enough context remaining then just index the chunk with no prefix/suffix
    if content_token_limit <= CHUNK_MIN_CONTENT:
        content_token_limit = chunk_tok_size
        title_prefix = ""
        metadata_suffix = ""

    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
    chunk_text = ""
    for ind, section in enumerate(document.sections):
        section_text = title_prefix + section.text if ind == 0 else section.text
    for section in document.sections:
        section_text = section.text
        section_link_text = section.link or ""

        section_tok_length = len(tokenizer.tokenize(section_text))
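Illustrative sketch, not part of the commit, of the two pieces computed above. For metadata {"tags": ["tag1", "tag2"], "owner": "alice"} the helper builds "Metadata:\n\ttags - tag1, tag2\n\towner - alice" (ignored keys skipped, lists comma-joined). The token budget for the chunk body is whatever remains after the title prefix and metadata suffix are accounted for; the numbers below are hypothetical.

chunk_tok_size = 512    # assumed embedding context size
title_tokens = 8        # tokens in the title prefix (assumed)
metadata_tokens = 14    # tokens in the metadata suffix (assumed)

# MAX_METADATA_PERCENTAGE = 0.25, so 14 < 512 * 0.25 keeps the suffix.
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 490 tokens left for the body
# Only if content_token_limit dropped to CHUNK_MIN_CONTENT (256) or below would the
# prefix/suffix be discarded and the full 512 tokens given back to the body.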
@@ -92,16 +145,18 @@ def chunk_document(

        # Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
        # at the end by other sections
        if section_tok_length > chunk_tok_size:
        if section_tok_length > content_token_limit:
            if chunk_text:
                chunks.append(
                    DocAwareChunk(
                        source_document=document,
                        chunk_id=len(chunks),
                        blurb=extract_blurb(chunk_text, blurb_size),
                        content=chunk_text,
                        content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                        content_summary=chunk_text,
                        source_links=link_offsets,
                        section_continuation=False,
                        metadata_suffix=metadata_suffix,
                    )
                )
                link_offsets = {}
@@ -113,9 +168,11 @@ def chunk_document(
                document=document,
                start_chunk_id=len(chunks),
                tokenizer=tokenizer,
                chunk_size=chunk_tok_size,
                chunk_size=content_token_limit,
                chunk_overlap=subsection_overlap,
                blurb_size=blurb_size,
                title_prefix=title_prefix,
                metadata_suffix=metadata_suffix,
            )
            chunks.extend(large_section_chunks)
            continue
@@ -125,7 +182,7 @@ def chunk_document(
            current_tok_length
            + len(tokenizer.tokenize(SECTION_SEPARATOR))
            + section_tok_length
            <= chunk_tok_size
            <= content_token_limit
        ):
            chunk_text += (
                SECTION_SEPARATOR + section_text if chunk_text else section_text
@@ -137,9 +194,11 @@ def chunk_document(
                    source_document=document,
                    chunk_id=len(chunks),
                    blurb=extract_blurb(chunk_text, blurb_size),
                    content=chunk_text,
                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                    content_summary=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
                    metadata_suffix=metadata_suffix,
                )
            )
            link_offsets = {0: section_link_text}
@@ -153,9 +212,11 @@ def chunk_document(
                source_document=document,
                chunk_id=len(chunks),
                blurb=extract_blurb(chunk_text, blurb_size),
                content=chunk_text,
                content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                content_summary=chunk_text,
                source_links=link_offsets,
                section_continuation=False,
                metadata_suffix=metadata_suffix,
            )
        )
    return chunks
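Illustrative sketch, not part of the commit: what one emitted chunk looks like after this change, assuming a document titled "Widget Install Guide" with a tags metadata entry. The content field is what gets embedded and keyword-indexed; content_summary and metadata_suffix are stored alongside it so the extra text can be stripped back out at query time.

RETURN_SEPARATOR = "\n\r\n"

title_prefix = "Widget Install Guide" + RETURN_SEPARATOR
metadata_suffix = RETURN_SEPARATOR + "Metadata:\n\ttags - hardware, setup"
chunk_text = "Step 1: unpack the widget and check the contents."

chunk_fields = {
    "content": f"{title_prefix}{chunk_text}{metadata_suffix}",  # embedded / searched
    "content_summary": chunk_text,                              # highlighting + mini-chunks
    "metadata_suffix": metadata_suffix,                         # kept (with its leading separator) for later stripping
}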
@@ -164,6 +225,9 @@ def chunk_document(
def split_chunk_text_into_mini_chunks(
    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
) -> list[str]:
    """The minichunks won't all have the title prefix or metadata suffix
    It could be a significant percentage of every minichunk so better to not include it
    """
    from llama_index.text_splitter import SentenceSplitter

    token_count_func = get_default_tokenizer().tokenize
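Brief illustrative note, not part of the commit: mini-chunks are cut from the bare chunk text (content_summary), so a 150-token mini-chunk is never dominated by a long title prefix or metadata suffix; only the full-size chunk carries those. The embedder hunk below switches its call accordingly, roughly:

mini_chunk_texts = split_chunk_text_into_mini_chunks(chunk.content_summary)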
@@ -81,7 +81,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        for chunk_ind, chunk in enumerate(chunks):
            chunk_texts.append(chunk.content)
            mini_chunk_texts = (
                split_chunk_text_into_mini_chunks(chunk.content)
                split_chunk_text_into_mini_chunks(chunk.content_summary)
                if enable_mini_chunk
                else []
            )
@@ -36,6 +36,16 @@ class DocAwareChunk(BaseChunk):
    # During inference we only have access to the document id and do not reconstruct the Document
    source_document: Document

    # The Vespa documents require a separate highlight field. Since it is stored as a duplicate anyway,
    # it's easier to just store a not prefixed/suffixed string for the highlighting
    # Also during the chunking, this non-prefixed/suffixed string is used for mini-chunks
    content_summary: str

    # During indexing we also (optionally) build a metadata string from the metadata dict
    # This is also indexed so that we can strip it out after indexing, this way it supports
    # multiple iterations of metadata representation for backwards compatibility
    metadata_suffix: str

    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
        return (
@@ -189,6 +189,20 @@ class InferenceChunk(BaseChunk):
        return self.score > other.score


class InferenceChunkUncleaned(InferenceChunk):
    title: str  # Separate from Semantic Identifier though often same
    metadata_suffix: str

    def to_inference_chunk(self) -> InferenceChunk:
        # Create a dict of all fields except 'title' and 'metadata_suffix'
        inference_chunk_data = {
            k: v
            for k, v in self.dict().items()
            if k not in ["title", "metadata_suffix"]
        }
        return InferenceChunk(**inference_chunk_data)


class InferenceSection(BaseModel):
    """Section list of chunks with a combined content. A section could be a single chunk, several
    chunks from the same document or the entire document."""
@@ -4,6 +4,8 @@ from typing import cast

import numpy

from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
from danswer.document_index.document_index_utils import (
@@ -12,6 +14,7 @@ from danswer.document_index.document_index_utils import (
from danswer.llm.interfaces import LLM
from danswer.search.models import ChunkMetric
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.models import InferenceSection
from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RerankMetricsContainer
@@ -47,6 +50,33 @@ def should_apply_llm_based_relevance_filter(query: SearchQuery) -> bool:
    return not query.skip_llm_chunk_filter


def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk]:
    def _remove_title(chunk: InferenceChunkUncleaned) -> str:
        if not chunk.title or not chunk.content:
            return chunk.content

        if chunk.content.startswith(chunk.title):
            return chunk.content[len(chunk.title) :].lstrip()

        if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
            return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()

        return chunk.content

    def _remove_metadata_suffix(chunk: InferenceChunkUncleaned) -> str:
        if not chunk.metadata_suffix:
            return chunk.content
        return chunk.content.removesuffix(chunk.metadata_suffix).rstrip(
            RETURN_SEPARATOR
        )

    for chunk in chunks:
        chunk.content = _remove_title(chunk)
        chunk.content = _remove_metadata_suffix(chunk)

    return [chunk.to_inference_chunk() for chunk in chunks]


@log_function_time(print_only=True)
def semantic_reranking(
    query: str,
@@ -20,6 +20,7 @@ from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RetrievalMetricsContainer
from danswer.search.models import SearchQuery
from danswer.search.models import SearchType
from danswer.search.postprocessing.postprocessing import cleanup_chunks
from danswer.search.search_nlp_models import EmbeddingModel
from danswer.search.utils import inference_section_from_chunks
from danswer.secondary_llm_flows.query_expansion import multilingual_query_expansion
@@ -160,7 +161,7 @@ def doc_index_retrieval(
    else:
        raise RuntimeError("Invalid Search Flow")

    return top_chunks
    return cleanup_chunks(top_chunks)


def _simplify_text(text: str) -> str:
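Illustrative sketch, not part of the commit: the end-to-end effect of the query-side change. Retrieval now returns InferenceChunkUncleaned objects whose content still carries the title prefix and metadata suffix added at indexing time; cleanup_chunks strips both and downgrades each object to a plain InferenceChunk before anything downstream (LLM prompting, display) sees it. The values below are made up.

RETURN_SEPARATOR = "\n\r\n"

title = "Widget Install Guide"
metadata_suffix = RETURN_SEPARATOR + "Metadata:\n\ttags - hardware, setup"
raw_content = (
    title + RETURN_SEPARATOR
    + "Step 1: unpack the widget and check the contents."
    + metadata_suffix
)

# What cleanup_chunks effectively does to the content field:
cleaned = raw_content
if cleaned.startswith(title):
    cleaned = cleaned[len(title):].lstrip()                                # drop the title prefix
cleaned = cleaned.removesuffix(metadata_suffix).rstrip(RETURN_SEPARATOR)   # drop the metadata suffix

assert cleaned == "Step 1: unpack the widget and check the contents."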
backend/tests/unit/danswer/indexing/test_chunker.py (new file, 38 lines)
@@ -0,0 +1,38 @@
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.chunker import chunk_document


def test_chunk_document() -> None:
    short_section_1 = "This is a short section."
    long_section = (
        "This is a long section that should be split into multiple chunks. " * 100
    )
    short_section_2 = "This is another short section."
    short_section_3 = "This is another short section again."
    short_section_4 = "Final short section."
    semantic_identifier = "Test Document"

    document = Document(
        id="test_doc",
        source=DocumentSource.WEB,
        semantic_identifier=semantic_identifier,
        metadata={"tags": ["tag1", "tag2"]},
        doc_updated_at=None,
        sections=[
            Section(text=short_section_1, link="link1"),
            Section(text=short_section_2, link="link2"),
            Section(text=long_section, link="link3"),
            Section(text=short_section_3, link="link4"),
            Section(text=short_section_4, link="link5"),
        ],
    )

    chunks = chunk_document(document)
    assert len(chunks) == 5
    assert all(semantic_identifier in chunk.content for chunk in chunks)
    assert short_section_1 in chunks[0].content
    assert short_section_3 in chunks[-1].content
    assert short_section_4 in chunks[-1].content
    assert "tag1" in chunks[0].content