Include Titles in Chunks (#1817)

Yuhong Sun 2024-07-12 09:42:24 -07:00 committed by GitHub
parent 8c312482c1
commit e90c66c1b6
15 changed files with 224 additions and 63 deletions

View File

@ -10,8 +10,8 @@ import sqlalchemy as sa
revision = "7aea705850d5"
down_revision = "4505fd7302e1"
branch_labels = None
depends_on = None
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:

View File

@ -243,13 +243,15 @@ DISABLE_INDEX_UPDATE_ON_SWAP = (
# fairly large amount of memory in order to increase substantially, since
# each worker loads the embedding models into memory.
NUM_INDEXING_WORKERS = int(os.environ.get("NUM_INDEXING_WORKERS") or 1)
CHUNK_OVERLAP = 0
# More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
# Finer grained chunking for more detail retention
# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
MINI_CHUNK_SIZE = 150
# Include the document-level metadata in each chunk. If the metadata is too long, it is dropped entirely
# so that it does not overwhelm the actual contents of the chunk
SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
# Timeout to wait for job's last update before killing it, in hours
CLEANUP_INDEXING_JOBS_TIMEOUT = int(os.environ.get("CLEANUP_INDEXING_JOBS_TIMEOUT", 3))
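The new SKIP_METADATA_IN_CHUNK flag follows the same env-var parsing convention as ENABLE_MINI_CHUNK above; a minimal sketch of toggling it (the value set here is purely illustrative, not a recommended default):

import os

os.environ["SKIP_METADATA_IN_CHUNK"] = "true"  # illustrative: drop the metadata suffix from chunk contents
skip_metadata = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
print(skip_metadata)  # True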

View File

@ -19,6 +19,7 @@ DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
METADATA_SUFFIX = "metadata_suffix"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@ -43,7 +44,8 @@ QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"
# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
MAX_CHUNK_TITLE_LEN = 1000
RETURN_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="

View File

@ -6,6 +6,7 @@ from typing import TypeVar
from dateutil.parser import parse
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.text_processing import is_valid_email
@ -57,3 +58,7 @@ def process_in_batches(
) -> Iterator[list[U]]:
for i in range(0, len(objects), batch_size):
yield [process_function(obj) for obj in objects[i : i + batch_size]]
def get_metadata_keys_to_ignore() -> list[str]:
return [IGNORE_FOR_QA]

View File

@ -6,6 +6,7 @@ from pydantic import BaseModel
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.utils.text_processing import make_url_compatible
@ -117,7 +118,12 @@ class DocumentBase(BaseModel):
# If title is explicitly empty, return a None here for embedding purposes
if self.title == "":
return None
return self.semantic_identifier if self.title is None else self.title
replace_chars = set(RETURN_SEPARATOR)
title = self.semantic_identifier if self.title is None else self.title
for char in replace_chars:
title = title.replace(char, " ")
title = title.strip()
return title
def get_metadata_str_attributes(self) -> list[str] | None:
if not self.metadata:
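A minimal standalone sketch of the title cleanup introduced above, assuming RETURN_SEPARATOR = "\n\r\n" as defined in the constants file; clean_title is a hypothetical helper name used only for illustration:

RETURN_SEPARATOR = "\n\r\n"

def clean_title(title: str | None, semantic_identifier: str) -> str | None:
    if title == "":
        return None  # explicitly empty title: skip it for embedding purposes
    cleaned = semantic_identifier if title is None else title
    for char in set(RETURN_SEPARATOR):  # avoid colliding with the separator used at indexing time
        cleaned = cleaned.replace(char, " ")
    return cleaned.strip()

print(clean_title("My\nDoc\rTitle", "fallback id"))  # "My Doc Title"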

View File

@ -6,7 +6,7 @@ from typing import Any
from danswer.access.models import DocumentAccess
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
@dataclass(frozen=True)
@ -186,7 +186,7 @@ class IdRetrievalCapable(abc.ABC):
min_chunk_ind: int | None,
max_chunk_ind: int | None,
user_access_control_list: list[str] | None = None,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
"""
Fetch chunk(s) based on document id
@ -222,7 +222,7 @@ class KeywordCapable(abc.ABC):
time_decay_multiplier: float,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
"""
Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
information required for query time purposes. For example, some details of the document
@ -262,7 +262,7 @@ class VectorCapable(abc.ABC):
time_decay_multiplier: float,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
"""
Run vector/semantic search and return a list of inference chunks.
@ -298,7 +298,7 @@ class HybridCapable(abc.ABC):
num_to_retrieve: int,
offset: int = 0,
hybrid_alpha: float | None = None,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
"""
Run hybrid search and return a list of inference chunks.
@ -348,7 +348,7 @@ class AdminCapable(abc.ABC):
filters: IndexFilters,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
"""
Run the special search for the admin document explorer page

View File

@ -91,6 +91,9 @@ schema DANSWER_CHUNK_NAME {
field metadata type string {
indexing: summary | attribute
}
field metadata_suffix type string {
indexing: summary | attribute
}
field doc_updated_at type int {
indexing: summary | attribute
}

View File

@ -41,6 +41,7 @@ from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import METADATA_SUFFIX
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@ -51,7 +52,6 @@ from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
@ -64,7 +64,7 @@ from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.retrieval.search_runner import query_processing
from danswer.search.retrieval.search_runner import remove_stop_words_and_punctuation
from danswer.utils.batching import batch_generator
@ -347,8 +347,10 @@ def _index_vespa_chunk(
TITLE: remove_invalid_unicode_chars(title) if title else None,
SKIP_TITLE_EMBEDDING: not title,
CONTENT: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting :(
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting
# Note that it's not exactly the same as the actual content
# which contains the title prefix and metadata suffix
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@ -356,6 +358,7 @@ def _index_vespa_chunk(
METADATA: json.dumps(document.metadata),
# Save as a list for efficient extraction as an Attribute
METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
METADATA_SUFFIX: chunk.metadata_suffix,
EMBEDDINGS: embeddings_name_vector_map,
TITLE_EMBEDDING: chunk.title_embedding,
BOOST: chunk.boost,
@ -562,7 +565,7 @@ def _process_dynamic_summary(
def _vespa_hit_to_inference_chunk(
hit: dict[str, Any], null_score: bool = False
) -> InferenceChunk:
) -> InferenceChunkUncleaned:
fields = cast(dict[str, Any], hit["fields"])
# parse fields that are stored as strings, but are really json / datetime
@ -585,19 +588,6 @@ def _vespa_hit_to_inference_chunk(
f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
# Remove the title from the first chunk as every chunk already included
# its semantic identifier for LLM
content = fields[CONTENT]
if fields[CHUNK_ID] == 0:
parts = content.split(TITLE_SEPARATOR, maxsplit=1)
content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content
# User ran into this, not sure why this could happen, error checking here
blurb = fields.get(BLURB)
if not blurb:
logger.error(f"Chunk with id {fields.get(semantic_identifier)} ")
blurb = ""
source_links = fields.get(SOURCE_LINKS, {})
source_links_dict_unprocessed = (
json.loads(source_links) if isinstance(source_links, str) else source_links
@ -607,14 +597,15 @@ def _vespa_hit_to_inference_chunk(
for k, v in cast(dict[str, str], source_links_dict_unprocessed).items()
}
return InferenceChunk(
return InferenceChunkUncleaned(
chunk_id=fields[CHUNK_ID],
blurb=blurb,
content=content,
blurb=fields.get(BLURB, ""), # Unused
content=fields[CONTENT], # Includes extra title prefix and metadata suffix
source_links=source_links_dict,
section_continuation=fields[SECTION_CONTINUATION],
document_id=fields[DOCUMENT_ID],
source_type=fields[SOURCE_TYPE],
title=fields[TITLE],
semantic_identifier=fields[SEMANTIC_IDENTIFIER],
boost=fields.get(BOOST, 1),
recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
@ -623,13 +614,16 @@ def _vespa_hit_to_inference_chunk(
primary_owners=fields.get(PRIMARY_OWNERS),
secondary_owners=fields.get(SECONDARY_OWNERS),
metadata=metadata,
metadata_suffix=fields.get(METADATA_SUFFIX) or "",
match_highlights=match_highlights,
updated_at=updated_at,
)
@retry(tries=3, delay=1, backoff=2)
def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[InferenceChunk]:
def _query_vespa(
query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
@ -684,16 +678,6 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
return inference_chunks
@retry(tries=3, delay=1, backoff=2)
def _inference_chunk_by_vespa_id(vespa_id: str, index_name: str) -> InferenceChunk:
res = requests.get(
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_id}"
)
res.raise_for_status()
return _vespa_hit_to_inference_chunk(res.json())
def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO:
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
@ -738,6 +722,7 @@ class VespaIndex(DocumentIndex):
f"{SOURCE_TYPE}, "
f"{SOURCE_LINKS}, "
f"{SEMANTIC_IDENTIFIER}, "
f"{TITLE}, "
f"{SECTION_CONTINUATION}, "
f"{BOOST}, "
f"{HIDDEN}, "
@ -745,6 +730,7 @@ class VespaIndex(DocumentIndex):
f"{PRIMARY_OWNERS}, "
f"{SECONDARY_OWNERS}, "
f"{METADATA}, "
f"{METADATA_SUFFIX}, "
f"{CONTENT_SUMMARY} "
f"from {{index_name}} where "
)
@ -980,7 +966,7 @@ class VespaIndex(DocumentIndex):
min_chunk_ind: int | None,
max_chunk_ind: int | None,
user_access_control_list: list[str] | None = None,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
document_id = replace_invalid_doc_id_characters(document_id)
vespa_chunks = _get_vespa_chunks_by_document_id(
@ -1009,7 +995,7 @@ class VespaIndex(DocumentIndex):
num_to_retrieve: int = NUM_RETURNED_HITS,
offset: int = 0,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
@ -1046,7 +1032,7 @@ class VespaIndex(DocumentIndex):
offset: int = 0,
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
@ -1090,7 +1076,7 @@ class VespaIndex(DocumentIndex):
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
vespa_where_clauses = _build_vespa_filters(filters)
# Needs to be at least as much as the value set in Vespa schema config
target_hits = max(10 * num_to_retrieve, 1000)
@ -1134,7 +1120,7 @@ class VespaIndex(DocumentIndex):
filters: IndexFilters,
num_to_retrieve: int = NUM_RETURNED_HITS,
offset: int = 0,
) -> list[InferenceChunk]:
) -> list[InferenceChunkUncleaned]:
vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True)
yql = (
VespaIndex.yql_base.format(index_name=self.index_name)

View File

@ -3,12 +3,16 @@ from collections.abc import Callable
from typing import TYPE_CHECKING
from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from danswer.connectors.models import Document
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
@ -19,6 +23,14 @@ if TYPE_CHECKING:
from transformers import AutoTokenizer # type:ignore
# Overlaps are not supported: we need a clean combination of chunks, and it is unclear whether overlaps
# actually help quality at all
CHUNK_OVERLAP = 0
# Fairly arbitrary numbers, but the general idea is that the title/metadata should not
# overwhelm the actual contents of the chunk
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256
logger = setup_logger()
ChunkFunc = Callable[[Document], list[DocAwareChunk]]
@ -44,6 +56,8 @@ def chunk_large_section(
chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
chunk_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
title_prefix: str = "",
metadata_suffix: str = "",
) -> list[DocAwareChunk]:
from llama_index.text_splitter import SentenceSplitter
@ -60,30 +74,69 @@ def chunk_large_section(
source_document=document,
chunk_id=start_chunk_id + chunk_ind,
blurb=blurb,
content=chunk_str,
content=f"{title_prefix}{chunk_str}{metadata_suffix}",
content_summary=chunk_str,
source_links={0: section_link_text},
section_continuation=(chunk_ind != 0),
metadata_suffix=metadata_suffix,
)
for chunk_ind, chunk_str in enumerate(split_texts)
]
return chunks
def _get_metadata_suffix_for_document_index(
metadata: dict[str, str | list[str]]
) -> str:
if not metadata:
return ""
metadata_str = "Metadata:\n"
for key, value in metadata.items():
if key in get_metadata_keys_to_ignore():
continue
value_str = ", ".join(value) if isinstance(value, list) else value
metadata_str += f"\t{key} - {value_str}\n"
return metadata_str.strip()
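Roughly what the helper above produces for a small metadata dict, as a standalone mirror (the dict contents are illustrative and the ignore-keys check is omitted for brevity):

metadata = {"tags": ["tag1", "tag2"], "author": "jane"}
metadata_str = "Metadata:\n"
for key, value in metadata.items():
    value_str = ", ".join(value) if isinstance(value, list) else value
    metadata_str += f"\t{key} - {value_str}\n"
print(metadata_str.strip())
# Metadata:
# 	tags - tag1, tag2
# 	author - jane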
def chunk_document(
document: Document,
chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
subsection_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
) -> list[DocAwareChunk]:
title = document.get_title_for_document_index()
title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
tokenizer = get_default_tokenizer()
title = document.get_title_for_document_index()
title_prefix = f"{title}{RETURN_SEPARATOR}"[:MAX_CHUNK_TITLE_LEN] if title else ""
title_tokens = len(tokenizer.tokenize(title_prefix))
metadata_suffix = ""
metadata_tokens = 0
if include_metadata:
metadata = _get_metadata_suffix_for_document_index(document.metadata)
metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
metadata_tokens = len(tokenizer.tokenize(metadata_suffix))
if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
metadata_suffix = ""
metadata_tokens = 0
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens
# If there is not enough context remaining then just index the chunk with no prefix/suffix
if content_token_limit <= CHUNK_MIN_CONTENT:
content_token_limit = chunk_tok_size
title_prefix = ""
metadata_suffix = ""
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
for ind, section in enumerate(document.sections):
section_text = title_prefix + section.text if ind == 0 else section.text
for section in document.sections:
section_text = section.text
section_link_text = section.link or ""
section_tok_length = len(tokenizer.tokenize(section_text))
@ -92,16 +145,18 @@ def chunk_document(
# Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
# at the end by other sections
if section_tok_length > chunk_tok_size:
if section_tok_length > content_token_limit:
if chunk_text:
chunks.append(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
content=f"{title_prefix}{chunk_text}{metadata_suffix}",
content_summary=chunk_text,
source_links=link_offsets,
section_continuation=False,
metadata_suffix=metadata_suffix,
)
)
link_offsets = {}
@ -113,9 +168,11 @@ def chunk_document(
document=document,
start_chunk_id=len(chunks),
tokenizer=tokenizer,
chunk_size=chunk_tok_size,
chunk_size=content_token_limit,
chunk_overlap=subsection_overlap,
blurb_size=blurb_size,
title_prefix=title_prefix,
metadata_suffix=metadata_suffix,
)
chunks.extend(large_section_chunks)
continue
@ -125,7 +182,7 @@ def chunk_document(
current_tok_length
+ len(tokenizer.tokenize(SECTION_SEPARATOR))
+ section_tok_length
<= chunk_tok_size
<= content_token_limit
):
chunk_text += (
SECTION_SEPARATOR + section_text if chunk_text else section_text
@ -137,9 +194,11 @@ def chunk_document(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
content=f"{title_prefix}{chunk_text}{metadata_suffix}",
content_summary=chunk_text,
source_links=link_offsets,
section_continuation=False,
metadata_suffix=metadata_suffix,
)
)
link_offsets = {0: section_link_text}
@ -153,9 +212,11 @@ def chunk_document(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
content=f"{title_prefix}{chunk_text}{metadata_suffix}",
content_summary=chunk_text,
source_links=link_offsets,
section_continuation=False,
metadata_suffix=metadata_suffix,
)
)
return chunks
@ -164,6 +225,9 @@ def chunk_document(
def split_chunk_text_into_mini_chunks(
chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
) -> list[str]:
"""The minichunks won't all have the title prefix or metadata suffix
It could be a significant percentage of every minichunk so better to not include it
"""
from llama_index.text_splitter import SentenceSplitter
token_count_func = get_default_tokenizer().tokenize
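A rough worked example of the content_token_limit budget computed in chunk_document above, with assumed token counts (512-token chunks, a 20-token title prefix, a 40-token metadata suffix):

chunk_tok_size = 512
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256

title_tokens, metadata_tokens = 20, 40  # assumed counts for illustration
if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:  # 40 < 128, so the metadata suffix is kept
    metadata_tokens = 0  # (the real code also clears metadata_suffix here)
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 512 - 20 - 40 = 452
if content_token_limit <= CHUNK_MIN_CONTENT:  # 452 > 256, so the prefix/suffix stay attached
    content_token_limit = chunk_tok_size
print(content_token_limit)  # 452 tokens remain for the section text itself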

View File

@ -81,7 +81,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
for chunk_ind, chunk in enumerate(chunks):
chunk_texts.append(chunk.content)
mini_chunk_texts = (
split_chunk_text_into_mini_chunks(chunk.content)
split_chunk_text_into_mini_chunks(chunk.content_summary)
if enable_mini_chunk
else []
)

View File

@ -36,6 +36,16 @@ class DocAwareChunk(BaseChunk):
# During inference we only have access to the document id and do not reconstruct the Document
source_document: Document
# The Vespa documents require a separate highlight field. Since the content is stored as a duplicate anyway,
# it is easier to store a string without the title prefix or metadata suffix for highlighting.
# During chunking, this same un-prefixed/un-suffixed string is also used for the mini-chunks
content_summary: str
# During indexing we also (optionally) build a metadata string from the metadata dict.
# This string is indexed as well so that it can be stripped back out of the content at retrieval time,
# which keeps multiple iterations of the metadata representation backwards compatible
metadata_suffix: str
def to_short_descriptor(self) -> str:
"""Used when logging the identity of a chunk"""
return (

View File

@ -189,6 +189,20 @@ class InferenceChunk(BaseChunk):
return self.score > other.score
class InferenceChunkUncleaned(InferenceChunk):
title: str  # Separate from the semantic identifier, though they are often the same
metadata_suffix: str
def to_inference_chunk(self) -> InferenceChunk:
# Create a dict of all fields except 'title' and 'metadata_suffix'
inference_chunk_data = {
k: v
for k, v in self.dict().items()
if k not in ["title", "metadata_suffix"]
}
return InferenceChunk(**inference_chunk_data)
class InferenceSection(BaseModel):
"""Section list of chunks with a combined content. A section could be a single chunk, several
chunks from the same document or the entire document."""
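A minimal sketch of the to_inference_chunk pattern above using stand-in pydantic models (the model and field names other than title/metadata_suffix are illustrative; pydantic v1's .dict() is assumed, as in the code above):

from pydantic import BaseModel

class Cleaned(BaseModel):
    content: str

class Uncleaned(Cleaned):
    title: str
    metadata_suffix: str

    def to_cleaned(self) -> Cleaned:
        # drop the index-only fields before handing the chunk to downstream consumers
        data = {k: v for k, v in self.dict().items() if k not in ["title", "metadata_suffix"]}
        return Cleaned(**data)

print(Uncleaned(content="chunk body", title="t", metadata_suffix="m").to_cleaned())  # content='chunk body'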

View File

@ -4,6 +4,8 @@ from typing import cast
import numpy
from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
from danswer.document_index.document_index_utils import (
@ -12,6 +14,7 @@ from danswer.document_index.document_index_utils import (
from danswer.llm.interfaces import LLM
from danswer.search.models import ChunkMetric
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.models import InferenceSection
from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RerankMetricsContainer
@ -47,6 +50,33 @@ def should_apply_llm_based_relevance_filter(query: SearchQuery) -> bool:
return not query.skip_llm_chunk_filter
def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk]:
def _remove_title(chunk: InferenceChunkUncleaned) -> str:
if not chunk.title or not chunk.content:
return chunk.content
if chunk.content.startswith(chunk.title):
return chunk.content[len(chunk.title) :].lstrip()
if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()
return chunk.content
def _remove_metadata_suffix(chunk: InferenceChunkUncleaned) -> str:
if not chunk.metadata_suffix:
return chunk.content
return chunk.content.removesuffix(chunk.metadata_suffix).rstrip(
RETURN_SEPARATOR
)
for chunk in chunks:
chunk.content = _remove_title(chunk)
chunk.content = _remove_metadata_suffix(chunk)
return [chunk.to_inference_chunk() for chunk in chunks]
@log_function_time(print_only=True)
def semantic_reranking(
query: str,
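A standalone sketch of what cleanup_chunks above effectively does to a single chunk's content, assuming RETURN_SEPARATOR = "\n\r\n" and an illustrative title and metadata suffix:

RETURN_SEPARATOR = "\n\r\n"
title = "Quarterly Report"
metadata_suffix = "Metadata:\n\ttags - finance"
content = f"{title}{RETURN_SEPARATOR}The real chunk text.{RETURN_SEPARATOR}{metadata_suffix}"

# strip the title prefix, then the metadata suffix, mirroring _remove_title and _remove_metadata_suffix
if content.startswith(title):
    content = content[len(title):].lstrip()
content = content.removesuffix(metadata_suffix).rstrip(RETURN_SEPARATOR)
print(content)  # "The real chunk text."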

View File

@ -20,6 +20,7 @@ from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RetrievalMetricsContainer
from danswer.search.models import SearchQuery
from danswer.search.models import SearchType
from danswer.search.postprocessing.postprocessing import cleanup_chunks
from danswer.search.search_nlp_models import EmbeddingModel
from danswer.search.utils import inference_section_from_chunks
from danswer.secondary_llm_flows.query_expansion import multilingual_query_expansion
@ -160,7 +161,7 @@ def doc_index_retrieval(
else:
raise RuntimeError("Invalid Search Flow")
return top_chunks
return cleanup_chunks(top_chunks)
def _simplify_text(text: str) -> str:

View File

@ -0,0 +1,38 @@
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.chunker import chunk_document
def test_chunk_document() -> None:
short_section_1 = "This is a short section."
long_section = (
"This is a long section that should be split into multiple chunks. " * 100
)
short_section_2 = "This is another short section."
short_section_3 = "This is another short section again."
short_section_4 = "Final short section."
semantic_identifier = "Test Document"
document = Document(
id="test_doc",
source=DocumentSource.WEB,
semantic_identifier=semantic_identifier,
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text=short_section_1, link="link1"),
Section(text=short_section_2, link="link2"),
Section(text=long_section, link="link3"),
Section(text=short_section_3, link="link4"),
Section(text=short_section_4, link="link5"),
],
)
chunks = chunk_document(document)
assert len(chunks) == 5
assert all(semantic_identifier in chunk.content for chunk in chunks)
assert short_section_1 in chunks[0].content
assert short_section_3 in chunks[-1].content
assert short_section_4 in chunks[-1].content
assert "tag1" in chunks[0].content