Mirror of https://github.com/danswer-ai/danswer.git

Handling Metadata by Vector and Keyword (#1909)

Commit: 9651ea828b
Parent: 6ee74bd0d1
@@ -44,7 +44,6 @@ QUERY_EVENT_ID = "query_event_id"
 LLM_CHUNKS = "llm_chunks"
 
 # For chunking/processing chunks
-MAX_CHUNK_TITLE_LEN = 1000
 RETURN_SEPARATOR = "\n\r\n"
 SECTION_SEPARATOR = "\n\n"
 # For combining attributes, doesn't have to be unique/perfect to work
@@ -114,7 +114,9 @@ class DocumentBase(BaseModel):
     title: str | None = None
     from_ingestion_api: bool = False
 
-    def get_title_for_document_index(self) -> str | None:
+    def get_title_for_document_index(
+        self,
+    ) -> str | None:
         # If title is explicitly empty, return a None here for embedding purposes
         if self.title == "":
             return None
@@ -123,8 +125,6 @@ class DocumentBase(BaseModel):
         for char in replace_chars:
             title = title.replace(char, " ")
         title = title.strip()
-        # Title could be quite long here as there is no truncation done
-        # just prior to embedding, it could be truncated
         return title
 
     def get_metadata_str_attributes(self) -> list[str] | None:
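
A minimal sketch of the contract this method now provides (not the repo's exact code; the replace_chars set and the None handling here are assumptions): an explicitly empty title means "no title for embedding purposes", which the Vespa hunk further down turns into SKIP_TITLE_EMBEDDING, while non-empty titles are only cleaned here and truncated later via the blurb splitter.

# Illustrative sketch only; mirrors the behavior shown in the hunks above.
def get_title_for_document_index_sketch(title: str | None) -> str | None:
    if title == "":
        return None  # explicitly empty -> skip title embedding downstream
    if title is None:
        return None  # assumption: a missing title is passed through as None
    for char in ["\n", "\r", "\t"]:  # assumption: the real replace_chars set may differ
        title = title.replace(char, " ")
    return title.strip()  # no truncation here anymore

assert get_title_for_document_index_sketch("") is None
assert get_title_for_document_index_sketch("Design\nDoc ") == "Design Doc"
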
@@ -352,11 +352,15 @@ def _index_vespa_chunk(
         BLURB: remove_invalid_unicode_chars(chunk.blurb),
         TITLE: remove_invalid_unicode_chars(title) if title else None,
         SKIP_TITLE_EMBEDDING: not title,
-        CONTENT: remove_invalid_unicode_chars(chunk.content),
+        # For the BM25 index, the keyword suffix is used, the vector is already generated with the more
+        # natural language representation of the metadata section
+        CONTENT: remove_invalid_unicode_chars(
+            f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
+        ),
         # This duplication of `content` is needed for keyword highlighting
         # Note that it's not exactly the same as the actual content
         # which contains the title prefix and metadata suffix
-        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
+        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
         SOURCE_TYPE: str(document.source.value),
         SOURCE_LINKS: json.dumps(chunk.source_links),
         SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@@ -364,7 +368,7 @@ def _index_vespa_chunk(
         METADATA: json.dumps(document.metadata),
         # Save as a list for efficient extraction as an Attribute
         METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
-        METADATA_SUFFIX: chunk.metadata_suffix,
+        METADATA_SUFFIX: chunk.metadata_suffix_keyword,
         EMBEDDINGS: embeddings_name_vector_map,
         TITLE_EMBEDDING: chunk.title_embedding,
         BOOST: chunk.boost,
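
To make the new field layout concrete, here is roughly what one chunk contributes to the keyword-searchable fields (hypothetical values; only the field and attribute names come from the hunks above):

# Sketch: the BM25-indexed CONTENT now carries the title prefix and the keyword
# metadata suffix, while CONTENT_SUMMARY stays un-prefixed so highlighting and
# mini-chunks operate on the bare text.
title_prefix = "Design Doc\n\r\n"                       # ends with RETURN_SEPARATOR
content = "The system uses Vespa for hybrid search."
metadata_suffix_keyword = "\n\r\nJohn Doe Engineering"  # values only, no keys

bm25_content = f"{title_prefix}{content}{metadata_suffix_keyword}"
content_summary = content
metadata_suffix_field = metadata_suffix_keyword         # stored so it can be stripped later
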
@@ -6,7 +6,6 @@ from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
 from danswer.configs.constants import DocumentSource
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.constants import SECTION_SEPARATOR
 from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -20,7 +19,7 @@ from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
 
 if TYPE_CHECKING:
-    from transformers import AutoTokenizer  # type:ignore
+    from llama_index.text_splitter import SentenceSplitter  # type:ignore
 
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
@@ -28,6 +27,8 @@ if TYPE_CHECKING:
 CHUNK_OVERLAP = 0
 # Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
 # overwhelm the actual contents of the chunk
+# For example in a rare case, this could be 128 tokens for the 512 chunk and title prefix
+# could be another 128 tokens leaving 256 for the actual contents
 MAX_METADATA_PERCENTAGE = 0.25
 CHUNK_MIN_CONTENT = 256
 
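
The budget these constants control can be made concrete with a small worked example (values are illustrative; the guard and the fallback referenced in the comments appear in chunk_document further down in this diff):

# Illustrative arithmetic only; mirrors the budget logic in chunk_document below.
chunk_tok_size = 512       # e.g. DOC_EMBEDDING_CONTEXT_SIZE
title_tokens = 20
metadata_tokens = 100      # kept, since 100 < 512 * MAX_METADATA_PERCENTAGE == 128
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 392

# Had metadata_tokens reached 128 (>= 25% of the chunk), the semantic metadata
# suffix would be dropped (the keyword suffix is kept regardless). And if
# content_token_limit ever fell to CHUNK_MIN_CONTENT (256) or below,
# chunk_document resets it to the full chunk_tok_size and drops both the title
# prefix and the semantic suffix.
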
@@ -36,14 +37,7 @@ logger = setup_logger()
 ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
-def extract_blurb(text: str, blurb_size: int) -> str:
-    from llama_index.text_splitter import SentenceSplitter
-
-    token_count_func = get_default_tokenizer().tokenize
-    blurb_splitter = SentenceSplitter(
-        tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
-    )
-
+def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
     return blurb_splitter.split_text(text)[0]
 
 
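
With the splitter now injected by the caller, extracting a blurb looks roughly like this (a sketch that assumes the chunker module's get_default_tokenizer and BLURB_SIZE are in scope, as they are elsewhere in this file):

# Sketch of the new calling pattern; chunk_document below builds the splitter once
# and reuses it for every blurb instead of re-creating it inside extract_blurb.
from llama_index.text_splitter import SentenceSplitter

tokenizer = get_default_tokenizer()
blurb_splitter = SentenceSplitter(
    tokenizer=tokenizer.tokenize, chunk_size=BLURB_SIZE, chunk_overlap=0
)
blurb = extract_blurb("Some section text that needs a short blurb.", blurb_splitter)
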
@@ -52,33 +46,25 @@ def chunk_large_section(
     section_link_text: str,
     document: Document,
     start_chunk_id: int,
-    tokenizer: "AutoTokenizer",
-    chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
-    chunk_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb: str,
+    chunk_splitter: "SentenceSplitter",
     title_prefix: str = "",
-    metadata_suffix: str = "",
+    metadata_suffix_semantic: str = "",
+    metadata_suffix_keyword: str = "",
 ) -> list[DocAwareChunk]:
-    from llama_index.text_splitter import SentenceSplitter
-
-    blurb = extract_blurb(section_text, blurb_size)
-
-    sentence_aware_splitter = SentenceSplitter(
-        tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-
-    split_texts = sentence_aware_splitter.split_text(section_text)
+    split_texts = chunk_splitter.split_text(section_text)
 
     chunks = [
         DocAwareChunk(
             source_document=document,
             chunk_id=start_chunk_id + chunk_ind,
             blurb=blurb,
-            content=f"{title_prefix}{chunk_str}{metadata_suffix}",
-            content_summary=chunk_str,
+            content=chunk_str,
             source_links={0: section_link_text},
             section_continuation=(chunk_ind != 0),
-            metadata_suffix=metadata_suffix,
+            title_prefix=title_prefix,
+            metadata_suffix_semantic=metadata_suffix_semantic,
+            metadata_suffix_keyword=metadata_suffix_keyword,
         )
         for chunk_ind, chunk_str in enumerate(split_texts)
     ]
@@ -86,42 +72,87 @@ def chunk_large_section(
 
 
 def _get_metadata_suffix_for_document_index(
-    metadata: dict[str, str | list[str]]
-) -> str:
+    metadata: dict[str, str | list[str]], include_separator: bool = False
+) -> tuple[str, str]:
+    """
+    Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
+    and a string of all of the values for the keyword search
+
+    For example, if we have the following metadata:
+    {
+        "author": "John Doe",
+        "space": "Engineering"
+    }
+    The vector embedding string should include the relation between the key and value wheres as for keyword we only want John Doe
+    and Engineering. The keys are repeat and much more noisy.
+    """
     if not metadata:
-        return ""
+        return "", ""
 
     metadata_str = "Metadata:\n"
+    metadata_values = []
     for key, value in metadata.items():
         if key in get_metadata_keys_to_ignore():
             continue
+
         value_str = ", ".join(value) if isinstance(value, list) else value
+
+        if isinstance(value, list):
+            metadata_values.extend(value)
+        else:
+            metadata_values.append(value)
+
         metadata_str += f"\t{key} - {value_str}\n"
-    return metadata_str.strip()
+
+    metadata_semantic = metadata_str.strip()
+    metadata_keyword = " ".join(metadata_values)
+
+    if include_separator:
+        return RETURN_SEPARATOR + metadata_semantic, RETURN_SEPARATOR + metadata_keyword
+    return metadata_semantic, metadata_keyword
 
 
 def chunk_document(
     document: Document,
     chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
     subsection_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb_size: int = BLURB_SIZE,  # Used for both title and content
     include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
 ) -> list[DocAwareChunk]:
+    from llama_index.text_splitter import SentenceSplitter
+
     tokenizer = get_default_tokenizer()
 
-    title = document.get_title_for_document_index()
-    title_prefix = f"{title[:MAX_CHUNK_TITLE_LEN]}{RETURN_SEPARATOR}" if title else ""
+    blurb_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize, chunk_size=blurb_size, chunk_overlap=0
+    )
+
+    chunk_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize,
+        chunk_size=chunk_tok_size,
+        chunk_overlap=subsection_overlap,
+    )
+
+    title = extract_blurb(document.get_title_for_document_index() or "", blurb_splitter)
+    title_prefix = title + RETURN_SEPARATOR if title else ""
     title_tokens = len(tokenizer.tokenize(title_prefix))
 
-    metadata_suffix = ""
+    metadata_suffix_semantic = ""
+    metadata_suffix_keyword = ""
     metadata_tokens = 0
     if include_metadata:
-        metadata = _get_metadata_suffix_for_document_index(document.metadata)
-        metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
-        metadata_tokens = len(tokenizer.tokenize(metadata_suffix))
+        (
+            metadata_suffix_semantic,
+            metadata_suffix_keyword,
+        ) = _get_metadata_suffix_for_document_index(
+            document.metadata, include_separator=True
+        )
+        metadata_tokens = len(tokenizer.tokenize(metadata_suffix_semantic))
 
     if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
-        metadata_suffix = ""
+        # Note: we can keep the keyword suffix even if the semantic suffix is too long to fit in the model
+        # context, there is no limit for the keyword component
+        metadata_suffix_semantic = ""
         metadata_tokens = 0
 
     content_token_limit = chunk_tok_size - title_tokens - metadata_tokens
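
For the example in the new docstring, the helper would return roughly the following (a sketch of the expected values, assuming neither key is in get_metadata_keys_to_ignore()):

metadata = {"author": "John Doe", "space": "Engineering"}

semantic, keyword = _get_metadata_suffix_for_document_index(metadata)
# semantic == "Metadata:\n\tauthor - John Doe\n\tspace - Engineering"
#   (keys and values, fed to the vector embedding)
# keyword == "John Doe Engineering"
#   (values only, appended to the BM25-indexed content)
#
# With include_separator=True, as chunk_document calls it, both strings are
# prefixed with RETURN_SEPARATOR ("\n\r\n").
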
@@ -130,7 +161,7 @@ def chunk_document(
     if content_token_limit <= CHUNK_MIN_CONTENT:
         content_token_limit = chunk_tok_size
         title_prefix = ""
-        metadata_suffix = ""
+        metadata_suffix_semantic = ""
 
     chunks: list[DocAwareChunk] = []
     link_offsets: dict[int, str] = {}
@@ -151,12 +182,13 @@ def chunk_document(
                 DocAwareChunk(
                     source_document=document,
                     chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                 )
             )
             link_offsets = {}
@@ -167,12 +199,11 @@ def chunk_document(
                 section_link_text=section_link_text,
                 document=document,
                 start_chunk_id=len(chunks),
-                tokenizer=tokenizer,
-                chunk_size=content_token_limit,
-                chunk_overlap=subsection_overlap,
-                blurb_size=blurb_size,
+                chunk_splitter=chunk_splitter,
+                blurb=extract_blurb(section_text, blurb_splitter),
                 title_prefix=title_prefix,
-                metadata_suffix=metadata_suffix,
+                metadata_suffix_semantic=metadata_suffix_semantic,
+                metadata_suffix_keyword=metadata_suffix_keyword,
             )
             chunks.extend(large_section_chunks)
             continue
@@ -193,12 +224,13 @@ def chunk_document(
                 DocAwareChunk(
                     source_document=document,
                     chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                 )
            )
            link_offsets = {0: section_link_text}
@@ -211,12 +243,13 @@ def chunk_document(
             DocAwareChunk(
                 source_document=document,
                 chunk_id=len(chunks),
-                blurb=extract_blurb(chunk_text, blurb_size),
-                content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                content_summary=chunk_text,
+                blurb=extract_blurb(chunk_text, blurb_splitter),
+                content=chunk_text,
                 source_links=link_offsets,
                 section_continuation=False,
-                metadata_suffix=metadata_suffix,
+                title_prefix=title_prefix,
+                metadata_suffix_semantic=metadata_suffix_semantic,
+                metadata_suffix_keyword=metadata_suffix_keyword,
             )
         )
     return chunks
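
After these chunker changes, a chunk carries its pieces separately instead of baking the prefix and suffix into content. Roughly, with hypothetical values reusing the earlier metadata example:

# Sketch of the fields a resulting DocAwareChunk now holds; the vector index and
# the keyword index recombine them differently (see the embedder and Vespa hunks).
chunk = DocAwareChunk(
    source_document=document,
    chunk_id=0,
    blurb="The system uses Vespa for hybrid search.",
    content="The system uses Vespa for hybrid search.",   # bare section text only
    source_links={0: "https://example.com/design-doc"},
    section_continuation=False,
    title_prefix="Design Doc\n\r\n",
    metadata_suffix_semantic="\n\r\nMetadata:\n\tauthor - John Doe\n\tspace - Engineering",
    metadata_suffix_keyword="\n\r\nJohn Doe Engineering",
)
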
@@ -81,9 +81,12 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
         chunk_texts: list[str] = []
         chunk_mini_chunks_count = {}
         for chunk_ind, chunk in enumerate(chunks):
-            chunk_texts.append(chunk.content)
+            # The whole chunk including the prefix/suffix is included in the overall vector representation
+            chunk_texts.append(
+                f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_semantic}"
+            )
             mini_chunk_texts = (
-                split_chunk_text_into_mini_chunks(chunk.content_summary)
+                split_chunk_text_into_mini_chunks(chunk.content)
                 if enable_mini_chunk
                 else []
             )
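
Put differently, the text that gets embedded now includes the natural-language metadata block, while mini-chunks are derived from the bare content (hypothetical values matching the sketches above):

# What gets embedded vs. what mini-chunks are built from, for one chunk:
embedded_text = (
    "Design Doc\n\r\n"                                             # chunk.title_prefix
    "The system uses Vespa for hybrid search."                     # chunk.content
    "\n\r\nMetadata:\n\tauthor - John Doe\n\tspace - Engineering"  # chunk.metadata_suffix_semantic
)
mini_chunk_source = "The system uses Vespa for hybrid search."     # chunk.content only
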
@@ -36,15 +36,13 @@ class DocAwareChunk(BaseChunk):
     # During inference we only have access to the document id and do not reconstruct the Document
     source_document: Document
 
-    # The Vespa documents require a separate highlight field. Since it is stored as a duplicate anyway,
-    # it's easier to just store a not prefixed/suffixed string for the highlighting
-    # Also during the chunking, this non-prefixed/suffixed string is used for mini-chunks
-    content_summary: str
+    title_prefix: str
 
     # During indexing we also (optionally) build a metadata string from the metadata dict
     # This is also indexed so that we can strip it out after indexing, this way it supports
     # multiple iterations of metadata representation for backwards compatibility
-    metadata_suffix: str
+    metadata_suffix_semantic: str
+    metadata_suffix_keyword: str
 
     def to_short_descriptor(self) -> str:
         """Used when logging the identity of a chunk"""
@@ -4,7 +4,7 @@ from typing import cast
 
 import numpy
 
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
+from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
@@ -60,8 +60,14 @@ def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk
         if chunk.content.startswith(chunk.title):
             return chunk.content[len(chunk.title) :].lstrip()
 
-        if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
-            return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()
+        # BLURB SIZE is by token instead of char but each token is at least 1 char
+        # If this prefix matches the content, it's assumed the title was prepended
+        if chunk.content.startswith(chunk.title[:BLURB_SIZE]):
+            return (
+                chunk.content.split(RETURN_SEPARATOR, 1)[-1]
+                if RETURN_SEPARATOR in chunk.content
+                else chunk.content
+            )
 
         return chunk.content
 
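
An illustration (made-up values) of the new fallback: when the content starts with a prefix of the stored title but not the exact title, so the first branch above did not fire, the prepended title is dropped by splitting on the RETURN_SEPARATOR that chunking inserts between title and body, instead of slicing a fixed MAX_CHUNK_TITLE_LEN character count.

RETURN_SEPARATOR = "\n\r\n"
content = "Design Doc (blurb-truncated title)\n\r\nThe system uses Vespa for hybrid search."

cleaned = (
    content.split(RETURN_SEPARATOR, 1)[-1]
    if RETURN_SEPARATOR in content
    else content
)
# cleaned == "The system uses Vespa for hybrid search."
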
@@ -31,8 +31,8 @@ def test_chunk_document() -> None:
 
     chunks = chunk_document(document)
     assert len(chunks) == 5
-    assert all(semantic_identifier in chunk.content for chunk in chunks)
     assert short_section_1 in chunks[0].content
     assert short_section_3 in chunks[-1].content
     assert short_section_4 in chunks[-1].content
-    assert "tag1" in chunks[0].content
+    assert "tag1" in chunks[0].metadata_suffix_keyword
+    assert "tag2" in chunks[0].metadata_suffix_semantic