From 9651ea828b02f3832f1eeaf3fb6fd032cc839051 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Wed, 24 Jul 2024 11:05:56 -0700
Subject: [PATCH] Handling Metadata by Vector and Keyword (#1909)

---
 backend/danswer/configs/constants.py          |   1 -
 backend/danswer/connectors/models.py          |   6 +-
 backend/danswer/document_index/vespa/index.py |  10 +-
 backend/danswer/indexing/chunker.py           | 147 +++++++++++-------
 backend/danswer/indexing/embedder.py          |   7 +-
 backend/danswer/indexing/models.py            |   8 +-
 .../search/postprocessing/postprocessing.py   |  12 +-
 .../unit/danswer/indexing/test_chunker.py     |   4 +-
 8 files changed, 119 insertions(+), 76 deletions(-)

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index fb14bb4b3d..c536d9189d 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -44,7 +44,6 @@ QUERY_EVENT_ID = "query_event_id"
 LLM_CHUNKS = "llm_chunks"
 
 # For chunking/processing chunks
-MAX_CHUNK_TITLE_LEN = 1000
 RETURN_SEPARATOR = "\n\r\n"
 SECTION_SEPARATOR = "\n\n"
 # For combining attributes, doesn't have to be unique/perfect to work
diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py
index 11925bdc6a..1f4a1d2ae3 100644
--- a/backend/danswer/connectors/models.py
+++ b/backend/danswer/connectors/models.py
@@ -114,7 +114,9 @@ class DocumentBase(BaseModel):
     title: str | None = None
     from_ingestion_api: bool = False
 
-    def get_title_for_document_index(self) -> str | None:
+    def get_title_for_document_index(
+        self,
+    ) -> str | None:
         # If title is explicitly empty, return a None here for embedding purposes
         if self.title == "":
             return None
@@ -123,8 +125,6 @@ class DocumentBase(BaseModel):
         for char in replace_chars:
             title = title.replace(char, " ")
         title = title.strip()
-        # Title could be quite long here as there is no truncation done
-        # just prior to embedding, it could be truncated
         return title
 
     def get_metadata_str_attributes(self) -> list[str] | None:
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index d5815be601..c238da6e7d 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -352,11 +352,15 @@ def _index_vespa_chunk(
         BLURB: remove_invalid_unicode_chars(chunk.blurb),
         TITLE: remove_invalid_unicode_chars(title) if title else None,
         SKIP_TITLE_EMBEDDING: not title,
-        CONTENT: remove_invalid_unicode_chars(chunk.content),
+        # For the BM25 index, the keyword suffix is used; the vector is already generated with the
+        # more natural language representation of the metadata section
+        CONTENT: remove_invalid_unicode_chars(
+            f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
+        ),
         # This duplication of `content` is needed for keyword highlighting
         # Note that it's not exactly the same as the actual content
         # which contains the title prefix and metadata suffix
-        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
+        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
         SOURCE_TYPE: str(document.source.value),
         SOURCE_LINKS: json.dumps(chunk.source_links),
         SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@@ -364,7 +368,7 @@ def _index_vespa_chunk(
         METADATA: json.dumps(document.metadata),
         # Save as a list for efficient extraction as an Attribute
         METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
-        METADATA_SUFFIX: chunk.metadata_suffix,
+        METADATA_SUFFIX: chunk.metadata_suffix_keyword,
         EMBEDDINGS: embeddings_name_vector_map,
         TITLE_EMBEDDING: chunk.title_embedding,
         BOOST: chunk.boost,
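# Illustrative sketch (not part of the patch), with assumed example values, of what the two
# Vespa fields above end up holding for one chunk: CONTENT (the BM25-indexed text) now carries
# the title prefix and the keyword-style metadata suffix, while CONTENT_SUMMARY keeps the bare
# chunk text so keyword highlighting is not polluted by the prefix/suffix.
title_prefix = "Onboarding Guide\n\r\n"
chunk_content = "Step 1: request laptop access..."
metadata_suffix_keyword = "\n\r\nJohn Doe Engineering"

content_field = f"{title_prefix}{chunk_content}{metadata_suffix_keyword}"  # indexed for BM25
content_summary_field = chunk_content  # used only for keyword highlighting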
diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index 1b4d65afbc..cddcd0c195 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -6,7 +6,6 @@ from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
 from danswer.configs.constants import DocumentSource
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.constants import SECTION_SEPARATOR
 from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -20,7 +19,7 @@ from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
 
 if TYPE_CHECKING:
-    from transformers import AutoTokenizer  # type:ignore
+    from llama_index.text_splitter import SentenceSplitter  # type:ignore
 
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
@@ -28,6 +27,8 @@ if TYPE_CHECKING:
 CHUNK_OVERLAP = 0
 # Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
 # overwhelm the actual contents of the chunk
+# For example, in a rare case the metadata could be 128 tokens of a 512 token chunk and the title prefix
+# could be another 128 tokens, leaving 256 for the actual contents
 MAX_METADATA_PERCENTAGE = 0.25
 CHUNK_MIN_CONTENT = 256
 
@@ -36,14 +37,7 @@
 logger = setup_logger()
 
 ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
-def extract_blurb(text: str, blurb_size: int) -> str:
-    from llama_index.text_splitter import SentenceSplitter
-
-    token_count_func = get_default_tokenizer().tokenize
-    blurb_splitter = SentenceSplitter(
-        tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
-    )
-
+def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
     return blurb_splitter.split_text(text)[0]
 
 
@@ -52,33 +46,25 @@ def chunk_large_section(
     section_link_text: str,
     document: Document,
     start_chunk_id: int,
-    tokenizer: "AutoTokenizer",
-    chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
-    chunk_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb: str,
+    chunk_splitter: "SentenceSplitter",
     title_prefix: str = "",
-    metadata_suffix: str = "",
+    metadata_suffix_semantic: str = "",
+    metadata_suffix_keyword: str = "",
 ) -> list[DocAwareChunk]:
-    from llama_index.text_splitter import SentenceSplitter
-
-    blurb = extract_blurb(section_text, blurb_size)
-
-    sentence_aware_splitter = SentenceSplitter(
-        tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-
-    split_texts = sentence_aware_splitter.split_text(section_text)
+    split_texts = chunk_splitter.split_text(section_text)
 
     chunks = [
         DocAwareChunk(
            source_document=document,
            chunk_id=start_chunk_id + chunk_ind,
            blurb=blurb,
-           content=f"{title_prefix}{chunk_str}{metadata_suffix}",
-           content_summary=chunk_str,
+           content=chunk_str,
            source_links={0: section_link_text},
            section_continuation=(chunk_ind != 0),
-           metadata_suffix=metadata_suffix,
+           title_prefix=title_prefix,
+           metadata_suffix_semantic=metadata_suffix_semantic,
+           metadata_suffix_keyword=metadata_suffix_keyword,
        )
        for chunk_ind, chunk_str in enumerate(split_texts)
    ]
@@ -86,42 +72,87 @@ def chunk_large_section(
 
 
 def _get_metadata_suffix_for_document_index(
-    metadata: dict[str, str | list[str]]
-) -> str:
+    metadata: dict[str, str | list[str]], include_separator: bool = False
+) -> tuple[str, str]:
+    """
+    Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
+    and a string of all of the values for the keyword search
+
+    For example, if we have the following metadata:
+    {
+        "author": "John Doe",
+        "space": "Engineering"
+    }
+    The vector embedding string should include the relation between the key and value, whereas for keyword we only want John Doe
+    and Engineering. The keys are repetitive and much noisier.
+    """
    if not metadata:
-        return ""
+        return "", ""
+
    metadata_str = "Metadata:\n"
+    metadata_values = []
    for key, value in metadata.items():
        if key in get_metadata_keys_to_ignore():
            continue
+
        value_str = ", ".join(value) if isinstance(value, list) else value
+
+        if isinstance(value, list):
+            metadata_values.extend(value)
+        else:
+            metadata_values.append(value)
+
        metadata_str += f"\t{key} - {value_str}\n"
-    return metadata_str.strip()
+
+    metadata_semantic = metadata_str.strip()
+    metadata_keyword = " ".join(metadata_values)
+
+    if include_separator:
+        return RETURN_SEPARATOR + metadata_semantic, RETURN_SEPARATOR + metadata_keyword
+    return metadata_semantic, metadata_keyword
 
 
 def chunk_document(
    document: Document,
    chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    subsection_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb_size: int = BLURB_SIZE,  # Used for both title and content
    include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
 ) -> list[DocAwareChunk]:
+    from llama_index.text_splitter import SentenceSplitter
+
    tokenizer = get_default_tokenizer()
 
-    title = document.get_title_for_document_index()
-    title_prefix = f"{title[:MAX_CHUNK_TITLE_LEN]}{RETURN_SEPARATOR}" if title else ""
+    blurb_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize, chunk_size=blurb_size, chunk_overlap=0
+    )
+
+    chunk_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize,
+        chunk_size=chunk_tok_size,
+        chunk_overlap=subsection_overlap,
+    )
+
+    title = extract_blurb(document.get_title_for_document_index() or "", blurb_splitter)
+    title_prefix = title + RETURN_SEPARATOR if title else ""
    title_tokens = len(tokenizer.tokenize(title_prefix))
 
-    metadata_suffix = ""
+    metadata_suffix_semantic = ""
+    metadata_suffix_keyword = ""
    metadata_tokens = 0
    if include_metadata:
-        metadata = _get_metadata_suffix_for_document_index(document.metadata)
-        metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
-        metadata_tokens = len(tokenizer.tokenize(metadata_suffix))
+        (
+            metadata_suffix_semantic,
+            metadata_suffix_keyword,
+        ) = _get_metadata_suffix_for_document_index(
+            document.metadata, include_separator=True
+        )
+        metadata_tokens = len(tokenizer.tokenize(metadata_suffix_semantic))
 
    if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
-        metadata_suffix = ""
+        # Note: we can keep the keyword suffix even if the semantic suffix is too long to fit in the model
+        # context; there is no length limit for the keyword component
+        metadata_suffix_semantic = ""
        metadata_tokens = 0
 
    content_token_limit = chunk_tok_size - title_tokens - metadata_tokens
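# Illustrative sketch (not part of the patch): a minimal standalone rendition of the two
# metadata representations produced by _get_metadata_suffix_for_document_index, using the
# docstring's example. The helper name is hypothetical, and it skips the keys-to-ignore filter
# and the include_separator handling.
def render_metadata(metadata: dict[str, str | list[str]]) -> tuple[str, str]:
    semantic = "Metadata:\n"
    keyword_values: list[str] = []
    for key, value in metadata.items():
        value_str = ", ".join(value) if isinstance(value, list) else value
        keyword_values.extend(value if isinstance(value, list) else [value])
        semantic += f"\t{key} - {value_str}\n"
    return semantic.strip(), " ".join(keyword_values)


semantic, keyword = render_metadata({"author": "John Doe", "space": "Engineering"})
# semantic -> "Metadata:\n\tauthor - John Doe\n\tspace - Engineering"  (embedded with the chunk)
# keyword  -> "John Doe Engineering"                                   (appended for BM25 only)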
@@ -130,7 +161,7 @@
    if content_token_limit <= CHUNK_MIN_CONTENT:
        content_token_limit = chunk_tok_size
        title_prefix = ""
-        metadata_suffix = ""
+        metadata_suffix_semantic = ""
 
    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
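# Illustrative sketch (not part of the patch) of the token budget above, with assumed example
# numbers: a 512 token chunk, a 20 token title prefix, and a 100 token semantic metadata suffix.
chunk_tok_size = 512
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256

title_tokens = 20
metadata_tokens = 100

# The semantic metadata suffix is only dropped if it would eat more than 25% of the chunk
if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:  # 100 < 128, so it is kept
    metadata_tokens = 0

content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 512 - 20 - 100 = 392

# If too little room is left for the actual content, the prefix/suffix are skipped entirely
if content_token_limit <= CHUNK_MIN_CONTENT:  # 392 > 256, so prefix/suffix stay
    content_token_limit = chunk_tok_size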
@@ -151,12 +182,13 @@
                DocAwareChunk(
                    source_document=document,
                    chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            )
            link_offsets = {}
@@ -167,12 +199,11 @@
                section_link_text=section_link_text,
                document=document,
                start_chunk_id=len(chunks),
-                tokenizer=tokenizer,
-                chunk_size=content_token_limit,
-                chunk_overlap=subsection_overlap,
-                blurb_size=blurb_size,
+                chunk_splitter=chunk_splitter,
+                blurb=extract_blurb(section_text, blurb_splitter),
                title_prefix=title_prefix,
-                metadata_suffix=metadata_suffix,
+                metadata_suffix_semantic=metadata_suffix_semantic,
+                metadata_suffix_keyword=metadata_suffix_keyword,
            )
            chunks.extend(large_section_chunks)
            continue
@@ -193,12 +224,13 @@
                DocAwareChunk(
                    source_document=document,
                    chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            )
            link_offsets = {0: section_link_text}
@@ -211,12 +243,13 @@
        DocAwareChunk(
            source_document=document,
            chunk_id=len(chunks),
-            blurb=extract_blurb(chunk_text, blurb_size),
-            content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-            content_summary=chunk_text,
+            blurb=extract_blurb(chunk_text, blurb_splitter),
+            content=chunk_text,
            source_links=link_offsets,
            section_continuation=False,
-            metadata_suffix=metadata_suffix,
+            title_prefix=title_prefix,
+            metadata_suffix_semantic=metadata_suffix_semantic,
+            metadata_suffix_keyword=metadata_suffix_keyword,
        )
    )
    return chunks
diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py
index 3121839832..1244232e43 100644
--- a/backend/danswer/indexing/embedder.py
+++ b/backend/danswer/indexing/embedder.py
@@ -81,9 +81,12 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        chunk_texts: list[str] = []
        chunk_mini_chunks_count = {}
        for chunk_ind, chunk in enumerate(chunks):
-            chunk_texts.append(chunk.content)
+            # The whole chunk, including the prefix/suffix, is included in the overall vector representation
+            chunk_texts.append(
+                f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_semantic}"
+            )
            mini_chunk_texts = (
-                split_chunk_text_into_mini_chunks(chunk.content_summary)
+                split_chunk_text_into_mini_chunks(chunk.content)
                if enable_mini_chunk
                else []
            )
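# Illustrative sketch (not part of the patch), with assumed example values, of what the embedder
# above now feeds to the embedding model versus what the mini-chunks are split from.
title_prefix = "Onboarding Guide\n\r\n"
chunk_content = "Step 1: request laptop access..."
metadata_suffix_semantic = "\n\r\nMetadata:\n\tauthor - John Doe\n\tspace - Engineering"

embedded_text = f"{title_prefix}{chunk_content}{metadata_suffix_semantic}"  # full vector text
mini_chunk_source = chunk_content  # mini-chunks are built from the bare chunk content only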
diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py
index 44fe2dd5a6..207ea30938 100644
--- a/backend/danswer/indexing/models.py
+++ b/backend/danswer/indexing/models.py
@@ -36,15 +36,13 @@ class DocAwareChunk(BaseChunk):
    # During inference we only have access to the document id and do not reconstruct the Document
    source_document: Document
 
-    # The Vespa documents require a separate highlight field. Since it is stored as a duplicate anyway,
-    # it's easier to just store a not prefixed/suffixed string for the highlighting
-    # Also during the chunking, this non-prefixed/suffixed string is used for mini-chunks
-    content_summary: str
+    title_prefix: str
 
    # During indexing we also (optionally) build a metadata string from the metadata dict
    # This is also indexed so that we can strip it out after indexing, this way it supports
    # multiple iterations of metadata representation for backwards compatibility
-    metadata_suffix: str
+    metadata_suffix_semantic: str
+    metadata_suffix_keyword: str
 
    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
diff --git a/backend/danswer/search/postprocessing/postprocessing.py b/backend/danswer/search/postprocessing/postprocessing.py
index 2ab5976127..3a13689c93 100644
--- a/backend/danswer/search/postprocessing/postprocessing.py
+++ b/backend/danswer/search/postprocessing/postprocessing.py
@@ -4,7 +4,7 @@ from typing import cast
 
 import numpy
 
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
+from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
@@ -60,8 +60,14 @@ def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk
 
        if chunk.content.startswith(chunk.title):
            return chunk.content[len(chunk.title) :].lstrip()
 
-        if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
-            return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()
+        # BLURB_SIZE is by token instead of char, but each token is at least 1 char
+        # If this prefix matches the content, it's assumed the title was prepended
+        if chunk.content.startswith(chunk.title[:BLURB_SIZE]):
+            return (
+                chunk.content.split(RETURN_SEPARATOR, 1)[-1]
+                if RETURN_SEPARATOR in chunk.content
+                else chunk.content
+            )
 
        return chunk.content
diff --git a/backend/tests/unit/danswer/indexing/test_chunker.py b/backend/tests/unit/danswer/indexing/test_chunker.py
index 1cd6f66f0c..54e902eb08 100644
--- a/backend/tests/unit/danswer/indexing/test_chunker.py
+++ b/backend/tests/unit/danswer/indexing/test_chunker.py
@@ -31,8 +31,8 @@ def test_chunk_document() -> None:
    chunks = chunk_document(document)
 
    assert len(chunks) == 5
-    assert all(semantic_identifier in chunk.content for chunk in chunks)
    assert short_section_1 in chunks[0].content
    assert short_section_3 in chunks[-1].content
    assert short_section_4 in chunks[-1].content
-    assert "tag1" in chunks[0].content
+    assert "tag1" in chunks[0].metadata_suffix_keyword
+    assert "tag2" in chunks[0].metadata_suffix_semantic
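# Illustrative sketch (not part of the patch) of the new title-stripping branch in
# cleanup_chunks, using an assumed small BLURB_SIZE so the example stays short. The content
# starts with a blurb-truncated form of the title, so the exact-title check fails, but the
# BLURB_SIZE-prefix check matches and everything before the first RETURN_SEPARATOR is dropped.
RETURN_SEPARATOR = "\n\r\n"
BLURB_SIZE = 16  # assumed value for the example; the real config value is larger

title = "Onboarding Guide for New Engineering Hires"
prepended_title = "Onboarding Guide"  # hypothetical blurb-truncated title prefix
content = f"{prepended_title}{RETURN_SEPARATOR}Step 1: request laptop access..."

assert not content.startswith(title)
assert content.startswith(title[:BLURB_SIZE])

cleaned = content.split(RETURN_SEPARATOR, 1)[-1] if RETURN_SEPARATOR in content else content
# cleaned -> "Step 1: request laptop access..."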