From cf2bd8a40c769e6e8d5a9370333baa3c620c9b27 Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 7 Sep 2023 15:57:55 -0700
Subject: [PATCH] highlighting

---
 backend/danswer/chunking/models.py            |  7 ++
 backend/danswer/configs/constants.py          |  1 +
 .../vespa/app_config/schemas/danswer_chunk.sd |  7 ++
 .../datastores/vespa/app_config/services.xml  |  6 ++
 backend/danswer/datastores/vespa/store.py     | 72 +++++++++++--
 backend/danswer/search/semantic_search.py     |  1 +
 backend/danswer/server/models.py              |  4 +
 .../direct_qa/test_question_answer.py         |  2 +
 web/src/components/search/DocumentDisplay.tsx | 99 ++++++++++++++++++-
 .../search/SearchResultsDisplay.tsx           |  2 +-
 web/src/lib/search/interfaces.ts              |  1 +
 11 files changed, 193 insertions(+), 10 deletions(-)

diff --git a/backend/danswer/chunking/models.py b/backend/danswer/chunking/models.py
index 1becd6f56..f9338a8ea 100644
--- a/backend/danswer/chunking/models.py
+++ b/backend/danswer/chunking/models.py
@@ -6,6 +6,7 @@ from typing import cast
 
 from danswer.configs.constants import BLURB
 from danswer.configs.constants import BOOST
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import SCORE
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
@@ -62,6 +63,10 @@ class InferenceChunk(BaseChunk):
     boost: int
     score: float | None
     metadata: dict[str, Any]
+    # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["the <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
+    match_highlights: list[str]
 
     @classmethod
     def from_dict(cls, init_dict: dict[str, Any]) -> "InferenceChunk":
@@ -85,6 +90,8 @@ class InferenceChunk(BaseChunk):
         init_kwargs[BOOST] = init_kwargs.get(BOOST, 1)
         if SCORE not in init_kwargs:
             init_kwargs[SCORE] = None
+        if MATCH_HIGHLIGHTS not in init_kwargs:
+            init_kwargs[MATCH_HIGHLIGHTS] = []
         if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
             logger.error(
                 f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
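A note on the new field for reviewers: `<hi>`/`</hi>` is Vespa's highlight markup and is passed through to callers verbatim, while `from_dict` backfills an empty list for chunks indexed before this change. A standalone Python sketch of that defaulting (the helper name is mine; the constant value and the behavior come from this patch):

```python
MATCH_HIGHLIGHTS = "match_highlights"  # value defined in constants.py below

def default_match_highlights(init_kwargs: dict) -> dict:
    # Chunks indexed before this patch carry no highlight data; default to
    # an empty list so downstream code can iterate without None checks.
    if MATCH_HIGHLIGHTS not in init_kwargs:
        init_kwargs[MATCH_HIGHLIGHTS] = []
    return init_kwargs

print(default_match_highlights({}))
# {'match_highlights': []}
print(default_match_highlights({MATCH_HIGHLIGHTS: ["the <hi>answer</hi> is 42"]}))
# {'match_highlights': ['the <hi>answer</hi> is 42']}
```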
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 6cd220d92..d99cded4c 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -13,6 +13,7 @@ EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
+MATCH_HIGHLIGHTS = "match_highlights"
 # stored in the `metadata` of a chunk. Used to signify that this chunk should
 # not be used for QA. For example, Google Drive file types which can't be parsed
 # are still useful as a search result but not for QA.
diff --git a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
index b89d08d98..6d960990a 100644
--- a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
@@ -21,6 +21,13 @@ schema danswer_chunk {
             }
             index: enable-bm25
         }
+        # duplication of `content` is far from ideal, but is needed for
+        # non-gram based highlighting for now. If the capability to re-use a
+        # single field to do both is added, `content_summary` should be removed
+        field content_summary type string {
+            indexing: summary | index
+            summary: dynamic
+        }
         # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
         field source_type type string {
             indexing: summary | attribute
diff --git a/backend/danswer/datastores/vespa/app_config/services.xml b/backend/danswer/datastores/vespa/app_config/services.xml
index ba7a5376e..189b81f20 100644
--- a/backend/danswer/datastores/vespa/app_config/services.xml
+++ b/backend/danswer/datastores/vespa/app_config/services.xml
@@ -25,5 +25,11 @@
                 <disk>0.98</disk>
             </resource-limits>
         </tuning>
+        <config name="vespa.config.search.summary.juniperrc">
+            <max_matches>3</max_matches>
+            <length>750</length>
+            <surround_max>350</surround_max>
+            <min_length>300</min_length>
+        </config>
     </content>
 </services>
diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/datastores/vespa/store.py
index 5d4b3b0a9..2df52a31a 100644
--- a/backend/danswer/datastores/vespa/store.py
+++ b/backend/danswer/datastores/vespa/store.py
@@ -1,4 +1,5 @@
 import json
+import string
 from collections.abc import Mapping
 from typing import Any
 from typing import cast
@@ -25,6 +26,7 @@ from danswer.configs.constants import CONTENT
 from danswer.configs.constants import DEFAULT_BOOST
 from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import EMBEDDINGS
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import PUBLIC_DOC_PAT
 from danswer.configs.constants import SCORE
@@ -59,6 +61,8 @@ DOCUMENT_ID_ENDPOINT = (
 )
 SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
 _BATCH_SIZE = 100  # Specific to Vespa
+# Specific to Vespa, needed for highlighting matching keywords / section
+CONTENT_SUMMARY = "content_summary"
 
 
 def _get_vespa_document_cross_connector_metadata(
@@ -169,7 +173,9 @@ def _index_vespa_chunks(
             DOCUMENT_ID: document.id,
             CHUNK_ID: chunk.chunk_id,
             BLURB: chunk.blurb,
+            # this duplication of `content` is needed for keyword highlighting :(
             CONTENT: chunk.content,
+            CONTENT_SUMMARY: chunk.content,
             SOURCE_TYPE: str(document.source.value),
             SOURCE_LINKS: json.dumps(chunk.source_links),
             SEMANTIC_IDENTIFIER: document.semantic_identifier,
@@ -222,6 +228,9 @@ def _index_vespa_chunks(
             vespa_document_fields[CONTENT] = remove_invalid_unicode_chars(
                 cast(str, vespa_document_fields[CONTENT])
             )
+            vespa_document_fields[CONTENT_SUMMARY] = remove_invalid_unicode_chars(
+                cast(str, vespa_document_fields[CONTENT_SUMMARY])
+            )
             _index_chunk(vespa_url, json_header, vespa_document_fields)
 
             insertion_records.add(
@@ -272,6 +281,30 @@ def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
     return f" limit {num_to_retrieve} offset {offset}"
 
 
+def _process_dynamic_summary(
+    dynamic_summary: str, max_summary_length: int = 400
+) -> list[str]:
+    current_length = 0
+    processed_summary: list[str] = []
+    for summary_section in dynamic_summary.split("<sep />"):
+        force_break = False
+
+        # if we're past the desired max length, break at the last word
+        if current_length + len(summary_section) > max_summary_length:
+            summary_section = summary_section[: max_summary_length - current_length]
+            summary_section = summary_section.rsplit(" ", 1)[0]
+            if summary_section[-1] in string.punctuation:
+                summary_section = summary_section[:-1]
+            summary_section += "..."
+            force_break = True
+
+        processed_summary.append(summary_section)
+        current_length += len(summary_section)
+        if current_length >= max_summary_length or force_break:
+            break
+    return processed_summary
+
+
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
         raise ValueError(
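To make the truncation behavior above concrete, here is a hedged usage sketch. It assumes `_process_dynamic_summary` as defined in this hunk; the summary strings are invented, but the `<sep />` / `<hi>` markers follow Vespa's dynamic-summary format:

```python
# Vespa joins summary fragments with "<sep />" and wraps matched terms in
# "<hi>...</hi>"; the markers are passed through for the frontend to render.
summary = "the <hi>answer</hi> is 42<sep />he couldn't find an <hi>answer</hi>"
print(_process_dynamic_summary(summary))
# -> ['the <hi>answer</hi> is 42', "he couldn't find an <hi>answer</hi>"]

# Past the length budget, the section is cut at the last whole word, any
# trailing punctuation is dropped, and "..." is appended:
print(_process_dynamic_summary("one two three four five", max_summary_length=10))
# -> ['one two...']
```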
@@ -282,7 +315,21 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     hits = response.json()["root"].get("children", [])
 
     inference_chunks = [
-        InferenceChunk.from_dict(dict(hit["fields"], **{SCORE: hit["relevance"]}))
+        InferenceChunk.from_dict(
+            dict(
+                hit["fields"],
+                **{SCORE: hit["relevance"]},
+                **{
+                    MATCH_HIGHLIGHTS: _process_dynamic_summary(
+                        # fallback to regular `content` if the `content_summary` field
+                        # isn't present
+                        dynamic_summary=hit["fields"].get(
+                            CONTENT_SUMMARY, hit["fields"][CONTENT]
+                        ),
+                    )
+                },
+            )
+        )
         for hit in hits
     ]
 
@@ -303,6 +350,7 @@ class VespaIndex(DocumentIndex):
         f"{SECTION_CONTINUATION}, "
         f"{BOOST}, "
-        f"{METADATA} "
+        f"{METADATA}, "
+        f"{CONTENT_SUMMARY} "
         f"from {DOCUMENT_INDEX_NAME} where "
     )
 
@@ -389,7 +437,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + '({grammar: "weakAnd"}userInput(@query))'
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + '({grammar: "weakAnd"}userInput(@query) '
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
@@ -415,7 +467,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding))"
+            + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) "
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
@@ -423,6 +479,7 @@
 
         params = {
             "yql": yql,
+            "query": query,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
@@ -440,8 +497,12 @@
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"{{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding) or "
-            + '{grammar: "weakAnd"}userInput(@query)'
+            + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) or "
+            + '({grammar: "weakAnd"}userInput(@query)) '
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
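For review convenience, here is roughly what the hybrid-retrieval YQL now expands to. This is an illustration assembled by hand under assumptions (select-list values such as `document_id` and the `danswer_chunk` source inferred from the schema and constants, `num_to_retrieve = 10`, no ACL where-clauses), not output captured from a running instance:

```python
# Hand-assembled approximation of the hybrid retrieval query body. The key
# point is the third `or` arm: the same user query is also matched against
# content_summary so Vespa produces a dynamic summary for highlighting.
CONTENT_SUMMARY = "content_summary"
num_to_retrieve = 10

yql = (
    "select documentid, document_id, chunk_id, blurb, content, source_type, "
    "source_links, semantic_identifier, section_continuation, boost, metadata, "
    "content_summary from danswer_chunk where "
    + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) or "
    + '({grammar: "weakAnd"}userInput(@query)) '
    + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
    + f" limit {num_to_retrieve} offset 0"
)
print(yql)
```

Because the `content_summary` arm is OR'ed in rather than AND'ed, it can only widen the candidate set; ranking still comes from the configured ranking profile, so its main observable effect is that matching hits come back with a populated dynamic summary.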
diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py
index f4a332387..433a8268f 100644
--- a/backend/danswer/search/semantic_search.py
+++ b/backend/danswer/search/semantic_search.py
@@ -39,6 +39,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
             source_type=chunk.source_type,
             boost=chunk.boost,
             score=chunk.score,
+            match_highlights=chunk.match_highlights,
         )
         # semantic identifier should always exist but for really old indices, it was not enforced
         for chunk in chunks
diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py
index 36930ffa7..3cc55fd86 100644
--- a/backend/danswer/server/models.py
+++ b/backend/danswer/server/models.py
@@ -129,6 +129,10 @@ class SearchDoc(BaseModel):
     source_type: str
     boost: int
     score: float | None
+    # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["the <hi>answer</hi> is 42", "the answer is <hi>42</hi>"]
+    match_highlights: list[str]
 
 
 class CreateChatID(BaseModel):
diff --git a/backend/tests/unit/qa_service/direct_qa/test_question_answer.py b/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
index c10124da7..32af5ff4d 100644
--- a/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
+++ b/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
@@ -115,6 +115,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )
         test_chunk_1 = InferenceChunk(
             document_id="test doc 1",
@@ -128,6 +129,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )
 
         test_quotes = [
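Downstream, every document in a search response now carries this array. A hypothetical response fragment (all values invented) showing what the web client receives:

```python
# Hypothetical SearchDoc fragment as serialized to the frontend; the
# match_highlights entries keep Vespa's <hi> markup for the UI to render.
search_doc = {
    "document_id": "doc-1",
    "semantic_identifier": "Vespa notes",
    "source_type": "web",
    "boost": 1,
    "score": 0.87,
    "match_highlights": [
        "the <hi>answer</hi> is 42",
        "the answer is <hi>42</hi>",
    ],
}
```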
diff --git a/web/src/components/search/DocumentDisplay.tsx b/web/src/components/search/DocumentDisplay.tsx
index d7a52e246..4b8466bd4 100644
--- a/web/src/components/search/DocumentDisplay.tsx
+++ b/web/src/components/search/DocumentDisplay.tsx
@@ -4,6 +4,97 @@ import { getSourceIcon } from "../source";
 import { useState } from "react";
 import { PopupSpec } from "../admin/connectors/Popup";
 
+const buildDocumentSummaryDisplay = (
+  matchHighlights: string[],
+  blurb: string
+) => {
+  if (matchHighlights.length === 0) {
+    return blurb;
+  }
+
+  // content, isBold, isContinuation
+  let sections = [] as [string, boolean, boolean][];
+  matchHighlights.forEach((matchHighlight, matchHighlightIndex) => {
+    if (!matchHighlight) {
+      return;
+    }
+
+    const words = matchHighlight.split(new RegExp("\\s"));
+    words.forEach((word) => {
+      if (!word) {
+        return;
+      }
+
+      let isContinuation = false;
+      while (word.includes("<hi>") && word.includes("</hi>")) {
+        const start = word.indexOf("<hi>");
+        const end = word.indexOf("</hi>");
+        const before = word.slice(0, start);
+        const highlight = word.slice(start + 4, end);
+        const after = word.slice(end + 5);
+
+        if (before) {
+          sections.push([before, false, isContinuation]);
+          isContinuation = true;
+        }
+        sections.push([highlight, true, isContinuation]);
+        isContinuation = true;
+        word = after;
+      }
+
+      if (word) {
+        sections.push([word, false, isContinuation]);
+      }
+    });
+    if (matchHighlightIndex != matchHighlights.length - 1) {
+      sections.push(["...", false, false]);
+    }
+  });
+
+  let previousIsBold = sections[0][1];
+  let currentText = "";
+  const finalJSX = [] as (JSX.Element | string)[];
+  sections.forEach(([word, shouldBeBold, isContinuation], index) => {
+    if (shouldBeBold != previousIsBold) {
+      if (currentText) {
+        if (previousIsBold) {
+          // remove leading space so that we don't bold the whitespace
+          // in front of the matching keywords
+          currentText = currentText.trim();
+          finalJSX.push(
+            <b key={index}>
+              {currentText}
+            </b>
+          );
+        } else {
+          // add in trailing space since the next section is bold
+          // and we will remove any leading spaces when that section is complete
+          finalJSX.push(<span>{currentText + " "}</span>);
+        }
+      }
+      currentText = "";
+      previousIsBold = shouldBeBold;
+    }
+    if (!isContinuation || index === 0) {
+      currentText += " ";
+    }
+    currentText += word;
+  });
+  if (currentText) {
+    if (previousIsBold) {
+      currentText = currentText.trim();
+      finalJSX.push(
+        <b>
+          {currentText}
+        </b>
+      );
+    } else {
+      finalJSX.push(<span>{currentText}</span>);
+    }
+  }
+  return finalJSX;
+};
+
 interface DocumentDisplayProps {
   document: DanswerDocument;
   queryEventId: number | null;
@@ -53,8 +144,8 @@ export const DocumentDisplay = ({
           target="_blank"
           rel="noopener noreferrer"
         >
-          {getSourceIcon(document.source_type, 20)}
-          <p className="truncate break-all ml-2 my-auto text-sm">
+          {getSourceIcon(document.source_type, 22)}
+          <p className="truncate break-all ml-2 my-auto text-base">
             {document.semantic_identifier || document.document_id}
           </p>
         </a>
@@ -68,7 +159,9 @@ export const DocumentDisplay = ({
           </div>
         )}
       </div>
-      <p className="pl-1 pt-2 pb-3 text-gray-200">{document.blurb}</p>
+      <p className="pl-1 pt-2 pb-3 text-gray-200">
+        {buildDocumentSummaryDisplay(document.match_highlights, document.blurb)}
+      </p>
     </div>
   );
 };
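Before the styling tweaks below, a quick way to sanity-check the markup contract end to end: this standalone Python sketch mirrors (but does not share code with) `buildDocumentSummaryDisplay`, rendering the `<hi>` spans as markdown-style bold so you can eyeball what the UI will emphasize:

```python
import re

def render_highlights_for_inspection(match_highlights: list[str]) -> str:
    # Swap each <hi>/</hi> marker for ** so highlighted terms read as bold;
    # the real component emits <b> JSX elements and joins sections with "...".
    rendered = [re.sub(r"</?hi>", "**", section) for section in match_highlights]
    return " ... ".join(rendered)

print(
    render_highlights_for_inspection(
        ["the <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
    )
)
# the **answer** is 42 ... he couldn't find an **answer**
```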
diff --git a/web/src/components/search/SearchResultsDisplay.tsx b/web/src/components/search/SearchResultsDisplay.tsx
index fa16b2fb8..23371199c 100644
--- a/web/src/components/search/SearchResultsDisplay.tsx
+++ b/web/src/components/search/SearchResultsDisplay.tsx
@@ -167,7 +167,7 @@ export const SearchResultsDisplay: React.FC = ({
       {documents && documents.length > 0 && (
         <div>
-          <div className="font-bold border-b mb-4 pb-1 border-gray-800 text-lg">
+          <div className="font-bold border-b mb-3 pb-1 border-gray-800 text-lg">
             Results
           </div>
           {removeDuplicateDocs(documents).map((document) => (
diff --git a/web/src/lib/search/interfaces.ts b/web/src/lib/search/interfaces.ts
index a718b2ab1..d813f5a65 100644
--- a/web/src/lib/search/interfaces.ts
+++ b/web/src/lib/search/interfaces.ts
@@ -33,6 +33,7 @@ export interface DanswerDocument {
   semantic_identifier: string | null;
   boost: number;
   score: number;
+  match_highlights: string[];
 }
 
 export interface SearchResponse {