From cf2bd8a40c769e6e8d5a9370333baa3c620c9b27 Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 7 Sep 2023 15:57:55 -0700
Subject: [PATCH] highlighting

---
 backend/danswer/chunking/models.py            |  7 ++
 backend/danswer/configs/constants.py          |  1 +
 .../vespa/app_config/schemas/danswer_chunk.sd |  7 ++
 .../datastores/vespa/app_config/services.xml  |  6 ++
 backend/danswer/datastores/vespa/store.py     | 72 +++++++++++--
 backend/danswer/search/semantic_search.py     |  1 +
 backend/danswer/server/models.py              |  4 +
 .../direct_qa/test_question_answer.py         |  2 +
 web/src/components/search/DocumentDisplay.tsx | 99 ++++++++++++++++++-
 .../search/SearchResultsDisplay.tsx           |  2 +-
 web/src/lib/search/interfaces.ts              |  1 +
 11 files changed, 193 insertions(+), 10 deletions(-)

diff --git a/backend/danswer/chunking/models.py b/backend/danswer/chunking/models.py
index 1becd6f56..f9338a8ea 100644
--- a/backend/danswer/chunking/models.py
+++ b/backend/danswer/chunking/models.py
@@ -6,6 +6,7 @@ from typing import cast
 
 from danswer.configs.constants import BLURB
 from danswer.configs.constants import BOOST
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import SCORE
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
@@ -62,6 +63,10 @@ class InferenceChunk(BaseChunk):
     boost: int
     score: float | None
     metadata: dict[str, Any]
+    # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["the <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
+    match_highlights: list[str]
 
     @classmethod
     def from_dict(cls, init_dict: dict[str, Any]) -> "InferenceChunk":
@@ -85,6 +90,8 @@ class InferenceChunk(BaseChunk):
         init_kwargs[BOOST] = init_kwargs.get(BOOST, 1)
         if SCORE not in init_kwargs:
             init_kwargs[SCORE] = None
+        if MATCH_HIGHLIGHTS not in init_kwargs:
+            init_kwargs[MATCH_HIGHLIGHTS] = []
         if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
             logger.error(
                 f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
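A note on the new field for reviewers: `<hi>`/`</hi>` is Vespa's highlight markup and is passed through to callers verbatim, while `from_dict` backfills an empty list for chunks indexed before this change. A standalone Python sketch of that defaulting (the helper name is mine; the constant value and the behavior come from this patch):

```python
MATCH_HIGHLIGHTS = "match_highlights"  # value defined in constants.py below

def default_match_highlights(init_kwargs: dict) -> dict:
    # Chunks indexed before this patch carry no highlight data; default to
    # an empty list so downstream code can iterate without None checks.
    if MATCH_HIGHLIGHTS not in init_kwargs:
        init_kwargs[MATCH_HIGHLIGHTS] = []
    return init_kwargs

print(default_match_highlights({}))
# {'match_highlights': []}
print(default_match_highlights({MATCH_HIGHLIGHTS: ["the <hi>answer</hi> is 42"]}))
# {'match_highlights': ['the <hi>answer</hi> is 42']}
```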
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 6cd220d92..d99cded4c 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -13,6 +13,7 @@ EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
+MATCH_HIGHLIGHTS = "match_highlights"
 # stored in the `metadata` of a chunk. Used to signify that this chunk should
 # not be used for QA. For example, Google Drive file types which can't be parsed
 # are still useful as a search result but not for QA.
diff --git a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
index b89d08d98..6d960990a 100644
--- a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
@@ -21,6 +21,13 @@ schema danswer_chunk {
             }
             index: enable-bm25
         }
+        # duplication of `content` is far from ideal, but is needed for
+        # non-gram based highlighting for now. If the capability to re-use a
+        # single field to do both is added, `content_summary` should be removed
+        field content_summary type string {
+            indexing: summary | index
+            summary: dynamic
+        }
         # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
         field source_type type string {
             indexing: summary | attribute
diff --git a/backend/danswer/datastores/vespa/app_config/services.xml b/backend/danswer/datastores/vespa/app_config/services.xml
index ba7a5376e..189b81f20 100644
--- a/backend/danswer/datastores/vespa/app_config/services.xml
+++ b/backend/danswer/datastores/vespa/app_config/services.xml
@@ -25,5 +25,11 @@
                 <disk>0.98</disk>
             </resource-limits>
         </tuning>
+        <config name="vespa.config.search.summary.juniperrc">
+            <max_matches>3</max_matches>
+            <length>750</length>
+            <surround_max>350</surround_max>
+            <min_length>300</min_length>
+        </config>
     </content>
 </services>
diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/datastores/vespa/store.py
index 5d4b3b0a9..2df52a31a 100644
--- a/backend/danswer/datastores/vespa/store.py
+++ b/backend/danswer/datastores/vespa/store.py
@@ -1,4 +1,5 @@
 import json
+import string
 from collections.abc import Mapping
 from typing import Any
 from typing import cast
@@ -25,6 +26,7 @@ from danswer.configs.constants import CONTENT
 from danswer.configs.constants import DEFAULT_BOOST
 from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import EMBEDDINGS
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import PUBLIC_DOC_PAT
 from danswer.configs.constants import SCORE
@@ -59,6 +61,8 @@ DOCUMENT_ID_ENDPOINT = (
 )
 SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
 _BATCH_SIZE = 100  # Specific to Vespa
+# Specific to Vespa, needed for highlighting matching keywords / section
+CONTENT_SUMMARY = "content_summary"
 
 
 def _get_vespa_document_cross_connector_metadata(
@@ -169,7 +173,9 @@ def _index_vespa_chunks(
             DOCUMENT_ID: document.id,
             CHUNK_ID: chunk.chunk_id,
             BLURB: chunk.blurb,
+            # this duplication of `content` is needed for keyword highlighting :(
             CONTENT: chunk.content,
+            CONTENT_SUMMARY: chunk.content,
             SOURCE_TYPE: str(document.source.value),
             SOURCE_LINKS: json.dumps(chunk.source_links),
             SEMANTIC_IDENTIFIER: document.semantic_identifier,
@@ -222,6 +228,9 @@ def _index_vespa_chunks(
             vespa_document_fields[CONTENT] = remove_invalid_unicode_chars(
                 cast(str, vespa_document_fields[CONTENT])
             )
+            vespa_document_fields[CONTENT_SUMMARY] = remove_invalid_unicode_chars(
+                cast(str, vespa_document_fields[CONTENT_SUMMARY])
+            )
             _index_chunk(vespa_url, json_header, vespa_document_fields)
 
             insertion_records.add(
@@ -272,6 +281,30 @@ def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
     return f" limit {num_to_retrieve} offset {offset}"
 
 
+def _process_dynamic_summary(
+    dynamic_summary: str, max_summary_length: int = 400
+) -> list[str]:
+    current_length = 0
+    processed_summary: list[str] = []
+    for summary_section in dynamic_summary.split("<sep />"):
+        force_break = False
+
+        # if we're past the desired max length, break at the last word
+        if current_length + len(summary_section) > max_summary_length:
+            summary_section = summary_section[: max_summary_length - current_length]
+            summary_section = summary_section.rsplit(" ", 1)[0]
+            if summary_section[-1] in string.punctuation:
+                summary_section = summary_section[:-1]
+            summary_section += "..."
+            force_break = True
+
+        processed_summary.append(summary_section)
+        current_length += len(summary_section)
+        if current_length >= max_summary_length or force_break:
+            break
+    return processed_summary
+
+
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
         raise ValueError(
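To make the truncation behavior above concrete, here is a hedged usage sketch. It assumes `_process_dynamic_summary` as defined in this hunk; the summary strings are invented, but the `<sep />` / `<hi>` markers follow Vespa's dynamic-summary format:

```python
# Vespa joins summary fragments with "<sep />" and wraps matched terms in
# "<hi>...</hi>"; the markers are passed through for the frontend to render.
summary = "the <hi>answer</hi> is 42<sep />he couldn't find an <hi>answer</hi>"
print(_process_dynamic_summary(summary))
# -> ['the <hi>answer</hi> is 42', "he couldn't find an <hi>answer</hi>"]

# Past the length budget, the section is cut at the last whole word, any
# trailing punctuation is dropped, and "..." is appended:
print(_process_dynamic_summary("one two three four five", max_summary_length=10))
# -> ['one two...']
```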
@@ -282,7 +315,21 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     hits = response.json()["root"].get("children", [])
 
     inference_chunks = [
-        InferenceChunk.from_dict(dict(hit["fields"], **{SCORE: hit["relevance"]}))
+        InferenceChunk.from_dict(
+            dict(
+                hit["fields"],
+                **{SCORE: hit["relevance"]},
+                **{
+                    MATCH_HIGHLIGHTS: _process_dynamic_summary(
+                        # fallback to regular `content` if the `content_summary` field
+                        # isn't present
+                        dynamic_summary=hit["fields"].get(
+                            CONTENT_SUMMARY, hit["fields"][CONTENT]
+                        ),
+                    )
+                },
+            )
+        )
         for hit in hits
     ]
 
@@ -303,6 +350,7 @@ class VespaIndex(DocumentIndex):
         f"{SECTION_CONTINUATION}, "
         f"{BOOST}, "
-        f"{METADATA} "
+        f"{METADATA}, "
+        f"{CONTENT_SUMMARY} "
         f"from {DOCUMENT_INDEX_NAME} where "
     )
 
@@ -389,7 +437,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + '({grammar: "weakAnd"}userInput(@query))'
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + '({grammar: "weakAnd"}userInput(@query) '
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
@@ -415,7 +467,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding))"
+            + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) "
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
@@ -423,6 +479,7 @@
 
         params = {
             "yql": yql,
+            "query": query,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
@@ -440,8 +497,12 @@
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"{{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding) or "
-            + '{grammar: "weakAnd"}userInput(@query)'
+            + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) or "
+            + '({grammar: "weakAnd"}userInput(@query)) '
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
 
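For review convenience, here is roughly what the hybrid-retrieval YQL now expands to. This is an illustration assembled by hand under assumptions (select-list values such as `document_id` and the `danswer_chunk` source inferred from the schema and constants, `num_to_retrieve = 10`, no ACL where-clauses), not output captured from a running instance:

```python
# Hand-assembled approximation of the hybrid retrieval query body. The key
# point is the third `or` arm: the same user query is also matched against
# content_summary so Vespa produces a dynamic summary for highlighting.
CONTENT_SUMMARY = "content_summary"
num_to_retrieve = 10

yql = (
    "select documentid, document_id, chunk_id, blurb, content, source_type, "
    "source_links, semantic_identifier, section_continuation, boost, metadata, "
    "content_summary from danswer_chunk where "
    + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) or "
    + '({grammar: "weakAnd"}userInput(@query)) '
    + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
    + f" limit {num_to_retrieve} offset 0"
)
print(yql)
```

Because the `content_summary` arm is OR'ed in rather than AND'ed, it can only widen the candidate set; ranking still comes from the configured ranking profile, so its main observable effect is that matching hits come back with a populated dynamic summary.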
diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py
index f4a332387..433a8268f 100644
--- a/backend/danswer/search/semantic_search.py
+++ b/backend/danswer/search/semantic_search.py
@@ -39,6 +39,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
             source_type=chunk.source_type,
             boost=chunk.boost,
             score=chunk.score,
+            match_highlights=chunk.match_highlights,
         )
         # semantic identifier should always exist but for really old indices, it was not enforced
         for chunk in chunks
diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py
index 36930ffa7..3cc55fd86 100644
--- a/backend/danswer/server/models.py
+++ b/backend/danswer/server/models.py
@@ -129,6 +129,10 @@ class SearchDoc(BaseModel):
     source_type: str
     boost: int
     score: float | None
+    # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["the <hi>answer</hi> is 42", "the answer is <hi>42</hi>"]
+    match_highlights: list[str]
 
 
 class CreateChatID(BaseModel):
diff --git a/backend/tests/unit/qa_service/direct_qa/test_question_answer.py b/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
index c10124da7..32af5ff4d 100644
--- a/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
+++ b/backend/tests/unit/qa_service/direct_qa/test_question_answer.py
@@ -115,6 +115,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )
         test_chunk_1 = InferenceChunk(
             document_id="test doc 1",
@@ -128,6 +129,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )
 
         test_quotes = [
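Downstream, every document in a search response now carries this array. A hypothetical response fragment (all values invented) showing what the web client receives:

```python
# Hypothetical SearchDoc fragment as serialized to the frontend; the
# match_highlights entries keep Vespa's <hi> markup for the UI to render.
search_doc = {
    "document_id": "doc-1",
    "semantic_identifier": "Vespa notes",
    "source_type": "web",
    "boost": 1,
    "score": 0.87,
    "match_highlights": [
        "the <hi>answer</hi> is 42",
        "the answer is <hi>42</hi>",
    ],
}
```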
diff --git a/web/src/components/search/DocumentDisplay.tsx b/web/src/components/search/DocumentDisplay.tsx
index d7a52e246..4b8466bd4 100644
--- a/web/src/components/search/DocumentDisplay.tsx
+++ b/web/src/components/search/DocumentDisplay.tsx
@@ -4,6 +4,97 @@ import { getSourceIcon } from "../source";
 import { useState } from "react";
 import { PopupSpec } from "../admin/connectors/Popup";
 
+const buildDocumentSummaryDisplay = (
+  matchHighlights: string[],
+  blurb: string
+) => {
+  if (matchHighlights.length === 0) {
+    return blurb;
+  }
+
+  // content, isBold, isContinuation
+  let sections = [] as [string, boolean, boolean][];
+  matchHighlights.forEach((matchHighlight, matchHighlightIndex) => {
+    if (!matchHighlight) {
+      return;
+    }
+
+    const words = matchHighlight.split(new RegExp("\\s"));
+    words.forEach((word) => {
+      if (!word) {
+        return;
+      }
+
+      let isContinuation = false;
+      while (word.includes("<hi>") && word.includes("</hi>")) {
+        const start = word.indexOf("<hi>");
+        const end = word.indexOf("</hi>");
+        const before = word.slice(0, start);
+        const highlight = word.slice(start + 4, end);
+        const after = word.slice(end + 5);
+
+        if (before) {
+          sections.push([before, false, isContinuation]);
+          isContinuation = true;
+        }
+        sections.push([highlight, true, isContinuation]);
+        isContinuation = true;
+        word = after;
+      }
+
+      if (word) {
+        sections.push([word, false, isContinuation]);
+      }
+    });
+    if (matchHighlightIndex != matchHighlights.length - 1) {
+      sections.push(["...", false, false]);
+    }
+  });
+
+  let previousIsBold = sections[0][1];
+  let currentText = "";
+  const finalJSX = [] as (JSX.Element | string)[];
+  sections.forEach(([word, shouldBeBold, isContinuation], index) => {
+    if (shouldBeBold != previousIsBold) {
+      if (currentText) {
+        if (previousIsBold) {
+          // remove leading space so that we don't bold the whitespace
+          // in front of the matching keywords
+          currentText = currentText.trim();
+          finalJSX.push(
+            <b key={index}>
+              {currentText}
+            </b>
+          );
+        } else {
+          // add in trailing space since the next section is bold
+          // and we will remove any leading spaces when that section is complete
+          finalJSX.push(<span>{currentText + " "}</span>);
+        }
+      }
+      currentText = "";
+      previousIsBold = shouldBeBold;
+    }
+    if (!isContinuation || index === 0) {
+      currentText += " ";
+    }
+    currentText += word;
+  });
+  if (currentText) {
+    if (previousIsBold) {
+      currentText = currentText.trim();
+      finalJSX.push(
+        <b>
+          {currentText}
+        </b>
+      );
+    } else {
+      finalJSX.push(<span>{currentText}</span>);
+    }
+  }
+  return finalJSX;
+};
+
 interface DocumentDisplayProps {
   document: DanswerDocument;
   queryEventId: number | null;
@@ -53,8 +144,8 @@ export const DocumentDisplay = ({
           target="_blank"
           rel="noopener noreferrer"
         >
-          {getSourceIcon(document.source_type, 20)}
-          <p className="truncate break-all ml-2 my-auto text-sm">
+          {getSourceIcon(document.source_type, 22)}
+          <p className="truncate break-all ml-2 my-auto text-base">
             {document.semantic_identifier || document.document_id}
           </p>
         </a>
@@ -68,7 +159,9 @@ export const DocumentDisplay = ({
           </div>
         )}
       </div>
-      <p className="pl-1 pt-2 pb-3 text-gray-200">{document.blurb}</p>
+      <p className="pl-1 pt-2 pb-3 text-gray-200">
+        {buildDocumentSummaryDisplay(document.match_highlights, document.blurb)}
+      </p>
     </div>
   );
 };
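Before the styling tweaks below, a quick way to sanity-check the markup contract end to end: this standalone Python sketch mirrors (but does not share code with) `buildDocumentSummaryDisplay`, rendering the `<hi>` spans as markdown-style bold so you can eyeball what the UI will emphasize:

```python
import re

def render_highlights_for_inspection(match_highlights: list[str]) -> str:
    # Swap each <hi>/</hi> marker for ** so highlighted terms read as bold;
    # the real component emits <b> JSX elements and joins sections with "...".
    rendered = [re.sub(r"</?hi>", "**", section) for section in match_highlights]
    return " ... ".join(rendered)

print(
    render_highlights_for_inspection(
        ["the <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
    )
)
# the **answer** is 42 ... he couldn't find an **answer**
```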
diff --git a/web/src/components/search/SearchResultsDisplay.tsx b/web/src/components/search/SearchResultsDisplay.tsx
index fa16b2fb8..23371199c 100644
--- a/web/src/components/search/SearchResultsDisplay.tsx
+++ b/web/src/components/search/SearchResultsDisplay.tsx
@@ -167,7 +167,7 @@ export const SearchResultsDisplay: React.FC = ({
       {documents && documents.length > 0 && (
         <div>
-          <div className="font-bold border-b mb-4 pb-1 border-gray-800 text-lg">
+          <div className="font-bold border-b mb-3 pb-1 border-gray-800 text-lg">
             Results
           </div>
           {removeDuplicateDocs(documents).map((document) => (
diff --git a/web/src/lib/search/interfaces.ts b/web/src/lib/search/interfaces.ts
index a718b2ab1..d813f5a65 100644
--- a/web/src/lib/search/interfaces.ts
+++ b/web/src/lib/search/interfaces.ts
@@ -33,6 +33,7 @@ export interface DanswerDocument {
   semantic_identifier: string | null;
   boost: number;
   score: number;
+  match_highlights: string[];
 }
 
 export interface SearchResponse {