Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-04-07 19:38:19 +02:00

Commit cf2bd8a40c: highlighting
Parent: b5fc2a5775
@@ -6,6 +6,7 @@ from typing import cast
 from danswer.configs.constants import BLURB
 from danswer.configs.constants import BOOST
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import SCORE
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
@@ -62,6 +63,10 @@ class InferenceChunk(BaseChunk):
     boost: int
     score: float | None
     metadata: dict[str, Any]
+    # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
+    match_highlights: list[str]

     @classmethod
     def from_dict(cls, init_dict: dict[str, Any]) -> "InferenceChunk":
@@ -85,6 +90,8 @@ class InferenceChunk(BaseChunk):
         init_kwargs[BOOST] = init_kwargs.get(BOOST, 1)
         if SCORE not in init_kwargs:
             init_kwargs[SCORE] = None
+        if MATCH_HIGHLIGHTS not in init_kwargs:
+            init_kwargs[MATCH_HIGHLIGHTS] = []
         if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
             logger.error(
                 f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
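The `<hi>` markup documented above is plain text, so downstream consumers can post-process it trivially. As a minimal sketch (the helper is hypothetical, not part of this commit), stripping the markers back out looks like:

```python
import re

def strip_highlight_markup(highlight: str) -> str:
    # "<hi>the</hi> <hi>answer</hi> is 42" -> "the answer is 42"
    return re.sub(r"</?hi>", "", highlight)
```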
@@ -13,6 +13,7 @@ EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
+MATCH_HIGHLIGHTS = "match_highlights"
 # stored in the `metadata` of a chunk. Used to signify that this chunk should
 # not be used for QA. For example, Google Drive file types which can't be parsed
 # are still useful as a search result but not for QA.
@@ -21,6 +21,13 @@ schema danswer_chunk {
             }
             index: enable-bm25
         }
+        # duplication of `content` is far from ideal, but is needed for
+        # non-gram based highlighting for now. If the capability to re-use a
+        # single field to do both is added, `content_summary` should be removed
+        field content_summary type string {
+            indexing: summary | index
+            summary: dynamic
+        }
         # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
         field source_type type string {
             indexing: summary | attribute
@@ -25,5 +25,11 @@
                 <disk>0.98</disk>
             </resource-limits>
         </tuning>
+        <config name="vespa.config.search.summary.juniperrc">
+            <max_matches>3</max_matches>
+            <length>750</length>
+            <surround_max>350</surround_max>
+            <min_length>300</min_length>
+        </config>
     </content>
</services>
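The `summary: dynamic` setting in the schema is what makes Vespa return matched fragments (wrapped in `<hi>` tags and joined with `<sep />`) instead of the full field value; the juniperrc block tunes that summary generation: roughly, at most 3 matched fragments per document, a target summary length of 750 characters with a 300-character minimum, and up to 350 characters of context around each match. A quick way to eyeball the markup against a running instance (a sketch only; the host/port and YQL here are assumptions, adjust for your deployment):

```python
import requests

# Sketch: hit the Vespa search API of a locally running stack and print the
# dynamic `content_summary` for each hit. "http://localhost:8081" stands in
# for VESPA_APP_CONTAINER_URL and is an assumption.
response = requests.post(
    "http://localhost:8081/search/",
    json={
        "yql": "select content_summary from danswer_chunk where userInput(@query)",
        "query": "the answer",
    },
)
for hit in response.json()["root"].get("children", []):
    # fragments arrive joined with "<sep />" and wrapped in <hi> tags
    print(hit["fields"].get("content_summary"))
```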
@@ -1,4 +1,5 @@
 import json
+import string
 from collections.abc import Mapping
 from typing import Any
 from typing import cast
@@ -25,6 +26,7 @@ from danswer.configs.constants import CONTENT
 from danswer.configs.constants import DEFAULT_BOOST
 from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import EMBEDDINGS
+from danswer.configs.constants import MATCH_HIGHLIGHTS
 from danswer.configs.constants import METADATA
 from danswer.configs.constants import PUBLIC_DOC_PAT
 from danswer.configs.constants import SCORE
@@ -59,6 +61,8 @@ DOCUMENT_ID_ENDPOINT = (
 )
 SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
 _BATCH_SIZE = 100  # Specific to Vespa
+# Specific to Vespa, needed for highlighting matching keywords / section
+CONTENT_SUMMARY = "content_summary"


 def _get_vespa_document_cross_connector_metadata(
@@ -169,7 +173,9 @@ def _index_vespa_chunks(
             DOCUMENT_ID: document.id,
             CHUNK_ID: chunk.chunk_id,
             BLURB: chunk.blurb,
+            # this duplication of `content` is needed for keyword highlighting :(
             CONTENT: chunk.content,
+            CONTENT_SUMMARY: chunk.content,
             SOURCE_TYPE: str(document.source.value),
             SOURCE_LINKS: json.dumps(chunk.source_links),
             SEMANTIC_IDENTIFIER: document.semantic_identifier,
@@ -222,6 +228,9 @@ def _index_vespa_chunks(
         vespa_document_fields[CONTENT] = remove_invalid_unicode_chars(
             cast(str, vespa_document_fields[CONTENT])
         )
+        vespa_document_fields[CONTENT_SUMMARY] = remove_invalid_unicode_chars(
+            cast(str, vespa_document_fields[CONTENT_SUMMARY])
+        )
         _index_chunk(vespa_url, json_header, vespa_document_fields)

         insertion_records.add(
@@ -272,6 +281,30 @@ def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
     return f" limit {num_to_retrieve} offset {offset}"


+def _process_dynamic_summary(
+    dynamic_summary: str, max_summary_length: int = 400
+) -> list[str]:
+    current_length = 0
+    processed_summary: list[str] = []
+    for summary_section in dynamic_summary.split("<sep />"):
+        force_break = False
+
+        # if we're past the desired max length, break at the last word
+        if current_length + len(summary_section) > max_summary_length:
+            summary_section = summary_section[: max_summary_length - current_length]
+            summary_section = summary_section.rsplit(" ", 1)[0]
+            if summary_section[-1] in string.punctuation:
+                summary_section = summary_section[:-1]
+            summary_section += "..."
+            force_break = True
+
+        processed_summary.append(summary_section)
+        current_length += len(summary_section)
+        if current_length >= max_summary_length or force_break:
+            break
+    return processed_summary
+
+
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
         raise ValueError(
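Vespa joins the matched fragments of a dynamic summary with `<sep />`; the helper splits on that token and, once the length budget is exceeded, cuts the current fragment at the last full word and appends an ellipsis. An illustrative call (values chosen small enough to trigger the truncation path):

```python
# Each "<sep />"-delimited fragment becomes one list entry; once the running
# length passes max_summary_length, the current fragment is cut at the last
# full word and suffixed with "..."
fragments = _process_dynamic_summary(
    "the <hi>answer</hi> is 42<sep />no <hi>answer</hi> found here",
    max_summary_length=30,
)
print(fragments)  # ['the <hi>answer</hi> is 42', 'no...']
```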
@@ -282,7 +315,21 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:

     hits = response.json()["root"].get("children", [])
     inference_chunks = [
-        InferenceChunk.from_dict(dict(hit["fields"], **{SCORE: hit["relevance"]}))
+        InferenceChunk.from_dict(
+            dict(
+                hit["fields"],
+                **{SCORE: hit["relevance"]},
+                **{
+                    MATCH_HIGHLIGHTS: _process_dynamic_summary(
+                        # fallback to regular `content` if the `content_summary` field
+                        # isn't present
+                        dynamic_summary=hit["fields"].get(
+                            CONTENT_SUMMARY, hit["fields"][CONTENT]
+                        ),
+                    )
+                },
+            )
+        )
         for hit in hits
     ]
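For orientation, here is a sketch of the data flow with a made-up hit: the Vespa relevance score and the processed dynamic summary are merged into the field dict before it is handed to `InferenceChunk.from_dict`. The hit below is hypothetical and trimmed to the fields that matter here.

```python
# Hypothetical, trimmed Vespa hit
hit = {
    "relevance": 0.87,
    "fields": {
        "content": "he couldn't find an answer to the question",
        "content_summary": "he couldn't find an <hi>answer</hi>",
        # ...the remaining danswer_chunk fields...
    },
}
# The comprehension above then effectively builds:
chunk_dict = dict(
    hit["fields"],
    **{"score": hit["relevance"]},
    **{"match_highlights": _process_dynamic_summary(hit["fields"]["content_summary"])},
)
# chunk_dict["match_highlights"] == ["he couldn't find an <hi>answer</hi>"]
```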
@@ -303,6 +350,7 @@ class VespaIndex(DocumentIndex):
         f"{SECTION_CONTINUATION}, "
         f"{BOOST}, "
         f"{METADATA} "
+        f"{CONTENT_SUMMARY} "
         f"from {DOCUMENT_INDEX_NAME} where "
     )
@@ -389,7 +437,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + '({grammar: "weakAnd"}userInput(@query))'
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + '({grammar: "weakAnd"}userInput(@query) '
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
@@ -415,7 +467,11 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding))"
+            + f"(({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) "
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
             + _build_vespa_limit(num_to_retrieve)
         )
@@ -423,6 +479,7 @@ class VespaIndex(DocumentIndex):

         params = {
             "yql": yql,
+            "query": query,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
@@ -440,8 +497,12 @@ class VespaIndex(DocumentIndex):
         yql = (
             VespaIndex.yql_base
             + vespa_where_clauses
-            + f"{{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding) or "
-            + '{grammar: "weakAnd"}userInput(@query)'
+            + f"({{targetHits: {10 * num_to_retrieve}}}nearestNeighbor(embeddings, query_embedding)) or "
+            + '({grammar: "weakAnd"}userInput(@query) '
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
            + _build_vespa_limit(num_to_retrieve)
        )
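Putting the pieces together: with `num_to_retrieve = 10` and no access-control where-clauses, the hybrid query's where-clause expands to roughly the following (line breaks added for readability; the select list comes from `yql_base` and is elided):

```
select ... from danswer_chunk where
  ({targetHits: 100}nearestNeighbor(embeddings, query_embedding)) or
  ({grammar: "weakAnd"}userInput(@query) or
    ({defaultIndex: "content_summary"}userInput(@query)))
  limit 10 offset 0
```

As the in-code comments note, the extra `defaultIndex: "content_summary"` branch is there so the query terms also match against the `content_summary` field, which is what triggers Vespa's dynamic-summary generation with `<hi>` markup.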
@@ -39,6 +39,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
             source_type=chunk.source_type,
             boost=chunk.boost,
             score=chunk.score,
+            match_highlights=chunk.match_highlights,
         )
         # semantic identifier should always exist but for really old indices, it was not enforced
         for chunk in chunks
@@ -129,6 +129,10 @@ class SearchDoc(BaseModel):
     source_type: str
     boost: int
     score: float | None
+    # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
+    # to specify that a set of words should be highlighted. For example:
+    # ["<hi>the</hi> <hi>answer</hi> is 42", "the answer is <hi>42</hi>"]
+    match_highlights: list[str]


 class CreateChatID(BaseModel):
@@ -115,6 +115,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )
         test_chunk_1 = InferenceChunk(
             document_id="test doc 1",
@@ -128,6 +129,7 @@ class TestQAPostprocessing(unittest.TestCase):
             boost=0,
             score=1,
             metadata={},
+            match_highlights=[],
         )

         test_quotes = [
@@ -4,6 +4,97 @@ import { getSourceIcon } from "../source";
 import { useState } from "react";
 import { PopupSpec } from "../admin/connectors/Popup";

+const buildDocumentSummaryDisplay = (
+  matchHighlights: string[],
+  blurb: string
+) => {
+  if (matchHighlights.length === 0) {
+    return blurb;
+  }
+
+  // content, isBold, isContinuation
+  let sections = [] as [string, boolean, boolean][];
+  matchHighlights.forEach((matchHighlight, matchHighlightIndex) => {
+    if (!matchHighlight) {
+      return;
+    }
+
+    const words = matchHighlight.split(new RegExp("\\s"));
+    words.forEach((word) => {
+      if (!word) {
+        return;
+      }
+
+      let isContinuation = false;
+      while (word.includes("<hi>") && word.includes("</hi>")) {
+        const start = word.indexOf("<hi>");
+        const end = word.indexOf("</hi>");
+        const before = word.slice(0, start);
+        const highlight = word.slice(start + 4, end);
+        const after = word.slice(end + 5);
+
+        if (before) {
+          sections.push([before, false, isContinuation]);
+          isContinuation = true;
+        }
+        sections.push([highlight, true, isContinuation]);
+        isContinuation = true;
+        word = after;
+      }
+
+      if (word) {
+        sections.push([word, false, isContinuation]);
+      }
+    });
+    if (matchHighlightIndex != matchHighlights.length - 1) {
+      sections.push(["...", false, false]);
+    }
+  });
+
+  let previousIsBold = sections[0][1];
+  let currentText = "";
+  const finalJSX = [] as (JSX.Element | string)[];
+  sections.forEach(([word, shouldBeBold, isContinuation], index) => {
+    if (shouldBeBold != previousIsBold) {
+      if (currentText) {
+        if (previousIsBold) {
+          // remove leading space so that we don't bold the whitespace
+          // in front of the matching keywords
+          currentText = currentText.trim();
+          finalJSX.push(
+            <b key={index} className="text-gray-200 bg-pink-950">
+              {currentText}
+            </b>
+          );
+        } else {
+          // add in trailing space since the next section is bold
+          // and we will remove any leading spaces when that section is complete
+          finalJSX.push(<span key={index}>{currentText + " "}</span>);
+        }
+      }
+      currentText = "";
+      previousIsBold = shouldBeBold;
+    }
+    if (!isContinuation || index === 0) {
+      currentText += " ";
+    }
+    currentText += word;
+  });
+  if (currentText) {
+    if (previousIsBold) {
+      currentText = currentText.trim();
+      finalJSX.push(
+        <b key={sections.length} className="text-gray-200 bg-pink-950">
+          {currentText}
+        </b>
+      );
+    } else {
+      finalJSX.push(<span key={sections.length}>{currentText}</span>);
+    }
+  }
+  return finalJSX;
+};
+
 interface DocumentDisplayProps {
   document: DanswerDocument;
   queryEventId: number | null;
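The section-building pass above can be sketched compactly in Python (a hypothetical mirror of the TSX logic, shown only for clarity): split each highlight on whitespace, peel `<hi>…</hi>` spans out of each token, and tag every piece as bold or plain before merging adjacent runs.

```python
def split_highlight_sections(match_highlight: str) -> list[tuple[str, bool]]:
    # (text, is_bold) pairs, mirroring the TSX section-building pass above
    sections: list[tuple[str, bool]] = []
    for word in match_highlight.split():
        while "<hi>" in word and "</hi>" in word:
            start, end = word.index("<hi>"), word.index("</hi>")
            if word[:start]:
                sections.append((word[:start], False))
            sections.append((word[start + 4 : end], True))
            word = word[end + 5 :]
        if word:
            sections.append((word, False))
    return sections

print(split_highlight_sections("<hi>the</hi> <hi>answer</hi> is 42"))
# [('the', True), ('answer', True), ('is', False), ('42', False)]
```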
@@ -53,8 +144,8 @@ export const DocumentDisplay = ({
           target="_blank"
           rel="noopener noreferrer"
         >
-          {getSourceIcon(document.source_type, 20)}
-          <p className="truncate break-all ml-2 my-auto">
+          {getSourceIcon(document.source_type, 22)}
+          <p className="truncate break-all ml-2 my-auto text-base">
             {document.semantic_identifier || document.document_id}
           </p>
         </a>
@@ -68,7 +159,9 @@ export const DocumentDisplay = ({
         )}
       </div>
     </div>
-    <p className="pl-1 pt-2 pb-3 text-gray-200">{document.blurb}</p>
+    <p className="pl-1 pt-2 pb-3 text-gray-200">
+      {buildDocumentSummaryDisplay(document.match_highlights, document.blurb)}
+    </p>
   </div>
 );
};
@@ -167,7 +167,7 @@ export const SearchResultsDisplay: React.FC<SearchResultsDisplayProps> = ({

       {documents && documents.length > 0 && (
         <div className="mt-4">
-          <div className="font-bold border-b mb-3 pb-1 border-gray-800">
+          <div className="font-bold border-b mb-3 pb-1 border-gray-800 text-lg">
             Results
           </div>
           {removeDuplicateDocs(documents).map((document) => (
@@ -33,6 +33,7 @@ export interface DanswerDocument {
   semantic_identifier: string | null;
   boost: number;
   score: number;
+  match_highlights: string[];
 }

 export interface SearchResponse {