From f802351d85aad67d526ae7fcf1a8ad95ca0b857d Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Fri, 15 Sep 2023 13:03:14 -0700 Subject: [PATCH] Fix Vespa Issue where Documents with no Content could be retrieved via Vector Search (#448) --- backend/danswer/datastores/vespa/store.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/datastores/vespa/store.py index 2df52a31a520..373bb909d638 100644 --- a/backend/danswer/datastores/vespa/store.py +++ b/backend/danswer/datastores/vespa/store.py @@ -314,6 +314,17 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]: response.raise_for_status() hits = response.json()["root"].get("children", []) + + for hit in hits: + if hit["fields"].get(CONTENT) is None: + logger.error( + f"Vespa Index with Vespa ID {hit['id']} has no contents. " + f"This is invalid because the vector is not meaningful and keywordsearch cannot " + f"fetch this document" + ) + + filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None] + inference_chunks = [ InferenceChunk.from_dict( dict( @@ -330,7 +341,7 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]: }, ) ) - for hit in hits + for hit in filtered_hits ] return inference_chunks