Add verbose logging in case of query failure (#1297)

2025-07-12 14:12:53 +02:00 · 2024-04-04 05:30:23 -07:00
parent 4abf5f27a0
commit 7298cc2835
1 changed file with 55 additions and 17 deletions
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -112,13 +112,13 @@ def _does_document_exist(
    """Returns whether the document already exists and the users/group whitelists
    Specifically in this case, document refers to a vespa document which is equivalent to a Danswer
    chunk. This checks for whether the chunk exists already in the index"""
-    doc_fetch_response = http_client.get(
-        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
-    )
+    doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
+    doc_fetch_response = http_client.get(doc_url)
    if doc_fetch_response.status_code == 404:
        return False

    if doc_fetch_response.status_code != 200:
+        logger.debug(f"Failed to check for document with URL {doc_url}")
        raise RuntimeError(
            f"Unexpected fetch document by ID value from Vespa "
            f"with error {doc_fetch_response.status_code}"
@@ -157,7 +157,24 @@ def _get_vespa_chunk_ids_by_document_id(
        "hits": hits_per_page,
    }
    while True:
-        results = requests.post(SEARCH_ENDPOINT, json=params).json()
+        res = requests.post(SEARCH_ENDPOINT, json=params)
+        try:
+            res.raise_for_status()
+        except requests.HTTPError as e:
+            request_info = f"Headers: {res.request.headers}\nPayload: {params}"
+            response_info = (
+                f"Status Code: {res.status_code}\nResponse Content: {res.text}"
+            )
+            error_base = f"Error occurred getting chunk by Document ID {document_id}"
+            logger.error(
+                f"{error_base}:\n"
+                f"{request_info}\n"
+                f"{response_info}\n"
+                f"Exception: {e}"
+            )
+            raise requests.HTTPError(error_base) from e
+
+        results = res.json()
        hits = results["root"].get("children", [])

        doc_chunk_ids.extend(
@@ -179,10 +196,14 @@ def _delete_vespa_doc_chunks(
    )

    for chunk_id in doc_chunk_ids:
-        res = http_client.delete(
-            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{chunk_id}"
-        )
-        res.raise_for_status()
+        try:
+            res = http_client.delete(
+                f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{chunk_id}"
+            )
+            res.raise_for_status()
+        except httpx.HTTPStatusError as e:
+            logger.error(f"Failed to delete chunk, details: {e.response.text}")
+            raise


 def _delete_vespa_docs(
@@ -559,18 +580,35 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
    if "query" in query_params and not cast(str, query_params["query"]).strip():
        raise ValueError("No/empty query received")

+    params = dict(
+        **query_params,
+        **{
+            "presentation.timing": True,
+        }
+        if LOG_VESPA_TIMING_INFORMATION
+        else {},
+    )
+
    response = requests.post(
        SEARCH_ENDPOINT,
-        json=dict(
-            **query_params,
-            **{
-                "presentation.timing": True,
-            }
-            if LOG_VESPA_TIMING_INFORMATION
-            else {},
-        ),
+        json=params,
    )
-    response.raise_for_status()
+    try:
+        response.raise_for_status()
+    except requests.HTTPError as e:
+        request_info = f"Headers: {response.request.headers}\nPayload: {params}"
+        response_info = (
+            f"Status Code: {response.status_code}\n"
+            f"Response Content: {response.text}"
+        )
+        error_base = "Failed to query Vespa"
+        logger.error(
+            f"{error_base}:\n"
+            f"{request_info}\n"
+            f"{response_info}\n"
+            f"Exception: {e}"
+        )
+        raise requests.HTTPError(error_base) from e

    response_json: dict[str, Any] = response.json()
    if LOG_VESPA_TIMING_INFORMATION: