Improve Vespa invalid char cleanup

Weves 2023-12-29 20:24:44 -08:00 committed by Chris Weaver
parent 64d2cea396
commit 6004e540f3


@@ -13,8 +13,6 @@ from typing import cast
 import httpx
 import requests
-from requests import HTTPError
-from requests import Response
 from retry import retry
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
@@ -244,14 +242,14 @@ def _index_vespa_chunk(
     vespa_document_fields = {
         DOCUMENT_ID: document.id,
         CHUNK_ID: chunk.chunk_id,
-        BLURB: chunk.blurb,
+        BLURB: remove_invalid_unicode_chars(chunk.blurb),
         # this duplication of `content` is needed for keyword highlighting :(
-        CONTENT: chunk.content,
-        CONTENT_SUMMARY: chunk.content,
+        CONTENT: remove_invalid_unicode_chars(chunk.content),
+        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
         SOURCE_TYPE: str(document.source.value),
         SOURCE_LINKS: json.dumps(chunk.source_links),
-        SEMANTIC_IDENTIFIER: document.semantic_identifier,
-        TITLE: document.get_title_for_document_index(),
+        SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
+        TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()),
         SECTION_CONTINUATION: chunk.section_continuation,
         METADATA: json.dumps(document.metadata),
         EMBEDDINGS: embeddings_name_vector_map,
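
The hunk above swaps the raw chunk and document strings for cleaned copies at the point where the Vespa fields dict is built, covering BLURB, CONTENT, CONTENT_SUMMARY, SEMANTIC_IDENTIFIER, and TITLE. The cleanup helper itself is not part of this diff; the sketch below is only an illustration of what such a function might do, assuming the goal is to drop code points that Vespa's JSON document API rejects (lone surrogates, most control characters, non-characters). The real remove_invalid_unicode_chars in danswer may be implemented differently.

def remove_invalid_unicode_chars(text: str) -> str:
    # Hypothetical sketch, not danswer's actual implementation.
    cleaned_chars = []
    for ch in text:
        code = ord(ch)
        if 0xD800 <= code <= 0xDFFF:
            # lone surrogates cannot be encoded as UTF-8
            continue
        if code < 0x20 and ch not in ("\t", "\n", "\r"):
            # C0 control characters other than common whitespace
            continue
        if code in (0xFFFE, 0xFFFF):
            # Unicode non-characters
            continue
        cleaned_chars.append(ch)
    return "".join(cleaned_chars)

print(remove_invalid_unicode_chars("blurb with a null byte\x00 and a lone surrogate\udfff"))
# -> "blurb with a null byte and a lone surrogate"
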
@@ -265,60 +263,18 @@ def _index_vespa_chunk(
         DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
     }
 
-    def _index_chunk(
-        url: str,
-        headers: dict[str, str],
-        fields: dict[str, Any],
-        log_error: bool = True,
-    ) -> httpx.Response:
-        logger.debug(f'Indexing to URL "{url}"')
-        res = http_client.post(url, headers=headers, json={"fields": fields})
-        try:
-            res.raise_for_status()
-            return res
-        except Exception as e:
-            if log_error:
-                logger.error(
-                    f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
-                )
-            raise e
-
     vespa_url = f"{DOCUMENT_ID_ENDPOINT}/{vespa_chunk_id}"
+    logger.debug(f'Indexing to URL "{vespa_url}"')
+    res = http_client.post(
+        vespa_url, headers=json_header, json={"fields": vespa_document_fields}
+    )
     try:
-        _index_chunk(
-            url=vespa_url,
-            headers=json_header,
-            fields=vespa_document_fields,
-            log_error=False,
-        )
-    except HTTPError as e:
-        if cast(Response, e.response).status_code != 400:
-            raise e
-        # if it's a 400 response, try again with invalid unicode chars removed
-        # only doing this on error to avoid having to go through the content
-        # char by char every time
-        vespa_document_fields[BLURB] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[BLURB])
-        )
-        vespa_document_fields[SEMANTIC_IDENTIFIER] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[SEMANTIC_IDENTIFIER])
-        )
-        vespa_document_fields[TITLE] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[TITLE])
-        )
-        vespa_document_fields[CONTENT] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[CONTENT])
-        )
-        vespa_document_fields[CONTENT_SUMMARY] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[CONTENT_SUMMARY])
-        )
-        _index_chunk(
-            url=vespa_url,
-            headers=json_header,
-            fields=vespa_document_fields,
-            log_error=True,
+        res.raise_for_status()
+    except Exception as e:
+        logger.exception(
+            f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
         )
+        raise e
 
 
 def _batch_index_vespa_chunks(
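
Taken together, the hunks move the cleanup from a reactive retry to a proactive step: previously the chunk was posted as-is, and the text fields were only scrubbed and re-posted after Vespa answered 400; now every text field is scrubbed when the fields dict is built, so a single POST suffices and any failure is logged and re-raised. A schematic restatement of that before/after control flow, using simplified standalone signatures and the remove_invalid_unicode_chars sketch above, not the actual _index_vespa_chunk code:

import httpx

def index_fields_old(client: httpx.Client, url: str, fields: dict) -> None:
    # Before this commit: post the raw fields and only strip invalid
    # characters if Vespa rejects the document with a 400, then retry once.
    res = client.post(url, json={"fields": fields})
    if res.status_code == 400:
        cleaned = {
            key: remove_invalid_unicode_chars(value) if isinstance(value, str) else value
            for key, value in fields.items()
        }
        res = client.post(url, json={"fields": cleaned})
    res.raise_for_status()

def index_fields_new(client: httpx.Client, url: str, fields: dict) -> None:
    # After this commit: text fields are already cleaned when the dict is
    # built, so one post is enough; failures are logged and re-raised.
    res = client.post(url, json={"fields": fields})
    res.raise_for_status()
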