Improve Vespa invalid char cleanup

Weves 2023-12-29 20:24:44 -08:00 committed by Chris Weaver
parent 64d2cea396
commit 6004e540f3


@@ -13,8 +13,6 @@ from typing import cast
 import httpx
 import requests
-from requests import HTTPError
-from requests import Response
 from retry import retry
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
@@ -244,14 +242,14 @@ def _index_vespa_chunk(
     vespa_document_fields = {
         DOCUMENT_ID: document.id,
         CHUNK_ID: chunk.chunk_id,
-        BLURB: chunk.blurb,
+        BLURB: remove_invalid_unicode_chars(chunk.blurb),
         # this duplication of `content` is needed for keyword highlighting :(
-        CONTENT: chunk.content,
-        CONTENT_SUMMARY: chunk.content,
+        CONTENT: remove_invalid_unicode_chars(chunk.content),
+        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
         SOURCE_TYPE: str(document.source.value),
         SOURCE_LINKS: json.dumps(chunk.source_links),
-        SEMANTIC_IDENTIFIER: document.semantic_identifier,
-        TITLE: document.get_title_for_document_index(),
+        SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
+        TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()),
         SECTION_CONTINUATION: chunk.section_continuation,
         METADATA: json.dumps(document.metadata),
         EMBEDDINGS: embeddings_name_vector_map,
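
The hunk above swaps the raw chunk and document strings for cleaned copies at the point where the Vespa fields dict is built, covering BLURB, CONTENT, CONTENT_SUMMARY, SEMANTIC_IDENTIFIER, and TITLE. The cleanup helper itself is not part of this diff; the sketch below is only an illustration of what such a function might do, assuming the goal is to drop code points that Vespa's JSON document API rejects (lone surrogates, most control characters, non-characters). The real remove_invalid_unicode_chars in danswer may be implemented differently.

def remove_invalid_unicode_chars(text: str) -> str:
    # Hypothetical sketch, not danswer's actual implementation.
    cleaned_chars = []
    for ch in text:
        code = ord(ch)
        if 0xD800 <= code <= 0xDFFF:
            # lone surrogates cannot be encoded as UTF-8
            continue
        if code < 0x20 and ch not in ("\t", "\n", "\r"):
            # C0 control characters other than common whitespace
            continue
        if code in (0xFFFE, 0xFFFF):
            # Unicode non-characters
            continue
        cleaned_chars.append(ch)
    return "".join(cleaned_chars)

print(remove_invalid_unicode_chars("blurb with a null byte\x00 and a lone surrogate\udfff"))
# -> "blurb with a null byte and a lone surrogate"
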
@@ -265,60 +263,18 @@ def _index_vespa_chunk(
         DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
     }
 
-    def _index_chunk(
-        url: str,
-        headers: dict[str, str],
-        fields: dict[str, Any],
-        log_error: bool = True,
-    ) -> httpx.Response:
-        logger.debug(f'Indexing to URL "{url}"')
-        res = http_client.post(url, headers=headers, json={"fields": fields})
-        try:
-            res.raise_for_status()
-            return res
-        except Exception as e:
-            if log_error:
-                logger.error(
-                    f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
-                )
-            raise e
-
     vespa_url = f"{DOCUMENT_ID_ENDPOINT}/{vespa_chunk_id}"
+    logger.debug(f'Indexing to URL "{vespa_url}"')
+    res = http_client.post(
+        vespa_url, headers=json_header, json={"fields": vespa_document_fields}
+    )
     try:
-        _index_chunk(
-            url=vespa_url,
-            headers=json_header,
-            fields=vespa_document_fields,
-            log_error=False,
-        )
-    except HTTPError as e:
-        if cast(Response, e.response).status_code != 400:
-            raise e
-        # if it's a 400 response, try again with invalid unicode chars removed
-        # only doing this on error to avoid having to go through the content
-        # char by char every time
-        vespa_document_fields[BLURB] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[BLURB])
-        )
-        vespa_document_fields[SEMANTIC_IDENTIFIER] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[SEMANTIC_IDENTIFIER])
-        )
-        vespa_document_fields[TITLE] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[TITLE])
-        )
-        vespa_document_fields[CONTENT] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[CONTENT])
-        )
-        vespa_document_fields[CONTENT_SUMMARY] = remove_invalid_unicode_chars(
-            cast(str, vespa_document_fields[CONTENT_SUMMARY])
-        )
-        _index_chunk(
-            url=vespa_url,
-            headers=json_header,
-            fields=vespa_document_fields,
-            log_error=True,
+        res.raise_for_status()
+    except Exception as e:
+        logger.exception(
+            f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
         )
+        raise e
 
 
 def _batch_index_vespa_chunks(
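
Taken together, the hunks move the cleanup from a reactive retry to a proactive step: previously the chunk was posted as-is, and the text fields were only scrubbed and re-posted after Vespa answered 400; now every text field is scrubbed when the fields dict is built, so a single POST suffices and any failure is logged and re-raised. A schematic restatement of that before/after control flow, using simplified standalone signatures and the remove_invalid_unicode_chars sketch above, not the actual _index_vespa_chunk code:

import httpx

def index_fields_old(client: httpx.Client, url: str, fields: dict) -> None:
    # Before this commit: post the raw fields and only strip invalid
    # characters if Vespa rejects the document with a 400, then retry once.
    res = client.post(url, json={"fields": fields})
    if res.status_code == 400:
        cleaned = {
            key: remove_invalid_unicode_chars(value) if isinstance(value, str) else value
            for key, value in fields.items()
        }
        res = client.post(url, json={"fields": cleaned})
    res.raise_for_status()

def index_fields_new(client: httpx.Client, url: str, fields: dict) -> None:
    # After this commit: text fields are already cleaned when the dict is
    # built, so one post is enough; failures are logged and re-raised.
    res = client.post(url, json={"fields": fields})
    res.raise_for_status()
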