113 lines
4.0 KiB
Python

import re
import time
from typing import cast
import httpx
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
from onyx.document_index.vespa_constants import VESPA_APP_CONTAINER_URL
from onyx.utils.logger import setup_logger
# Module-level logger; used by the Vespa readiness probe in this file.
logger = setup_logger()
# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
# See here for reference: https://docs.vespa.ai/en/documents.html
# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
# Define allowed ASCII characters
# Lookup table indexed by ASCII code point (0x00-0x7F); an entry is True when
# that character is allowed. Built with a comprehension so that no loop
# variable is left behind in the module namespace (a module-level `for` loop
# would leak its index and expose it to star-imports).
ALLOWED_ASCII_CHARS: list[bool] = [
    codepoint in (0x9, 0xA, 0xD)  # tab, newline, carriage return
    or 0x20 <= codepoint <= 0x7F  # printable ASCII chars, plus del (0x7F) - discouraged, but allowed
    for codepoint in range(0x80)
]
def is_text_character(codepoint: int) -> bool:
    """Returns whether the given codepoint is a valid text character.

    Rejected codepoints:
      * negative values
      * ASCII codepoints not marked as allowed in ALLOWED_ASCII_CHARS
      * the surrogate range 0xD800-0xDFFF
      * the range 0xFDD0-0xFDEF
      * anything at or above 0x10FFFE
      * any codepoint whose low 16 bits are 0xFFFE or 0xFFFF
    Everything else is accepted.
    """
    if codepoint < 0:
        # Without this guard a negative value would index ALLOWED_ASCII_CHARS
        # from the end (e.g. -1 would report the entry for 0x7F) or raise
        # IndexError below -0x80.
        return False
    if codepoint < 0x80:
        return ALLOWED_ASCII_CHARS[codepoint]
    if codepoint < 0xD800:
        return True
    if codepoint <= 0xDFFF:
        return False
    if codepoint < 0xFDD0:
        return True
    if codepoint <= 0xFDEF:
        return False
    if codepoint >= 0x10FFFE:
        return False
    # Within each 0x10000-sized plane, the last two codepoints are rejected.
    return (codepoint & 0xFFFF) < 0xFFFE
def replace_invalid_doc_id_characters(text: str) -> str:
    """Return `text` with every single quote swapped for an underscore.

    NOTE: this must be called at the start of every vespa-related operation or
    else we risk discrepancies -> silent failures on deletion/update/insertion.
    """
    # Single quotes are the only character users have reported problems with;
    # the Vespa docs are unclear on whether a more complete set of
    # replacements is needed.
    pieces = text.split("'")
    sanitized = "_".join(pieces)
    return sanitized
# Compiled once at import time instead of on every call. Matches control
# characters 0x00-0x1F other than tab/newline/carriage return, the surrogate
# range U+D800-U+DFFF, the range U+FDD0-U+FDEF, and U+FFFE / U+FFFF.
_ILLEGAL_XML_CHARS_RE: re.Pattern[str] = re.compile(
    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
)


def remove_invalid_unicode_chars(text: str) -> str:
    """Vespa does not take in unicode chars that aren't valid for XML.
    This removes them.

    Every character matched by _ILLEGAL_XML_CHARS_RE is dropped; all other
    characters are returned unchanged and in their original order.
    """
    return _ILLEGAL_XML_CHARS_RE.sub("", text)
def get_vespa_http_client(no_timeout: bool = False, http2: bool = True) -> httpx.Client:
    """Build an httpx client for talking to Vespa.

    When MANAGED_VESPA is set, the client is given the cloud cert/key path
    pair and certificate verification is enabled; otherwise no client
    certificate is used and verification is disabled.

    `no_timeout` turns off the request timeout entirely (VESPA_REQUEST_TIMEOUT
    is used otherwise), and `http2` is passed straight through to httpx.
    """
    request_timeout = None if no_timeout else VESPA_REQUEST_TIMEOUT

    if not MANAGED_VESPA:
        return httpx.Client(
            cert=None,
            verify=False,
            timeout=request_timeout,
            http2=http2,
        )

    cloud_cert = cast(tuple[str, str], (VESPA_CLOUD_CERT_PATH, VESPA_CLOUD_KEY_PATH))
    return httpx.Client(
        cert=cloud_cert,
        verify=True,
        timeout=request_timeout,
        http2=http2,
    )
def wait_for_vespa_with_timeout(wait_interval: int = 5, wait_limit: int = 60) -> bool:
    """Waits for Vespa to become ready subject to a timeout.

    Repeatedly queries the `/state/v1/health` endpoint until it reports the
    status code "up". The time limit is checked after each attempt, so at
    least one probe is always made.

    Args:
        wait_interval: Seconds to sleep between probe attempts.
        wait_limit: Seconds after which probing gives up.

    Returns:
        True if Vespa is ready, False otherwise.
    """
    time_start = time.monotonic()

    logger.info("Vespa: Readiness probe starting.")
    while True:
        try:
            # Use the client as a context manager so it is closed after every
            # attempt rather than leaking one open client per loop iteration.
            with get_vespa_http_client() as client:
                response = client.get(f"{VESPA_APP_CONTAINER_URL}/state/v1/health")
                response.raise_for_status()
                response_dict = response.json()

            if response_dict["status"]["code"] == "up":
                logger.info("Vespa: Readiness probe succeeded. Continuing...")
                return True
        except Exception as e:
            # Best-effort probe: any failure (connection error, bad status,
            # unexpected payload) is treated as "not ready yet" and retried.
            logger.debug(f"Vespa: Readiness probe attempt failed: {e}")

        time_elapsed = time.monotonic() - time_start
        if time_elapsed > wait_limit:
            logger.info(
                f"Vespa: Readiness probe did not succeed within the timeout "
                f"({wait_limit} seconds)."
            )
            return False

        logger.info(
            f"Vespa: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={wait_limit:.1f}"
        )

        time.sleep(wait_interval)