mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-15 18:40:56 +02:00
113 lines
4.0 KiB
Python
113 lines
4.0 KiB
Python
import re
|
|
import time
|
|
from typing import cast
|
|
|
|
import httpx
|
|
|
|
from onyx.configs.app_configs import MANAGED_VESPA
|
|
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
|
|
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
|
|
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
|
|
from onyx.document_index.vespa_constants import VESPA_APP_CONTAINER_URL
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
# Module-level logger used by the helpers in this file.
logger = setup_logger()
|
|
|
|
# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
|
|
# See here for reference: https://docs.vespa.ai/en/documents.html
|
|
# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
|
|
|
|
# Lookup table for the ASCII range: ALLOWED_ASCII_CHARS[cp] is True when the
# codepoint `cp` (0x00-0x7F) is accepted as a text character.
ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80
for i in range(0x20, 0x7F):
    ALLOWED_ASCII_CHARS[i] = True  # printable ASCII chars
# The only accepted control characters: tab, newline and carriage return
ALLOWED_ASCII_CHARS[0x9] = ALLOWED_ASCII_CHARS[0xA] = ALLOWED_ASCII_CHARS[0xD] = True
ALLOWED_ASCII_CHARS[0x7F] = True  # del - discouraged, but allowed
|
|
|
|
|
|
def is_text_character(codepoint: int) -> bool:
    """Return whether the given codepoint is a valid text character.

    Args:
        codepoint: Integer Unicode codepoint (e.g. the result of ``ord(ch)``).

    Returns:
        True if the codepoint is accepted as text, False otherwise.
        Rejected values are:
          * negative numbers (not codepoints at all),
          * ASCII codepoints not enabled in ``ALLOWED_ASCII_CHARS``,
          * the surrogate range 0xD800-0xDFFF,
          * the range 0xFDD0-0xFDEF,
          * anything >= 0x10FFFE,
          * any codepoint whose low 16 bits are 0xFFFE or 0xFFFF.
    """
    if codepoint < 0:
        # Without this guard a negative value would silently index
        # ALLOWED_ASCII_CHARS from the end (or raise IndexError below -128).
        return False
    if codepoint < 0x80:
        return ALLOWED_ASCII_CHARS[codepoint]
    if codepoint < 0xD800:
        return True
    if codepoint <= 0xDFFF:
        # surrogate halves are never valid on their own
        return False
    if codepoint < 0xFDD0:
        return True
    if codepoint <= 0xFDEF:
        return False
    if codepoint >= 0x10FFFE:
        return False
    # Reject the last two codepoints of every 64K plane (xxFFFE / xxFFFF).
    return (codepoint & 0xFFFF) < 0xFFFE
|
|
|
|
|
|
def replace_invalid_doc_id_characters(text: str) -> str:
    """Return `text` with characters that are invalid in a document ID replaced.

    NOTE: this must be called at the start of every vespa-related operation or else we
    risk discrepancies -> silent failures on deletion/update/insertion.
    """
    # Only the single quote is substituted (with an underscore). The Vespa docs
    # are unclear on whether more characters need handling, and single quotes
    # are the only case users seem to be running into.
    substitutions = {ord("'"): "_"}
    return text.translate(substitutions)
|
|
|
|
|
|
# Compiled once at import time instead of on every call. Matches the control
# characters 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, the surrogate range
# 0xD800-0xDFFF, the range 0xFDD0-0xFDEF, and 0xFFFE / 0xFFFF.
_ILLEGAL_XML_CHARS_RE: re.Pattern = re.compile(
    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
)


def remove_invalid_unicode_chars(text: str) -> str:
    """Strip characters that Vespa rejects because they aren't valid for XML.

    Args:
        text: The string to clean.

    Returns:
        `text` with every character matched by ``_ILLEGAL_XML_CHARS_RE``
        removed. Tab, newline and carriage return are kept.
    """
    return _ILLEGAL_XML_CHARS_RE.sub("", text)
|
|
|
|
|
|
def get_vespa_http_client(no_timeout: bool = False, http2: bool = True) -> httpx.Client:
    """Build an `httpx.Client` configured for talking to Vespa.

    When MANAGED_VESPA is set, the client presents the configured cloud
    certificate/key pair and verifies TLS; otherwise no client certificate is
    used and verification is turned off.

    Args:
        no_timeout: If True, the client is created with no request timeout;
            otherwise VESPA_REQUEST_TIMEOUT is applied.
        http2: Passed straight through to `httpx.Client`.

    Returns:
        A new `httpx.Client`.
    """
    # Client certificate is only supplied for managed deployments.
    client_cert = (
        cast(tuple[str, str], (VESPA_CLOUD_CERT_PATH, VESPA_CLOUD_KEY_PATH))
        if MANAGED_VESPA
        else None
    )
    verify_tls = bool(MANAGED_VESPA)
    request_timeout = VESPA_REQUEST_TIMEOUT if not no_timeout else None

    return httpx.Client(
        cert=client_cert,
        verify=verify_tls,
        timeout=request_timeout,
        http2=http2,
    )
|
|
|
|
|
|
def wait_for_vespa_with_timeout(wait_interval: int = 5, wait_limit: int = 60) -> bool:
    """Wait for Vespa to become ready, subject to a timeout.

    Polls ``{VESPA_APP_CONTAINER_URL}/state/v1/health`` until the reported
    status code is "up" or the time limit is exceeded.

    Args:
        wait_interval: Seconds to sleep between probe attempts.
        wait_limit: Total seconds after which the probe gives up. It is
            checked after each attempt, so at least one attempt is always made.

    Returns:
        True if Vespa is ready, False otherwise.
    """
    time_start = time.monotonic()
    logger.info("Vespa: Readiness probe starting.")
    while True:
        try:
            # Context manager so each attempt's client (and its connections)
            # is closed rather than leaked across retries.
            with get_vespa_http_client() as client:
                response = client.get(f"{VESPA_APP_CONTAINER_URL}/state/v1/health")
                response.raise_for_status()

                response_dict = response.json()
                if response_dict["status"]["code"] == "up":
                    logger.info("Vespa: Readiness probe succeeded. Continuing...")
                    return True
        except Exception as e:
            # Best-effort probe: any failure (connection refused, bad status,
            # malformed body) just means "not ready yet". Keep retrying, but
            # leave a trace of the reason for debugging.
            logger.debug(f"Vespa: Readiness probe attempt failed: {e}")

        time_elapsed = time.monotonic() - time_start
        if time_elapsed > wait_limit:
            logger.info(
                f"Vespa: Readiness probe did not succeed within the timeout "
                f"({wait_limit} seconds)."
            )
            return False

        logger.info(
            f"Vespa: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={wait_limit:.1f}"
        )

        time.sleep(wait_interval)
|