add timings for syncing (#3798)

* add timings for syncing

* add more logging

* more debugging

* refactor multipass/db check out of VespaIndex

* circular imports?

* more debugging

* add logs

* various improvements

* additional logs to narrow down issue

* use global httpx pool for the main vespa flows in celery. Use in more places eventually.

* cleanup debug logging, etc

* remove debug logging

* this should use the secondary index

* mypy

* missed some logging

* review fixes

* refactor get_default_document_index to use search settings

* more missed logging

* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
This commit is contained in:
rkuo-danswer 2025-01-29 15:24:44 -08:00 committed by GitHub
parent d35f93b233
commit 4fe99d05fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 489 additions and 215 deletions

View File

@ -24,6 +24,7 @@ from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.engine import get_sqlalchemy_engine
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
from onyx.httpx.httpx_pool import HttpxPool
from onyx.redis.redis_connector import RedisConnector
from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from onyx.redis.redis_connector_delete import RedisConnectorDelete
@ -316,6 +317,8 @@ def on_worker_ready(sender: Any, **kwargs: Any) -> None:
def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
HttpxPool.close_all()
if not celery_is_worker_primary(sender):
return

View File

@ -10,6 +10,10 @@ from celery.signals import worker_ready
from celery.signals import worker_shutdown
import onyx.background.celery.apps.app_base as app_base
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
from onyx.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME
from onyx.db.engine import SqlEngine
from onyx.utils.logger import setup_logger
@ -54,12 +58,23 @@ def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
@worker_init.connect
def on_worker_init(sender: Worker, **kwargs: Any) -> None:
EXTRA_CONCURRENCY = 8 # small extra fudge factor for connection limits
logger.info("worker_init signal received.")
logger.info(f"Concurrency: {sender.concurrency}") # type: ignore
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8) # type: ignore
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=EXTRA_CONCURRENCY) # type: ignore
if MANAGED_VESPA:
httpx_init_vespa_pool(
sender.concurrency + EXTRA_CONCURRENCY, # type: ignore
ssl_cert=VESPA_CLOUD_CERT_PATH,
ssl_key=VESPA_CLOUD_KEY_PATH,
)
else:
httpx_init_vespa_pool(sender.concurrency + EXTRA_CONCURRENCY) # type: ignore
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)

View File

@ -1,10 +1,13 @@
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
import httpx
from sqlalchemy.orm import Session
from onyx.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
@ -17,6 +20,7 @@ from onyx.db.connector_credential_pair import get_connector_credential_pair
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import TaskStatus
from onyx.db.models import TaskQueueState
from onyx.httpx.httpx_pool import HttpxPool
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.redis.redis_connector import RedisConnector
from onyx.server.documents.models import DeletionAttemptSnapshot
@ -154,3 +158,25 @@ def celery_is_worker_primary(worker: Any) -> bool:
return True
return False
def httpx_init_vespa_pool(
    max_keepalive_connections: int,
    timeout: int = VESPA_REQUEST_TIMEOUT,
    ssl_cert: str | None = None,
    ssl_key: str | None = None,
) -> None:
    """Register the shared "vespa" httpx client in HttpxPool.

    When both ssl_cert and ssl_key are given, the client presents that
    cert/key pair and enables TLS verification; otherwise it connects
    with no client cert and verification disabled.
    """
    use_mtls = bool(ssl_cert and ssl_key)

    cert_pair: tuple[str, str] | None = None
    if use_mtls:
        # mypy: both values are known non-None inside this branch
        cert_pair = cast(tuple[str, str], (ssl_cert, ssl_key))

    HttpxPool.init_client(
        name="vespa",
        cert=cert_pair,
        verify=use_mtls,
        timeout=timeout,
        http2=False,
        limits=httpx.Limits(max_keepalive_connections=max_keepalive_connections),
    )

View File

@ -15,6 +15,7 @@ from redis import Redis
from redis.lock import Lock as RedisLock
from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
from onyx.background.celery.tasks.indexing.utils import _should_index
from onyx.background.celery.tasks.indexing.utils import get_unfenced_index_attempt_ids
from onyx.background.celery.tasks.indexing.utils import IndexingCallback
@ -22,6 +23,9 @@ from onyx.background.celery.tasks.indexing.utils import try_creating_indexing_ta
from onyx.background.celery.tasks.indexing.utils import validate_indexing_fences
from onyx.background.indexing.job_client import SimpleJobClient
from onyx.background.indexing.run_indexing import run_indexing_entrypoint
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
from onyx.configs.constants import CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT
@ -37,8 +41,7 @@ from onyx.db.index_attempt import get_index_attempt
from onyx.db.index_attempt import get_last_attempt_for_cc_pair
from onyx.db.index_attempt import mark_attempt_canceled
from onyx.db.index_attempt import mark_attempt_failed
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_active_search_settings_list
from onyx.db.search_settings import get_current_search_settings
from onyx.db.swap_index import check_index_swap
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
@ -121,9 +124,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
redis_connector = RedisConnector(tenant_id, cc_pair_id)
with get_session_with_tenant(tenant_id) as db_session:
search_settings_list: list[SearchSettings] = get_active_search_settings(
db_session
)
search_settings_list = get_active_search_settings_list(db_session)
for search_settings_instance in search_settings_list:
redis_connector_index = redis_connector.new_index(
search_settings_instance.id
@ -303,6 +304,14 @@ def connector_indexing_task(
attempt_found = False
n_final_progress: int | None = None
# 20 is the documented default for httpx max_keepalive_connections
if MANAGED_VESPA:
httpx_init_vespa_pool(
20, ssl_cert=VESPA_CLOUD_CERT_PATH, ssl_key=VESPA_CLOUD_KEY_PATH
)
else:
httpx_init_vespa_pool(20)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
redis_connector_index = redis_connector.new_index(search_settings_id)

View File

@ -34,7 +34,7 @@ from onyx.db.models import DocumentSet
from onyx.db.models import IndexAttempt
from onyx.db.models import SyncRecord
from onyx.db.models import UserGroup
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_active_search_settings_list
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_pool import redis_lock_dump
from onyx.utils.telemetry import optional_telemetry
@ -315,13 +315,13 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
# Get all connector credential pairs
cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
# Might be more than one search setting, or just one
active_search_settings = get_active_search_settings(db_session)
active_search_settings_list = get_active_search_settings_list(db_session)
metrics = []
# If you want to process each cc_pair against each search setting:
for cc_pair in cc_pairs:
for search_settings in active_search_settings:
for search_settings in active_search_settings_list:
recent_attempts = (
db_session.query(IndexAttempt)
.filter(

View File

@ -27,9 +27,10 @@ from onyx.db.document import mark_document_as_synced
from onyx.db.document_set import fetch_document_sets_for_document
from onyx.db.engine import get_all_tenant_ids
from onyx.db.engine import get_session_with_tenant
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.db.search_settings import get_active_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.httpx.httpx_pool import HttpxPool
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_pool import redis_lock_dump
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
@ -79,9 +80,11 @@ def document_by_cc_pair_cleanup_task(
action = "skip"
chunks_affected = 0
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
active_search_settings = get_active_search_settings(db_session)
doc_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
active_search_settings.primary,
active_search_settings.secondary,
httpx_client=HttpxPool.get("vespa"),
)
retry_index = RetryDocumentIndex(doc_index)

View File

@ -61,12 +61,13 @@ from onyx.db.index_attempt import get_index_attempt
from onyx.db.index_attempt import mark_attempt_failed
from onyx.db.models import DocumentSet
from onyx.db.models import UserGroup
from onyx.db.search_settings import get_active_search_settings
from onyx.db.sync_record import cleanup_sync_records
from onyx.db.sync_record import insert_sync_record
from onyx.db.sync_record import update_sync_record_status
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.httpx.httpx_pool import HttpxPool
from onyx.redis.redis_connector import RedisConnector
from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from onyx.redis.redis_connector_delete import RedisConnectorDelete
@ -1096,9 +1097,11 @@ def vespa_metadata_sync_task(
try:
with get_session_with_tenant(tenant_id) as db_session:
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
active_search_settings = get_active_search_settings(db_session)
doc_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
search_settings=active_search_settings.primary,
secondary_search_settings=active_search_settings.secondary,
httpx_client=HttpxPool.get("vespa"),
)
retry_index = RetryDocumentIndex(doc_index)

View File

@ -35,6 +35,7 @@ from onyx.db.models import IndexAttempt
from onyx.db.models import IndexingStatus
from onyx.db.models import IndexModelStatus
from onyx.document_index.factory import get_default_document_index
from onyx.httpx.httpx_pool import HttpxPool
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
@ -219,9 +220,10 @@ def _run_indexing(
callback=callback,
)
# Indexing is only done into one index at a time
document_index = get_default_document_index(
primary_index_name=ctx.index_name, secondary_index_name=None
index_attempt_start.search_settings,
None,
httpx_client=HttpxPool.get("vespa"),
)
indexing_pipeline = build_indexing_pipeline(

View File

@ -426,9 +426,7 @@ def stream_chat_message_objects(
)
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
document_index = get_default_document_index(search_settings, None)
# Every chat Session begins with an empty root message
root_message = get_or_create_root_message(

View File

@ -67,10 +67,7 @@ class SearchPipeline:
self.rerank_metrics_callback = rerank_metrics_callback
self.search_settings = get_current_search_settings(db_session)
self.document_index = get_default_document_index(
primary_index_name=self.search_settings.index_name,
secondary_index_name=None,
)
self.document_index = get_default_document_index(self.search_settings, None)
self.prompt_config: PromptConfig | None = prompt_config
# Preprocessing steps generate this

View File

@ -747,6 +747,34 @@ class SearchSettings(Base):
def api_key(self) -> str | None:
return self.cloud_provider.api_key if self.cloud_provider is not None else None
@property
def large_chunks_enabled(self) -> bool:
    """Whether large chunks are allowed for this settings row.

    Delegates to can_use_large_chunks using this row's multipass flag,
    model name, and embedding provider.
    """

    # Only local models that support a larger context are from Nomic
    # Cohere does not support larger contexts (they recommend not going above ~512 tokens)
    return SearchSettings.can_use_large_chunks(
        self.multipass_indexing, self.model_name, self.provider_type
    )
@staticmethod
def can_use_large_chunks(
    multipass: bool, model_name: str, provider_type: EmbeddingProvider | None
) -> bool:
    """Decide whether large chunks are allowed for the given embedder.

    Large chunks require multipass indexing, a Nomic model (the only
    local models that support a larger context), and a provider other
    than Cohere (which recommends not going above ~512 tokens).
    """
    if not multipass:
        return False
    if not model_name.startswith("nomic-ai"):
        return False
    return provider_type != EmbeddingProvider.COHERE
class IndexAttempt(Base):
"""

View File

@ -29,9 +29,21 @@ from onyx.utils.logger import setup_logger
from shared_configs.configs import PRESERVED_SEARCH_FIELDS
from shared_configs.enums import EmbeddingProvider
logger = setup_logger()
class ActiveSearchSettings:
    """Pairs the current (primary) search settings with the optional
    migration-target (secondary) settings.

    # primary: the settings currently used for querying/indexing
    # secondary: the settings being migrated to, or None if no migration
    """

    primary: SearchSettings
    secondary: SearchSettings | None

    def __init__(
        self, primary: SearchSettings, secondary: SearchSettings | None
    ) -> None:
        self.primary = primary
        self.secondary = secondary
def create_search_settings(
search_settings: SavedSearchSettings,
db_session: Session,
@ -143,21 +155,27 @@ def get_secondary_search_settings(db_session: Session) -> SearchSettings | None:
return latest_settings
def get_active_search_settings(db_session: Session) -> list[SearchSettings]:
"""Returns active search settings. The first entry will always be the current search
settings. If there are new search settings that are being migrated to, those will be
the second entry."""
def get_active_search_settings(db_session: Session) -> ActiveSearchSettings:
    """Fetch the current and (possibly absent) secondary search settings.

    The returned object's secondary attribute is None when there are no
    migration-target settings.
    """
    return ActiveSearchSettings(
        primary=get_current_search_settings(db_session),
        secondary=get_secondary_search_settings(db_session),
    )
def get_active_search_settings_list(db_session: Session) -> list[SearchSettings]:
    """Return active search settings as a list.

    The primary (current) settings are always the first element; if
    secondary (migration-target) settings exist, they are the second.

    Fix: the span previously interleaved the legacy implementation
    (direct get_current/get_secondary fetch + append) with the new
    delegation to get_active_search_settings, which would have appended
    each settings row twice. Only the delegating implementation remains.
    """
    search_settings_list: list[SearchSettings] = []

    active_search_settings = get_active_search_settings(db_session)
    search_settings_list.append(active_search_settings.primary)
    if active_search_settings.secondary:
        search_settings_list.append(active_search_settings.secondary)

    return search_settings_list

View File

@ -4,24 +4,63 @@ from uuid import UUID
from sqlalchemy.orm import Session
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import MultipassConfig
from shared_configs.configs import MULTI_TENANT
DEFAULT_BATCH_SIZE = 30
DEFAULT_INDEX_NAME = "danswer_chunk"
def get_both_index_names(db_session: Session) -> tuple[str, str | None]:
def should_use_multipass(search_settings: SearchSettings | None) -> bool:
    """Resolve the multipass-indexing flag.

    Prefers the value stored on the given search settings; falls back to
    the ENABLE_MULTIPASS_INDEXING config default when settings are
    unavailable.
    """
    if search_settings is None:
        return ENABLE_MULTIPASS_INDEXING
    return search_settings.multipass_indexing
def get_multipass_config(search_settings: SearchSettings | None) -> MultipassConfig:
    """Derive the multipass/large-chunk configuration for an index.

    Returns a fully-disabled config when no search settings are
    available; otherwise combines the resolved multipass flag with the
    embedder constraints checked by SearchSettings.can_use_large_chunks.

    Fix: the parameter was annotated as non-optional `SearchSettings`
    while the body guards `if not search_settings` — the annotation now
    matches the actual behavior (accepting None is backward-compatible).
    """
    if not search_settings:
        return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)

    multipass = should_use_multipass(search_settings)
    enable_large_chunks = SearchSettings.can_use_large_chunks(
        multipass, search_settings.model_name, search_settings.provider_type
    )
    return MultipassConfig(
        multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
    )
def get_both_index_properties(
    db_session: Session,
) -> tuple[str, str | None, bool, bool | None]:
    """Return index properties for the current and secondary settings.

    Returns a 4-tuple: (primary index name, secondary index name or
    None, primary large-chunks-enabled flag, secondary
    large-chunks-enabled flag or None).
    """
    search_settings = get_current_search_settings(db_session)
    config_1 = get_multipass_config(search_settings)

    search_settings_new = get_secondary_search_settings(db_session)
    if not search_settings_new:
        return search_settings.index_name, None, config_1.enable_large_chunks, None

    # BUGFIX: the secondary config must be derived from the secondary
    # settings; previously this passed the primary `search_settings`,
    # so the secondary index inherited the primary's large-chunk flag.
    config_2 = get_multipass_config(search_settings_new)
    return (
        search_settings.index_name,
        search_settings_new.index_name,
        config_1.enable_large_chunks,
        config_2.enable_large_chunks,
    )
def translate_boost_count_to_multiplier(boost: int) -> float:

View File

@ -1,5 +1,7 @@
import httpx
from sqlalchemy.orm import Session
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.vespa.index import VespaIndex
@ -7,17 +9,28 @@ from shared_configs.configs import MULTI_TENANT
def get_default_document_index(
    search_settings: SearchSettings,
    secondary_search_settings: SearchSettings | None,
    httpx_client: httpx.Client | None = None,
) -> DocumentIndex:
    """Primary index is the index that is used for querying/updating etc.
    Secondary index is for when both the currently used index and the upcoming
    index both need to be updated, updates are applied to both indices.

    Fix: the span mixed the removed signature (primary_index_name /
    secondary_index_name params) and a duplicate `index_name=` keyword
    from the old call into the new search-settings-based version; only
    the new implementation remains.
    """
    secondary_index_name: str | None = None
    secondary_large_chunks_enabled: bool | None = None
    if secondary_search_settings:
        secondary_index_name = secondary_search_settings.index_name
        secondary_large_chunks_enabled = secondary_search_settings.large_chunks_enabled

    # Currently only supporting Vespa
    return VespaIndex(
        index_name=search_settings.index_name,
        secondary_index_name=secondary_index_name,
        large_chunks_enabled=search_settings.large_chunks_enabled,
        secondary_large_chunks_enabled=secondary_large_chunks_enabled,
        multitenant=MULTI_TENANT,
        httpx_client=httpx_client,
    )
@ -27,6 +40,6 @@ def get_current_primary_default_document_index(db_session: Session) -> DocumentI
"""
search_settings = get_current_search_settings(db_session)
return get_default_document_index(
primary_index_name=search_settings.index_name,
secondary_index_name=None,
search_settings,
None,
)

View File

@ -231,21 +231,22 @@ def _get_chunks_via_visit_api(
return document_chunks
@retry(tries=10, delay=1, backoff=2)
def get_all_vespa_ids_for_document_id(
document_id: str,
index_name: str,
filters: IndexFilters | None = None,
get_large_chunks: bool = False,
) -> list[str]:
document_chunks = _get_chunks_via_visit_api(
chunk_request=VespaChunkRequest(document_id=document_id),
index_name=index_name,
filters=filters or IndexFilters(access_control_list=None),
field_names=[DOCUMENT_ID],
get_large_chunks=get_large_chunks,
)
return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]
# TODO(rkuo): candidate for removal if not being used
# @retry(tries=10, delay=1, backoff=2)
# def get_all_vespa_ids_for_document_id(
# document_id: str,
# index_name: str,
# filters: IndexFilters | None = None,
# get_large_chunks: bool = False,
# ) -> list[str]:
# document_chunks = _get_chunks_via_visit_api(
# chunk_request=VespaChunkRequest(document_id=document_id),
# index_name=index_name,
# filters=filters or IndexFilters(access_control_list=None),
# field_names=[DOCUMENT_ID],
# get_large_chunks=get_large_chunks,
# )
# return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]
def parallel_visit_api_retrieval(

View File

@ -25,7 +25,6 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
from onyx.configs.constants import KV_REINDEX_KEY
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.db.engine import get_session_with_tenant
from onyx.document_index.document_index_utils import get_document_chunk_ids
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentInsertionRecord
@ -41,12 +40,12 @@ from onyx.document_index.vespa.chunk_retrieval import (
)
from onyx.document_index.vespa.chunk_retrieval import query_vespa
from onyx.document_index.vespa.deletion import delete_vespa_chunks
from onyx.document_index.vespa.indexing_utils import BaseHTTPXClientContext
from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
from onyx.document_index.vespa.indexing_utils import (
get_multipass_config,
)
from onyx.document_index.vespa.indexing_utils import GlobalHTTPXClientContext
from onyx.document_index.vespa.indexing_utils import TemporaryHTTPXClientContext
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
@ -132,12 +131,34 @@ class VespaIndex(DocumentIndex):
self,
index_name: str,
secondary_index_name: str | None,
large_chunks_enabled: bool,
secondary_large_chunks_enabled: bool | None,
multitenant: bool = False,
httpx_client: httpx.Client | None = None,
) -> None:
self.index_name = index_name
self.secondary_index_name = secondary_index_name
self.large_chunks_enabled = large_chunks_enabled
self.secondary_large_chunks_enabled = secondary_large_chunks_enabled
self.multitenant = multitenant
self.http_client = get_vespa_http_client()
self.httpx_client_context: BaseHTTPXClientContext
if httpx_client:
self.httpx_client_context = GlobalHTTPXClientContext(httpx_client)
else:
self.httpx_client_context = TemporaryHTTPXClientContext(
get_vespa_http_client
)
self.index_to_large_chunks_enabled: dict[str, bool] = {}
self.index_to_large_chunks_enabled[index_name] = large_chunks_enabled
if secondary_index_name and secondary_large_chunks_enabled:
self.index_to_large_chunks_enabled[
secondary_index_name
] = secondary_large_chunks_enabled
def ensure_indices_exist(
self,
@ -331,7 +352,7 @@ class VespaIndex(DocumentIndex):
# indexing / updates / deletes since we have to make a large volume of requests.
with (
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
get_vespa_http_client() as http_client,
self.httpx_client_context as http_client,
):
# We require the start and end index for each document in order to
# know precisely which chunks to delete. This information exists for
@ -390,9 +411,11 @@ class VespaIndex(DocumentIndex):
for doc_id in all_doc_ids
}
@staticmethod
@classmethod
def _apply_updates_batched(
cls,
updates: list[_VespaUpdateRequest],
httpx_client: httpx.Client,
batch_size: int = BATCH_SIZE,
) -> None:
"""Runs a batch of updates in parallel via the ThreadPoolExecutor."""
@ -414,7 +437,7 @@ class VespaIndex(DocumentIndex):
with (
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
get_vespa_http_client() as http_client,
httpx_client as http_client,
):
for update_batch in batch_generator(updates, batch_size):
future_to_document_id = {
@ -455,7 +478,7 @@ class VespaIndex(DocumentIndex):
index_names.append(self.secondary_index_name)
chunk_id_start_time = time.monotonic()
with get_vespa_http_client() as http_client:
with self.httpx_client_context as http_client:
for update_request in update_requests:
for doc_info in update_request.minimal_document_indexing_info:
for index_name in index_names:
@ -511,7 +534,8 @@ class VespaIndex(DocumentIndex):
)
)
self._apply_updates_batched(processed_updates_requests)
with self.httpx_client_context as httpx_client:
self._apply_updates_batched(processed_updates_requests, httpx_client)
logger.debug(
"Finished updating Vespa documents in %.2f seconds",
time.monotonic() - update_start,
@ -523,6 +547,7 @@ class VespaIndex(DocumentIndex):
index_name: str,
fields: VespaDocumentFields,
doc_id: str,
http_client: httpx.Client,
) -> None:
"""
Update a single "chunk" (document) in Vespa using its chunk ID.
@ -554,18 +579,17 @@ class VespaIndex(DocumentIndex):
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
with get_vespa_http_client(http2=False) as http_client:
try:
resp = http_client.put(
vespa_url,
headers={"Content-Type": "application/json"},
json=update_dict,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
logger.error(error_message)
raise
try:
resp = http_client.put(
vespa_url,
headers={"Content-Type": "application/json"},
json=update_dict,
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
logger.error(error_message)
raise
def update_single(
self,
@ -579,24 +603,16 @@ class VespaIndex(DocumentIndex):
function will complete with no errors or exceptions.
Handle other exceptions if you wish to implement retry behavior
"""
doc_chunk_count = 0
index_names = [self.index_name]
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
with get_vespa_http_client(http2=False) as http_client:
for index_name in index_names:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
multipass_config = get_multipass_config(
db_session=db_session,
primary_index=index_name == self.index_name,
)
large_chunks_enabled = multipass_config.enable_large_chunks
with self.httpx_client_context as httpx_client:
for (
index_name,
large_chunks_enabled,
) in self.index_to_large_chunks_enabled.items():
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
index_name=index_name,
http_client=http_client,
http_client=httpx_client,
document_id=doc_id,
previous_chunk_count=chunk_count,
new_chunk_count=0,
@ -612,10 +628,7 @@ class VespaIndex(DocumentIndex):
for doc_chunk_id in doc_chunk_ids:
self.update_single_chunk(
doc_chunk_id=doc_chunk_id,
index_name=index_name,
fields=fields,
doc_id=doc_id,
doc_chunk_id, index_name, fields, doc_id, httpx_client
)
return doc_chunk_count
@ -637,19 +650,13 @@ class VespaIndex(DocumentIndex):
if self.secondary_index_name:
index_names.append(self.secondary_index_name)
with get_vespa_http_client(
http2=False
) as http_client, concurrent.futures.ThreadPoolExecutor(
with self.httpx_client_context as http_client, concurrent.futures.ThreadPoolExecutor(
max_workers=NUM_THREADS
) as executor:
for index_name in index_names:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
multipass_config = get_multipass_config(
db_session=db_session,
primary_index=index_name == self.index_name,
)
large_chunks_enabled = multipass_config.enable_large_chunks
for (
index_name,
large_chunks_enabled,
) in self.index_to_large_chunks_enabled.items():
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
index_name=index_name,
http_client=http_client,
@ -818,6 +825,9 @@ class VespaIndex(DocumentIndex):
"""
Deletes all entries in the specified index with the given tenant_id.
Currently unused, but we anticipate this being useful. The entire flow does not
use the httpx connection pool of an instance.
Parameters:
tenant_id (str): The tenant ID whose documents are to be deleted.
index_name (str): The name of the index from which to delete documents.
@ -850,6 +860,8 @@ class VespaIndex(DocumentIndex):
"""
Retrieves all document IDs with the specified tenant_id, handling pagination.
Internal helper function for delete_entries_by_tenant_id.
Parameters:
tenant_id (str): The tenant ID to search for.
index_name (str): The name of the index to search in.
@ -882,8 +894,8 @@ class VespaIndex(DocumentIndex):
f"Querying for document IDs with tenant_id: {tenant_id}, offset: {offset}"
)
with get_vespa_http_client(no_timeout=True) as http_client:
response = http_client.get(url, params=query_params)
with get_vespa_http_client() as http_client:
response = http_client.get(url, params=query_params, timeout=None)
response.raise_for_status()
search_result = response.json()
@ -913,6 +925,11 @@ class VespaIndex(DocumentIndex):
"""
Deletes documents in batches using multiple threads.
Internal helper function for delete_entries_by_tenant_id.
This is a class method and does not use the httpx pool of the instance.
This is OK because we don't use this method often.
Parameters:
delete_requests (List[_VespaDeleteRequest]): The list of delete requests.
batch_size (int): The number of documents to delete in each batch.
@ -925,13 +942,14 @@ class VespaIndex(DocumentIndex):
response = http_client.delete(
delete_request.url,
headers={"Content-Type": "application/json"},
timeout=None,
)
response.raise_for_status()
logger.debug(f"Starting batch deletion for {len(delete_requests)} documents")
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
with get_vespa_http_client(no_timeout=True) as http_client:
with get_vespa_http_client() as http_client:
for batch_start in range(0, len(delete_requests), batch_size):
batch = delete_requests[batch_start : batch_start + batch_size]

View File

@ -1,21 +1,19 @@
import concurrent.futures
import json
import uuid
from abc import ABC
from abc import abstractmethod
from collections.abc import Callable
from datetime import datetime
from datetime import timezone
from http import HTTPStatus
import httpx
from retry import retry
from sqlalchemy.orm import Session
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.document_index_utils import get_uuid_from_chunk
from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
@ -50,10 +48,9 @@ from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import EmbeddingProvider
from onyx.indexing.models import MultipassConfig
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -275,46 +272,42 @@ def check_for_final_chunk_existence(
index += 1
def should_use_multipass(search_settings: SearchSettings | None) -> bool:
    """Decide whether multipass indexing should be enabled.

    Reads the flag from the supplied search settings; when no settings are
    available, falls back to the ENABLE_MULTIPASS_INDEXING default config.
    """
    if search_settings is None:
        return ENABLE_MULTIPASS_INDEXING
    return search_settings.multipass_indexing
class BaseHTTPXClientContext(ABC):
    """Interface for context managers that hand out an httpx.Client.

    Concrete subclasses decide the client's lifetime: shared clients are
    left open on exit, temporary ones are closed.
    """

    @abstractmethod
    def __enter__(self) -> httpx.Client:
        ...

    @abstractmethod
    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore
        ...
def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool:
    """Report whether large-chunk indexing is allowed.

    Large chunks require multipass mode plus a model that supports a longer
    context. Currently only Nomic local models qualify; Cohere is excluded
    (their guidance recommends staying under ~512 tokens).
    """
    if not multipass:
        return False
    if not search_settings.model_name.startswith("nomic-ai"):
        # Only local models with a larger context window are from Nomic.
        return False
    return search_settings.provider_type != EmbeddingProvider.COHERE
class GlobalHTTPXClientContext(BaseHTTPXClientContext):
    """Wraps a long-lived shared httpx client.

    Entering yields the wrapped client unchanged; exiting is a no-op so the
    shared client remains open for other callers.
    """

    def __init__(self, client: httpx.Client):
        # Held, never owned: whoever created the client manages its lifetime.
        self._client = client

    def __enter__(self) -> httpx.Client:
        return self._client

    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore
        # Intentionally do not close the shared/global client.
        pass
def get_multipass_config(
    db_session: Session, primary_index: bool = True
) -> MultipassConfig:
    """Derive the multipass/large-chunk configuration for an index.

    Looks up the current (primary) or secondary search settings and decides
    whether multipass indexing and large chunks should be enabled for them.

    Parameters:
        db_session: Active SQLAlchemy session used for the settings lookup.
        primary_index: When True, inspect the current search settings;
            otherwise inspect the secondary (reindexing) settings.

    Returns:
        MultipassConfig; both flags are disabled when no settings are found.
    """
    search_settings = (
        get_current_search_settings(db_session)
        if primary_index
        else get_secondary_search_settings(db_session)
    )
    # Guard first: without settings there is nothing to derive from.
    # (Previously `should_use_multipass` ran before this check and its
    # result was discarded on the early return.)
    if not search_settings:
        return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)

    multipass = should_use_multipass(search_settings)
    enable_large_chunks = can_use_large_chunks(multipass, search_settings)
    return MultipassConfig(
        multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
    )
class TemporaryHTTPXClientContext(BaseHTTPXClientContext):
    """Builds a throwaway httpx client on entry and closes it on exit."""

    def __init__(self, client_factory: Callable[[], httpx.Client]):
        self._client_factory = client_factory
        # Created lazily in __enter__ so construction stays cheap.
        self._client: httpx.Client | None = None

    def __enter__(self) -> httpx.Client:
        self._client = self._client_factory()
        return self._client

    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore
        client = self._client
        if client:
            client.close()

View File

@ -0,0 +1,57 @@
import threading
from typing import Any
import httpx
class HttpxPool:
    """Process-wide registry of named, shared httpx.Client instances.

    Clients are created lazily (or explicitly via ``init_client``) and reused
    so connection pools are shared across callers. All registry operations
    are guarded by a class-level lock so the pool is safe to use from
    multiple worker threads.
    """

    _clients: dict[str, httpx.Client] = {}
    _lock: threading.Lock = threading.Lock()

    # Default construction parameters. Values may be zero-arg factories
    # (callables) so that stateful defaults such as httpx.Limits are built
    # fresh for each client instead of one instance being shared.
    DEFAULT_KWARGS = {
        "http2": True,
        "limits": lambda: httpx.Limits(),
    }

    def __init__(self) -> None:
        pass

    @classmethod
    def _init_client(cls, **kwargs: Any) -> httpx.Client:
        """Create an httpx.Client from the defaults merged with overrides."""
        # Resolve factory defaults into real values. Previously the lambda
        # object itself was passed through as the `limits` argument, so
        # httpx never received an actual httpx.Limits instance.
        defaults = {
            key: (value() if callable(value) else value)
            for key, value in cls.DEFAULT_KWARGS.items()
        }
        return httpx.Client(**{**defaults, **kwargs})

    @classmethod
    def init_client(cls, name: str, **kwargs: Any) -> None:
        """Create the named client with extra params unless it already exists."""
        with cls._lock:
            if name not in cls._clients:
                cls._clients[name] = cls._init_client(**kwargs)

    @classmethod
    def close_client(cls, name: str) -> None:
        """Close and forget the named client; no-op if it was never created."""
        with cls._lock:
            client = cls._clients.pop(name, None)
            if client:
                client.close()

    @classmethod
    def close_all(cls) -> None:
        """Close every registered client and empty the registry."""
        with cls._lock:
            for client in cls._clients.values():
                client.close()
            cls._clients.clear()

    @classmethod
    def get(cls, name: str) -> httpx.Client:
        """Return the named client, creating it with default settings if needed."""
        with cls._lock:
            if name not in cls._clients:
                cls._clients[name] = cls._init_client()
            return cls._clients[name]

View File

@ -31,14 +31,15 @@ from onyx.db.document import upsert_documents
from onyx.db.document_set import fetch_document_sets_for_documents
from onyx.db.index_attempt import create_index_attempt_error
from onyx.db.models import Document as DBDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.db.tag import create_or_add_document_tag
from onyx.db.tag import create_or_add_document_tag_list
from onyx.document_index.document_index_utils import (
get_multipass_config,
)
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentMetadata
from onyx.document_index.interfaces import IndexBatchParams
from onyx.document_index.vespa.indexing_utils import (
get_multipass_config,
)
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import IndexingEmbedder
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
@ -357,7 +358,6 @@ def index_doc_batch(
is_public=False,
)
logger.debug("Filtering Documents")
filtered_documents = filter_fnc(document_batch)
ctx = index_doc_batch_prepare(
@ -527,7 +527,8 @@ def build_indexing_pipeline(
callback: IndexingHeartbeatInterface | None = None,
) -> IndexingPipelineProtocol:
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
multipass_config = get_multipass_config(db_session, primary_index=True)
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
chunker = chunker or Chunker(
tokenizer=embedder.embedding_model.tokenizer,

View File

@ -55,9 +55,7 @@ class DocAwareChunk(BaseChunk):
def to_short_descriptor(self) -> str:
"""Used when logging the identity of a chunk"""
return (
f"Chunk ID: '{self.chunk_id}'; {self.source_document.to_short_descriptor()}"
)
return f"{self.source_document.to_short_descriptor()} Chunk ID: {self.chunk_id}"
class IndexChunk(DocAwareChunk):

View File

@ -16,7 +16,7 @@ from onyx.context.search.preprocessing.access_filters import (
from onyx.db.document_set import get_document_sets_by_ids
from onyx.db.models import StarterMessageModel as StarterMessage
from onyx.db.models import User
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.db.search_settings import get_active_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.llm.factory import get_default_llms
from onyx.prompts.starter_messages import format_persona_starter_message_prompt
@ -34,8 +34,11 @@ def get_random_chunks_from_doc_sets(
"""
Retrieves random chunks from the specified document sets.
"""
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
document_index = get_default_document_index(curr_ind_name, sec_ind_name)
active_search_settings = get_active_search_settings(db_session)
document_index = get_default_document_index(
search_settings=active_search_settings.primary,
secondary_search_settings=active_search_settings.secondary,
)
acl_filters = build_access_filters_for_user(user, db_session)
filters = IndexFilters(document_set=doc_sets, access_control_list=acl_filters)

View File

@ -3,6 +3,7 @@ import json
import os
from typing import cast
from sqlalchemy import update
from sqlalchemy.orm import Session
from onyx.access.models import default_public_access
@ -23,6 +24,7 @@ from onyx.db.document import check_docs_exist
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.index_attempt import mock_successful_index_attempt
from onyx.db.models import Document as DbDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import IndexBatchParams
@ -59,6 +61,7 @@ def _create_indexable_chunks(
doc_updated_at=None,
primary_owners=[],
secondary_owners=[],
chunk_count=1,
)
if preprocessed_doc["chunk_ind"] == 0:
ids_to_documents[document.id] = document
@ -155,9 +158,7 @@ def seed_initial_documents(
logger.info("Embedding model has been updated, skipping")
return
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
document_index = get_default_document_index(search_settings, None)
# Create a connector so the user can delete it if they want
# or reindex it with a new search model if they want
@ -240,4 +241,12 @@ def seed_initial_documents(
db_session=db_session,
)
# Since we bypass the indexing flow, we need to manually update the chunk count
for doc in docs:
db_session.execute(
update(DbDocument)
.where(DbDocument.id == doc.id)
.values(chunk_count=doc.chunk_count)
)
kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True)

View File

@ -42,7 +42,7 @@ from onyx.db.index_attempt import get_latest_index_attempt_for_cc_pair_id
from onyx.db.index_attempt import get_paginated_index_attempts_for_cc_pair_id
from onyx.db.models import SearchSettings
from onyx.db.models import User
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_active_search_settings_list
from onyx.db.search_settings import get_current_search_settings
from onyx.redis.redis_connector import RedisConnector
from onyx.redis.redis_pool import get_redis_client
@ -192,7 +192,7 @@ def update_cc_pair_status(
if status_update_request.status == ConnectorCredentialPairStatus.PAUSED:
redis_connector.stop.set_fence(True)
search_settings_list: list[SearchSettings] = get_active_search_settings(
search_settings_list: list[SearchSettings] = get_active_search_settings_list(
db_session
)

View File

@ -32,10 +32,7 @@ def get_document_info(
db_session: Session = Depends(get_session),
) -> DocumentInfo:
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
document_index = get_default_document_index(search_settings, None)
user_acl_filters = build_access_filters_for_user(user, db_session)
inference_chunks = document_index.id_based_retrieval(
@ -79,10 +76,7 @@ def get_chunk_info(
db_session: Session = Depends(get_session),
) -> ChunkInfo:
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
document_index = get_default_document_index(search_settings, None)
user_acl_filters = build_access_filters_for_user(user, db_session)
chunk_request = VespaChunkRequest(

View File

@ -22,6 +22,7 @@ from onyx.db.search_settings import get_embedding_provider_from_provider_type
from onyx.db.search_settings import get_secondary_search_settings
from onyx.db.search_settings import update_current_search_settings
from onyx.db.search_settings import update_search_settings_status
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.factory import get_default_document_index
from onyx.file_processing.unstructured import delete_unstructured_api_key
from onyx.file_processing.unstructured import get_unstructured_api_key
@ -97,10 +98,9 @@ def set_new_search_settings(
)
# Ensure Vespa has the new index immediately
document_index = get_default_document_index(
primary_index_name=search_settings.index_name,
secondary_index_name=new_search_settings.index_name,
)
get_multipass_config(search_settings)
get_multipass_config(new_search_settings)
document_index = get_default_document_index(search_settings, new_search_settings)
document_index.ensure_indices_exist(
index_embedding_dim=search_settings.model_dim,

View File

@ -14,9 +14,9 @@ from onyx.db.document import get_ingestion_documents
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.models import User
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.factory import get_default_document_index
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
@ -89,9 +89,10 @@ def upsert_ingestion_doc(
)
# Need to index for both the primary and secondary index if possible
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
active_search_settings = get_active_search_settings(db_session)
curr_doc_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=None
active_search_settings.primary,
None,
)
search_settings = get_current_search_settings(db_session)
@ -117,11 +118,7 @@ def upsert_ingestion_doc(
)
# If there's a secondary index being built, index the doc but don't use it for return here
if sec_ind_name:
sec_doc_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=None
)
if active_search_settings.secondary:
sec_search_settings = get_secondary_search_settings(db_session)
if sec_search_settings is None:
@ -134,6 +131,10 @@ def upsert_ingestion_doc(
search_settings=sec_search_settings
)
sec_doc_index = get_default_document_index(
active_search_settings.secondary, None
)
sec_ind_pipeline = build_indexing_pipeline(
embedder=new_index_embedding_model,
document_index=sec_doc_index,

View File

@ -64,9 +64,8 @@ def admin_search(
tenant_id=tenant_id,
)
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
document_index = get_default_document_index(search_settings, None)
if not isinstance(document_index, VespaIndex):
raise HTTPException(
status_code=400,

View File

@ -25,6 +25,7 @@ from onyx.db.llm import fetch_default_provider
from onyx.db.llm import update_default_provider
from onyx.db.llm import upsert_llm_provider
from onyx.db.persona import delete_old_default_personas
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.db.search_settings import update_current_search_settings
@ -70,8 +71,19 @@ def setup_onyx(
The Tenant Service calls the tenants/create endpoint which runs this.
"""
check_index_swap(db_session=db_session)
search_settings = get_current_search_settings(db_session)
secondary_search_settings = get_secondary_search_settings(db_session)
active_search_settings = get_active_search_settings(db_session)
search_settings = active_search_settings.primary
secondary_search_settings = active_search_settings.secondary
# search_settings = get_current_search_settings(db_session)
# multipass_config_1 = get_multipass_config(search_settings)
# secondary_large_chunks_enabled: bool | None = None
# secondary_search_settings = get_secondary_search_settings(db_session)
# if secondary_search_settings:
# multipass_config_2 = get_multipass_config(secondary_search_settings)
# secondary_large_chunks_enabled = multipass_config_2.enable_large_chunks
# Break bad state for thrashing indexes
if secondary_search_settings and DISABLE_INDEX_UPDATE_ON_SWAP:
@ -122,10 +134,8 @@ def setup_onyx(
# takes a bit of time to start up
logger.notice("Verifying Document Index(s) is/are available.")
document_index = get_default_document_index(
primary_index_name=search_settings.index_name,
secondary_index_name=secondary_search_settings.index_name
if secondary_search_settings
else None,
search_settings,
secondary_search_settings,
)
success = setup_vespa(

View File

@ -7,6 +7,7 @@ from sqlalchemy.orm import Session
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.search_settings import get_active_search_settings
# Modify sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
@ -38,7 +39,6 @@ from onyx.db.connector_credential_pair import (
from onyx.db.engine import get_session_context_manager
from onyx.document_index.factory import get_default_document_index
from onyx.file_store.file_store import get_default_file_store
from onyx.document_index.document_index_utils import get_both_index_names
# pylint: enable=E402
# flake8: noqa: E402
@ -191,9 +191,10 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None:
)
try:
logger.notice("Deleting information from Vespa and Postgres")
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
active_search_settings = get_active_search_settings(db_session)
document_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
active_search_settings.primary,
active_search_settings.secondary,
)
files_deleted_count = _unsafe_deletion(

View File

@ -5,6 +5,8 @@ import sys
from sqlalchemy import text
from sqlalchemy.orm import Session
from onyx.document_index.document_index_utils import get_multipass_config
# makes it so `PYTHONPATH=.` is not required when running this script
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
@ -54,8 +56,14 @@ def main() -> None:
# Setup Vespa index
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
index_name = search_settings.index_name
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
vespa_index = VespaIndex(
index_name=index_name,
secondary_index_name=None,
large_chunks_enabled=multipass_config.enable_large_chunks,
secondary_large_chunks_enabled=None,
)
# Delete chunks from Vespa first
print("Deleting orphaned document chunks from Vespa")

View File

@ -16,6 +16,7 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex
from onyx.indexing.indexing_pipeline import IndexBatchParams
from onyx.indexing.models import ChunkEmbedding
@ -133,10 +134,16 @@ def seed_dummy_docs(
) -> None:
with get_session_context_manager() as db_session:
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
index_name = search_settings.index_name
embedding_dim = search_settings.model_dim
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
vespa_index = VespaIndex(
index_name=index_name,
secondary_index_name=None,
large_chunks_enabled=multipass_config.enable_large_chunks,
secondary_large_chunks_enabled=None,
)
print(index_name)
all_chunks = []

View File

@ -9,6 +9,7 @@ from onyx.configs.model_configs import DOC_EMBEDDING_DIM
from onyx.context.search.models import IndexFilters
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
@ -62,9 +63,15 @@ def test_hybrid_retrieval_times(
) -> None:
with get_session_context_manager() as db_session:
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
index_name = search_settings.index_name
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
vespa_index = VespaIndex(
index_name=index_name,
secondary_index_name=None,
large_chunks_enabled=multipass_config.enable_large_chunks,
secondary_large_chunks_enabled=None,
)
# Generate random queries
queries = [f"Random Query {i}" for i in range(number_of_queries)]

View File

@ -18,6 +18,7 @@ from onyx.db.engine import get_session_with_tenant
from onyx.db.engine import SYNC_DB_API
from onyx.db.search_settings import get_current_search_settings
from onyx.db.swap_index import check_index_swap
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa.index import VespaIndex
from onyx.indexing.models import IndexingSetting
@ -173,10 +174,16 @@ def reset_vespa() -> None:
check_index_swap(db_session)
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
index_name = search_settings.index_name
success = setup_vespa(
document_index=VespaIndex(index_name=index_name, secondary_index_name=None),
document_index=VespaIndex(
index_name=index_name,
secondary_index_name=None,
large_chunks_enabled=multipass_config.enable_large_chunks,
secondary_large_chunks_enabled=None,
),
index_setting=IndexingSetting.from_db_model(search_settings),
secondary_index_setting=None,
)
@ -250,10 +257,16 @@ def reset_vespa_multitenant() -> None:
check_index_swap(db_session)
search_settings = get_current_search_settings(db_session)
multipass_config = get_multipass_config(search_settings)
index_name = search_settings.index_name
success = setup_vespa(
document_index=VespaIndex(index_name=index_name, secondary_index_name=None),
document_index=VespaIndex(
index_name=index_name,
secondary_index_name=None,
large_chunks_enabled=multipass_config.enable_large_chunks,
secondary_large_chunks_enabled=None,
),
index_setting=IndexingSetting.from_db_model(search_settings),
secondary_index_setting=None,
)

View File

@ -6,7 +6,7 @@ import pytest
from sqlalchemy.orm import Session
from onyx.db.engine import get_sqlalchemy_engine
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.document_index_utils import get_both_index_properties
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
@ -19,7 +19,7 @@ def test_vespa_update() -> None:
doc_id = "test-vespa-update"
with Session(get_sqlalchemy_engine()) as db_session:
primary_index_name, _ = get_both_index_names(db_session)
primary_index_name, _, _, _ = get_both_index_properties(db_session)
endpoint = (
f"{DOCUMENT_ID_ENDPOINT.format(index_name=primary_index_name)}/{doc_id}"
)