add timings for syncing (#3798)
* add timings for syncing
* add more logging
* more debugging
* refactor multipass/db check out of VespaIndex
* circular imports?
* more debugging
* add logs
* various improvements
* additional logs to narrow down issue
* use global httpx pool for the main vespa flows in celery. Use in more places eventually.
* cleanup debug logging, etc
* remove debug logging
* this should use the secondary index
* mypy
* missed some logging
* review fixes
* refactor get_default_document_index to use search settings
* more missed logging
* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
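The commit title and the first few bullets describe adding timing instrumentation to the sync path; the timing code itself is not part of the hunks reproduced below. As a rough, hypothetical illustration of that kind of change (not the code from this commit; `run_with_timing` and the log format are invented for the example), a sync step can be wrapped with a monotonic clock and the elapsed time logged:

```python
import logging
import time

logger = logging.getLogger(__name__)


def run_with_timing(step_name, fn, *args, **kwargs):
    """Hypothetical helper: run one sync step and log how long it took."""
    start = time.monotonic()
    try:
        return fn(*args, **kwargs)
    finally:
        elapsed = time.monotonic() - start
        logger.info("sync step=%s elapsed=%.2fs", step_name, elapsed)
```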
@@ -7,6 +7,7 @@ from sqlalchemy.orm import Session
 
 from onyx.db.document import delete_documents_complete__no_commit
 from onyx.db.enums import ConnectorCredentialPairStatus
+from onyx.db.search_settings import get_active_search_settings
 
 # Modify sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -38,7 +39,6 @@ from onyx.db.connector_credential_pair import (
 from onyx.db.engine import get_session_context_manager
 from onyx.document_index.factory import get_default_document_index
 from onyx.file_store.file_store import get_default_file_store
-from onyx.document_index.document_index_utils import get_both_index_names
 
 # pylint: enable=E402
 # flake8: noqa: E402
@@ -191,9 +191,10 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None:
     )
     try:
         logger.notice("Deleting information from Vespa and Postgres")
-        curr_ind_name, sec_ind_name = get_both_index_names(db_session)
+        active_search_settings = get_active_search_settings(db_session)
         document_index = get_default_document_index(
-            primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
+            active_search_settings.primary,
+            active_search_settings.secondary,
         )
 
         files_deleted_count = _unsafe_deletion(
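The hunks above correspond to the "refactor get_default_document_index to use search settings" bullet: instead of resolving index names with `get_both_index_names`, callers fetch the active search settings and hand the primary and secondary settings objects to the factory. A minimal sketch of the new call pattern, assembled only from the calls visible in this diff (the session handling around it is assumed, since `_delete_connector` receives its session as a parameter):

```python
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_active_search_settings
from onyx.document_index.factory import get_default_document_index

with get_session_context_manager() as db_session:
    # Active settings carry both the primary and (optional) secondary index config.
    active_search_settings = get_active_search_settings(db_session)
    document_index = get_default_document_index(
        active_search_settings.primary,
        active_search_settings.secondary,
    )
```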
@@ -5,6 +5,8 @@ import sys
 from sqlalchemy import text
 from sqlalchemy.orm import Session
+
+from onyx.document_index.document_index_utils import get_multipass_config
 
 # makes it so `PYTHONPATH=.` is not required when running this script
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
@@ -54,8 +56,14 @@ def main() -> None:
 
         # Setup Vespa index
         search_settings = get_current_search_settings(db_session)
+        multipass_config = get_multipass_config(search_settings)
         index_name = search_settings.index_name
-        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
+        vespa_index = VespaIndex(
+            index_name=index_name,
+            secondary_index_name=None,
+            large_chunks_enabled=multipass_config.enable_large_chunks,
+            secondary_large_chunks_enabled=None,
+        )
 
         # Delete chunks from Vespa first
         print("Deleting orphaned document chunks from Vespa")
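The remaining hunks all make the same change in one-off scripts: `VespaIndex` is now constructed with explicit large-chunk flags derived from the multipass config, matching the "refactor multipass/db check out of VespaIndex" bullet. A consolidated sketch of the new construction, using only the calls shown in these hunks:

```python
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex

with get_session_context_manager() as db_session:
    search_settings = get_current_search_settings(db_session)
    # The multipass/large-chunk decision now happens outside the index class.
    multipass_config = get_multipass_config(search_settings)

    vespa_index = VespaIndex(
        index_name=search_settings.index_name,
        secondary_index_name=None,
        large_chunks_enabled=multipass_config.enable_large_chunks,
        secondary_large_chunks_enabled=None,
    )
```

These scripts have no secondary index, so `secondary_index_name` and `secondary_large_chunks_enabled` are both `None`; a caller with a secondary index would presumably pass that index's name and large-chunk flag instead.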
@@ -16,6 +16,7 @@ from onyx.configs.constants import DocumentSource
 from onyx.connectors.models import Document
 from onyx.db.engine import get_session_context_manager
 from onyx.db.search_settings import get_current_search_settings
+from onyx.document_index.document_index_utils import get_multipass_config
 from onyx.document_index.vespa.index import VespaIndex
 from onyx.indexing.indexing_pipeline import IndexBatchParams
 from onyx.indexing.models import ChunkEmbedding
@@ -133,10 +134,16 @@ def seed_dummy_docs(
 ) -> None:
     with get_session_context_manager() as db_session:
         search_settings = get_current_search_settings(db_session)
+        multipass_config = get_multipass_config(search_settings)
         index_name = search_settings.index_name
         embedding_dim = search_settings.model_dim
 
-        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
+        vespa_index = VespaIndex(
+            index_name=index_name,
+            secondary_index_name=None,
+            large_chunks_enabled=multipass_config.enable_large_chunks,
+            secondary_large_chunks_enabled=None,
+        )
         print(index_name)
 
         all_chunks = []
@@ -9,6 +9,7 @@ from onyx.configs.model_configs import DOC_EMBEDDING_DIM
 from onyx.context.search.models import IndexFilters
 from onyx.db.engine import get_session_context_manager
 from onyx.db.search_settings import get_current_search_settings
+from onyx.document_index.document_index_utils import get_multipass_config
 from onyx.document_index.vespa.index import VespaIndex
 from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
 from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
@@ -62,9 +63,15 @@ def test_hybrid_retrieval_times(
 ) -> None:
     with get_session_context_manager() as db_session:
         search_settings = get_current_search_settings(db_session)
+        multipass_config = get_multipass_config(search_settings)
         index_name = search_settings.index_name
 
-        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
+        vespa_index = VespaIndex(
+            index_name=index_name,
+            secondary_index_name=None,
+            large_chunks_enabled=multipass_config.enable_large_chunks,
+            secondary_large_chunks_enabled=None,
+        )
 
         # Generate random queries
         queries = [f"Random Query {i}" for i in range(number_of_queries)]