Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-05-05 17:30:26 +02:00
* add timings for syncing
* add more logging
* more debugging
* refactor multipass/db check out of VespaIndex
* circular imports?
* more debugging
* add logs
* various improvements
* additional logs to narrow down issue
* use global httpx pool for the main vespa flows in celery. Use in more places eventually.
* cleanup debug logging, etc
* remove debug logging
* this should use the secondary index
* mypy
* missed some logging
* review fixes
* refactor get_default_document_index to use search settings
* more missed logging
* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
198 lines
6.9 KiB
Python
import math
import uuid
from uuid import UUID

from sqlalchemy.orm import Session

from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
from onyx.db.models import SearchSettings
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import MultipassConfig
from shared_configs.configs import MULTI_TENANT


DEFAULT_BATCH_SIZE = 30
DEFAULT_INDEX_NAME = "danswer_chunk"


def should_use_multipass(search_settings: SearchSettings | None) -> bool:
    """
    Determines whether multipass should be used based on the search settings
    or the default config if settings are unavailable.
    """
    if search_settings is not None:
        return search_settings.multipass_indexing
    return ENABLE_MULTIPASS_INDEXING


def get_multipass_config(search_settings: SearchSettings) -> MultipassConfig:
    """
    Determines whether to enable multipass and large chunks by examining
    the current search settings and the embedder configuration.
    """
    if not search_settings:
        return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)

    multipass = should_use_multipass(search_settings)
    enable_large_chunks = SearchSettings.can_use_large_chunks(
        multipass, search_settings.model_name, search_settings.provider_type
    )
    return MultipassConfig(
        multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
    )
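
# Illustrative usage sketch, assuming the caller already holds a SQLAlchemy
# Session (db_session); this mirrors how get_both_index_properties below
# resolves the active settings:
#
#     search_settings = get_current_search_settings(db_session)
#     config = get_multipass_config(search_settings)
#     if config.enable_large_chunks:
#         ...  # index large chunks in addition to the regular chunks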


def get_both_index_properties(
    db_session: Session,
) -> tuple[str, str | None, bool, bool | None]:
    search_settings = get_current_search_settings(db_session)
    config_1 = get_multipass_config(search_settings)

    search_settings_new = get_secondary_search_settings(db_session)
    if not search_settings_new:
        return search_settings.index_name, None, config_1.enable_large_chunks, None

    # the secondary config must be derived from the secondary settings
    config_2 = get_multipass_config(search_settings_new)
    return (
        search_settings.index_name,
        search_settings_new.index_name,
        config_1.enable_large_chunks,
        config_2.enable_large_chunks,
    )
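
# Illustrative unpacking of the tuple returned above; the secondary values are
# None while there are no secondary search settings:
#
#     (
#         primary_index_name,
#         secondary_index_name,
#         primary_large_chunks_enabled,
#         secondary_large_chunks_enabled,
#     ) = get_both_index_properties(db_session)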


def translate_boost_count_to_multiplier(boost: int) -> float:
    """Maps boost integer values to a multiplier according to a sigmoid curve.
    Piecewise such that at many downvotes it is 0.5x the score and with many
    upvotes it is 2x the score. This should be in line with the Vespa calculation."""
    # 3 in the equation below stretches it out to hit asymptotes slower
    if boost < 0:
        # 0.5 + sigmoid -> range of 0.5 to 1
        return 0.5 + (1 / (1 + math.exp(-1 * boost / 3)))

    # 2 x sigmoid -> range of 1 to 2
    return 2 / (1 + math.exp(-1 * boost / 3))
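
# Rough reference points for the mapping above (values are approximate):
#   boost = -30  ->  ~0.50x   (heavy downvotes bottom out near half the score)
#   boost = -3   ->  ~0.77x
#   boost =  0   ->   1.00x   (both branches agree at the boundary)
#   boost =  3   ->  ~1.46x
#   boost = 30   ->  ~2.00x   (heavy upvotes top out near double the score)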


# Assembles a list of Vespa chunk IDs for a document
# given the required context. This can be used to directly query
# Vespa's Document API.
def get_document_chunk_ids(
    enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
    tenant_id: str | None,
    large_chunks_enabled: bool,
) -> list[UUID]:
    doc_chunk_ids = []

    for enriched_document_info in enriched_document_info_list:
        for chunk_index in range(
            enriched_document_info.chunk_start_index,
            enriched_document_info.chunk_end_index,
        ):
            if not enriched_document_info.old_version:
                doc_chunk_ids.append(
                    get_uuid_from_chunk_info(
                        document_id=enriched_document_info.doc_id,
                        chunk_id=chunk_index,
                        tenant_id=tenant_id,
                    )
                )
            else:
                doc_chunk_ids.append(
                    get_uuid_from_chunk_info_old(
                        document_id=enriched_document_info.doc_id,
                        chunk_id=chunk_index,
                    )
                )

            if large_chunks_enabled and chunk_index % 4 == 0:
                large_chunk_id = int(chunk_index / 4)
                large_chunk_reference_ids = [
                    large_chunk_id + i
                    for i in range(4)
                    if large_chunk_id + i < enriched_document_info.chunk_end_index
                ]
                if enriched_document_info.old_version:
                    doc_chunk_ids.append(
                        get_uuid_from_chunk_info_old(
                            document_id=enriched_document_info.doc_id,
                            chunk_id=large_chunk_id,
                            large_chunk_reference_ids=large_chunk_reference_ids,
                        )
                    )
                else:
                    doc_chunk_ids.append(
                        get_uuid_from_chunk_info(
                            document_id=enriched_document_info.doc_id,
                            chunk_id=large_chunk_id,
                            tenant_id=tenant_id,
                            large_chunk_id=large_chunk_id,
                        )
                    )

    return doc_chunk_ids
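
# Illustrative sketch of calling the helper above; the actual constructor of
# EnrichedDocumentIndexingInfo may require additional fields, so treat this as
# a hypothetical example rather than a verbatim recipe:
#
#     infos = [
#         EnrichedDocumentIndexingInfo(
#             doc_id="doc-123",
#             chunk_start_index=0,
#             chunk_end_index=8,
#             old_version=False,
#         )
#     ]
#     chunk_uuids = get_document_chunk_ids(
#         enriched_document_info_list=infos,
#         tenant_id=None,
#         large_chunks_enabled=True,
#     )
#     # chunk_uuids then holds the regular-chunk UUIDs plus one large-chunk
#     # UUID for every group of four regular chunks.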


def get_uuid_from_chunk_info(
    *,
    document_id: str,
    chunk_id: int,
    tenant_id: str | None,
    large_chunk_id: int | None = None,
) -> UUID:
    doc_str = document_id

    # Web parsing URL duplicate catching
    if doc_str and doc_str[-1] == "/":
        doc_str = doc_str[:-1]

    chunk_index = (
        "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
    )
    unique_identifier_string = "_".join([doc_str, chunk_index])
    if tenant_id and MULTI_TENANT:
        unique_identifier_string += "_" + tenant_id

    uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
    return uuid_value
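
# The UUID above is deterministic: it is a UUIDv5 over an identifier string of
# the form "<doc_id>_<chunk_id>" (or "<doc_id>_large_<large_chunk_id>"), with
# "_<tenant_id>" appended in multi-tenant deployments, so the same chunk
# always maps to the same ID.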


def get_uuid_from_chunk_info_old(
    *, document_id: str, chunk_id: int, large_chunk_reference_ids: list[int] = []
) -> UUID:
    doc_str = document_id

    # Web parsing URL duplicate catching
    if doc_str and doc_str[-1] == "/":
        doc_str = doc_str[:-1]
    unique_identifier_string = "_".join([doc_str, str(chunk_id), "0"])
    if large_chunk_reference_ids:
        unique_identifier_string += "_large" + "_".join(
            [
                str(referenced_chunk_id)
                for referenced_chunk_id in large_chunk_reference_ids
            ]
        )
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
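
# Legacy format: the identifier string is "<doc_id>_<chunk_id>_0", with a
# "_large<id>_<id>..." suffix when large-chunk reference IDs are provided; it
# also never includes a tenant component, unlike the current scheme above.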


def get_uuid_from_chunk(chunk: DocMetadataAwareIndexChunk) -> uuid.UUID:
    return get_uuid_from_chunk_info(
        document_id=chunk.source_document.id,
        chunk_id=chunk.chunk_id,
        tenant_id=chunk.tenant_id,
        large_chunk_id=chunk.large_chunk_id,
    )


def get_uuid_from_chunk_old(
    chunk: DocMetadataAwareIndexChunk, large_chunk_reference_ids: list[int] = []
) -> UUID:
    return get_uuid_from_chunk_info_old(
        document_id=chunk.source_document.id,
        chunk_id=chunk.chunk_id,
        large_chunk_reference_ids=large_chunk_reference_ids,
    )