mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-06 18:00:29 +02:00
61 lines
2.0 KiB
Python
61 lines
2.0 KiB
Python
import math
|
|
import uuid
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from onyx.context.search.models import InferenceChunk
|
|
from onyx.db.search_settings import get_current_search_settings
|
|
from onyx.db.search_settings import get_secondary_search_settings
|
|
from onyx.indexing.models import IndexChunk
|
|
|
|
|
|
# Default batch size. Not referenced in the visible part of this file;
# presumably consumed by document-index callers elsewhere (TODO confirm).
DEFAULT_BATCH_SIZE = 30
|
|
# Default index name. Not referenced in the visible part of this file;
# presumably a fallback when search settings do not supply one (TODO confirm).
DEFAULT_INDEX_NAME = "danswer_chunk"
|
|
|
|
|
|
def get_both_index_names(db_session: Session) -> tuple[str, str | None]:
    """Return the index names of the current and secondary search settings.

    The first element is the index name of the current search settings. The
    second element is the index name of the secondary search settings, or
    ``None`` when no secondary search settings are returned.
    """
    primary = get_current_search_settings(db_session)
    secondary = get_secondary_search_settings(db_session)

    # The secondary settings are optional; report None when they are absent.
    return primary.index_name, (secondary.index_name if secondary else None)
|
|
|
|
|
|
def translate_boost_count_to_multiplier(boost: int) -> float:
    """Map an integer boost count to a score multiplier along a sigmoid curve.

    The mapping is piecewise: with many downvotes (very negative ``boost``)
    the multiplier approaches 0.5x the score, at ``boost == 0`` it is exactly
    1x, and with many upvotes it approaches 2x the score. This should be in
    line with the Vespa calculation.

    Args:
        boost: Net boost count; negative for downvotes, positive for upvotes.

    Returns:
        A multiplier between 0.5 and 2 (inclusive in floating point).
    """
    # 3 in the equation below stretches it out to hit asymptotes slower
    if boost < 0:
        try:
            growth = math.exp(-1 * boost / 3)
        except OverflowError:
            # math.exp() overflows a float for very negative boosts (below
            # roughly -2129). The sigmoid term tends to 0 there, so return
            # the lower asymptote instead of crashing.
            return 0.5
        # 0.5 + sigmoid -> range of 0.5 to 1
        return 0.5 + (1 / (1 + growth))

    # 2 x sigmoid -> range of 1 to 2
    # (exp of a large negative number underflows to 0.0, so no guard needed)
    return 2 / (1 + math.exp(-1 * boost / 3))
|
|
|
|
|
|
def get_uuid_from_chunk(
    chunk: IndexChunk | InferenceChunk, mini_chunk_ind: int = 0
) -> uuid.UUID:
    """Derive a deterministic UUID (version 5) for a chunk.

    The UUID is computed from the chunk's document id, its ``chunk_id``,
    ``mini_chunk_ind`` and, when present, the ids in
    ``large_chunk_reference_ids``. The same inputs always yield the same UUID.
    """
    # InferenceChunk carries the document id directly; otherwise it is read
    # from the chunk's source document.
    if isinstance(chunk, InferenceChunk):
        document_key = chunk.document_id
    else:
        document_key = chunk.source_document.id

    # Web parsing URL duplicate catching: drop a single trailing slash
    if document_key and document_key.endswith("/"):
        document_key = document_key[:-1]

    identifier_parts = (document_key, str(chunk.chunk_id), str(mini_chunk_ind))
    identifier = "_".join(identifier_parts)

    referenced_ids = chunk.large_chunk_reference_ids
    if referenced_ids:
        # Note: no separator between "_large" and the first referenced id.
        identifier = identifier + "_large" + "_".join(map(str, referenced_ids))

    return uuid.uuid5(uuid.NAMESPACE_X500, identifier)
|