diff --git a/backend/alembic/versions/dbaa756c2ccf_embedding_models.py b/backend/alembic/versions/dbaa756c2ccf_embedding_models.py index df13ff7962..91eef429a1 100644 --- a/backend/alembic/versions/dbaa756c2ccf_embedding_models.py +++ b/backend/alembic/versions/dbaa756c2ccf_embedding_models.py @@ -10,12 +10,19 @@ from alembic import op import sqlalchemy as sa from sqlalchemy import table, column, String, Integer, Boolean -from onyx.db.search_settings import ( - get_new_default_embedding_model, - get_old_default_embedding_model, - user_has_overridden_embedding_model, -) +from onyx.configs.model_configs import ASYM_PASSAGE_PREFIX +from onyx.configs.model_configs import ASYM_QUERY_PREFIX +from onyx.configs.model_configs import DOC_EMBEDDING_DIM +from onyx.configs.model_configs import DOCUMENT_ENCODER_MODEL +from onyx.configs.model_configs import NORMALIZE_EMBEDDINGS +from onyx.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL +from onyx.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM +from onyx.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS +from onyx.db.enums import EmbeddingPrecision from onyx.db.models import IndexModelStatus +from onyx.db.search_settings import user_has_overridden_embedding_model +from onyx.indexing.models import IndexingSetting +from onyx.natural_language_processing.search_nlp_models import clean_model_name # revision identifiers, used by Alembic. revision = "dbaa756c2ccf" @@ -24,6 +31,47 @@ branch_labels: None = None depends_on: None = None +def _get_old_default_embedding_model() -> IndexingSetting: + is_overridden = user_has_overridden_embedding_model() + return IndexingSetting( + model_name=( + DOCUMENT_ENCODER_MODEL + if is_overridden + else OLD_DEFAULT_DOCUMENT_ENCODER_MODEL + ), + model_dim=( + DOC_EMBEDDING_DIM if is_overridden else OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM + ), + embedding_precision=(EmbeddingPrecision.FLOAT), + normalize=( + NORMALIZE_EMBEDDINGS + if is_overridden + else OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS + ), + query_prefix=(ASYM_QUERY_PREFIX if is_overridden else ""), + passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""), + index_name="danswer_chunk", + multipass_indexing=False, + enable_contextual_rag=False, + api_url=None, + ) + + +def _get_new_default_embedding_model() -> IndexingSetting: + return IndexingSetting( + model_name=DOCUMENT_ENCODER_MODEL, + model_dim=DOC_EMBEDDING_DIM, + embedding_precision=(EmbeddingPrecision.BFLOAT16), + normalize=NORMALIZE_EMBEDDINGS, + query_prefix=ASYM_QUERY_PREFIX, + passage_prefix=ASYM_PASSAGE_PREFIX, + index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}", + multipass_indexing=False, + enable_contextual_rag=False, + api_url=None, + ) + + def upgrade() -> None: op.create_table( "embedding_model", @@ -61,7 +109,7 @@ def upgrade() -> None: # the user selected via env variables before this change. This is needed since # all index_attempts must be associated with an embedding model, so without this # we will run into violations of non-null contraints - old_embedding_model = get_old_default_embedding_model() + old_embedding_model = _get_old_default_embedding_model() op.bulk_insert( EmbeddingModel, [ @@ -79,7 +127,7 @@ def upgrade() -> None: # if the user has not overridden the default embedding model via env variables, # insert the new default model into the database to auto-upgrade them if not user_has_overridden_embedding_model(): - new_embedding_model = get_new_default_embedding_model() + new_embedding_model = _get_new_default_embedding_model() op.bulk_insert( EmbeddingModel, [ diff --git a/backend/onyx/configs/embedding_configs.py b/backend/onyx/configs/embedding_configs.py new file mode 100644 index 0000000000..065a82c481 --- /dev/null +++ b/backend/onyx/configs/embedding_configs.py @@ -0,0 +1,156 @@ +from pydantic import BaseModel + +from onyx.db.enums import EmbeddingPrecision + + +class _BaseEmbeddingModel(BaseModel): + """Private model for defining base embedding model configurations.""" + + name: str + dim: int + index_name: str + + +class SupportedEmbeddingModel(BaseModel): + name: str + dim: int + index_name: str + embedding_precision: EmbeddingPrecision + + +# Base embedding model configurations (without precision) +_BASE_EMBEDDING_MODELS = [ + # Cloud-based models + _BaseEmbeddingModel( + name="cohere/embed-english-v3.0", + dim=1024, + index_name="danswer_chunk_cohere_embed_english_v3_0", + ), + _BaseEmbeddingModel( + name="cohere/embed-english-v3.0", + dim=1024, + index_name="danswer_chunk_embed_english_v3_0", + ), + _BaseEmbeddingModel( + name="cohere/embed-english-light-v3.0", + dim=384, + index_name="danswer_chunk_cohere_embed_english_light_v3_0", + ), + _BaseEmbeddingModel( + name="cohere/embed-english-light-v3.0", + dim=384, + index_name="danswer_chunk_embed_english_light_v3_0", + ), + _BaseEmbeddingModel( + name="openai/text-embedding-3-large", + dim=3072, + index_name="danswer_chunk_openai_text_embedding_3_large", + ), + _BaseEmbeddingModel( + name="openai/text-embedding-3-large", + dim=3072, + index_name="danswer_chunk_text_embedding_3_large", + ), + _BaseEmbeddingModel( + name="openai/text-embedding-3-small", + dim=1536, + index_name="danswer_chunk_openai_text_embedding_3_small", + ), + _BaseEmbeddingModel( + name="openai/text-embedding-3-small", + dim=1536, + index_name="danswer_chunk_text_embedding_3_small", + ), + _BaseEmbeddingModel( + name="google/text-embedding-005", + dim=768, + index_name="danswer_chunk_google_text_embedding_005", + ), + _BaseEmbeddingModel( + name="google/textembedding-gecko@003", + dim=768, + index_name="danswer_chunk_google_textembedding_gecko_003", + ), + _BaseEmbeddingModel( + name="google/textembedding-gecko@003", + dim=768, + index_name="danswer_chunk_textembedding_gecko_003", + ), + _BaseEmbeddingModel( + name="voyage/voyage-large-2-instruct", + dim=1024, + index_name="danswer_chunk_voyage_large_2_instruct", + ), + _BaseEmbeddingModel( + name="voyage/voyage-large-2-instruct", + dim=1024, + index_name="danswer_chunk_large_2_instruct", + ), + _BaseEmbeddingModel( + name="voyage/voyage-light-2-instruct", + dim=384, + index_name="danswer_chunk_voyage_light_2_instruct", + ), + _BaseEmbeddingModel( + name="voyage/voyage-light-2-instruct", + dim=384, + index_name="danswer_chunk_light_2_instruct", + ), + # Self-hosted models + _BaseEmbeddingModel( + name="nomic-ai/nomic-embed-text-v1", + dim=768, + index_name="danswer_chunk_nomic_ai_nomic_embed_text_v1", + ), + _BaseEmbeddingModel( + name="nomic-ai/nomic-embed-text-v1", + dim=768, + index_name="danswer_chunk_nomic_embed_text_v1", + ), + _BaseEmbeddingModel( + name="intfloat/e5-base-v2", + dim=768, + index_name="danswer_chunk_intfloat_e5_base_v2", + ), + _BaseEmbeddingModel( + name="intfloat/e5-small-v2", + dim=384, + index_name="danswer_chunk_intfloat_e5_small_v2", + ), + _BaseEmbeddingModel( + name="intfloat/multilingual-e5-base", + dim=768, + index_name="danswer_chunk_intfloat_multilingual_e5_base", + ), + _BaseEmbeddingModel( + name="intfloat/multilingual-e5-small", + dim=384, + index_name="danswer_chunk_intfloat_multilingual_e5_small", + ), +] + +# Automatically generate both FLOAT and BFLOAT16 versions of all models +SUPPORTED_EMBEDDING_MODELS = [ + # BFLOAT16 precision versions + *[ + SupportedEmbeddingModel( + name=model.name, + dim=model.dim, + index_name=f"{model.index_name}_bfloat16", + embedding_precision=EmbeddingPrecision.BFLOAT16, + ) + for model in _BASE_EMBEDDING_MODELS + ], + # FLOAT precision versions + # NOTE: need to keep this one for backwards compatibility. We now default to + # BFLOAT16. + *[ + SupportedEmbeddingModel( + name=model.name, + dim=model.dim, + index_name=model.index_name, + embedding_precision=EmbeddingPrecision.FLOAT, + ) + for model in _BASE_EMBEDDING_MODELS + ], +] diff --git a/backend/onyx/db/search_settings.py b/backend/onyx/db/search_settings.py index 1d21d0d0e6..8ba092f532 100644 --- a/backend/onyx/db/search_settings.py +++ b/backend/onyx/db/search_settings.py @@ -3,25 +3,15 @@ from sqlalchemy import delete from sqlalchemy import select from sqlalchemy.orm import Session -from onyx.configs.model_configs import ASYM_PASSAGE_PREFIX -from onyx.configs.model_configs import ASYM_QUERY_PREFIX from onyx.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL -from onyx.configs.model_configs import DOC_EMBEDDING_DIM from onyx.configs.model_configs import DOCUMENT_ENCODER_MODEL -from onyx.configs.model_configs import NORMALIZE_EMBEDDINGS -from onyx.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL -from onyx.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM -from onyx.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS from onyx.context.search.models import SavedSearchSettings from onyx.db.engine import get_session_with_current_tenant -from onyx.db.enums import EmbeddingPrecision from onyx.db.llm import fetch_embedding_provider from onyx.db.models import CloudEmbeddingProvider from onyx.db.models import IndexAttempt from onyx.db.models import IndexModelStatus from onyx.db.models import SearchSettings -from onyx.indexing.models import IndexingSetting -from onyx.natural_language_processing.search_nlp_models import clean_model_name from onyx.natural_language_processing.search_nlp_models import warm_up_cross_encoder from onyx.server.manage.embedding.models import ( CloudEmbeddingProvider as ServerCloudEmbeddingProvider, @@ -264,79 +254,3 @@ def update_search_settings_status( def user_has_overridden_embedding_model() -> bool: return DOCUMENT_ENCODER_MODEL != DEFAULT_DOCUMENT_ENCODER_MODEL - - -def get_old_default_search_settings() -> SearchSettings: - is_overridden = user_has_overridden_embedding_model() - return SearchSettings( - model_name=( - DOCUMENT_ENCODER_MODEL - if is_overridden - else OLD_DEFAULT_DOCUMENT_ENCODER_MODEL - ), - model_dim=( - DOC_EMBEDDING_DIM if is_overridden else OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM - ), - normalize=( - NORMALIZE_EMBEDDINGS - if is_overridden - else OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS - ), - query_prefix=(ASYM_QUERY_PREFIX if is_overridden else ""), - passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""), - status=IndexModelStatus.PRESENT, - index_name="danswer_chunk", - ) - - -def get_new_default_search_settings(is_present: bool) -> SearchSettings: - return SearchSettings( - model_name=DOCUMENT_ENCODER_MODEL, - model_dim=DOC_EMBEDDING_DIM, - normalize=NORMALIZE_EMBEDDINGS, - query_prefix=ASYM_QUERY_PREFIX, - passage_prefix=ASYM_PASSAGE_PREFIX, - status=IndexModelStatus.PRESENT if is_present else IndexModelStatus.FUTURE, - index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}", - ) - - -def get_old_default_embedding_model() -> IndexingSetting: - is_overridden = user_has_overridden_embedding_model() - return IndexingSetting( - model_name=( - DOCUMENT_ENCODER_MODEL - if is_overridden - else OLD_DEFAULT_DOCUMENT_ENCODER_MODEL - ), - model_dim=( - DOC_EMBEDDING_DIM if is_overridden else OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM - ), - embedding_precision=(EmbeddingPrecision.FLOAT), - normalize=( - NORMALIZE_EMBEDDINGS - if is_overridden - else OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS - ), - query_prefix=(ASYM_QUERY_PREFIX if is_overridden else ""), - passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""), - index_name="danswer_chunk", - multipass_indexing=False, - enable_contextual_rag=False, - api_url=None, - ) - - -def get_new_default_embedding_model() -> IndexingSetting: - return IndexingSetting( - model_name=DOCUMENT_ENCODER_MODEL, - model_dim=DOC_EMBEDDING_DIM, - embedding_precision=(EmbeddingPrecision.FLOAT), - normalize=NORMALIZE_EMBEDDINGS, - query_prefix=ASYM_QUERY_PREFIX, - passage_prefix=ASYM_PASSAGE_PREFIX, - index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}", - multipass_indexing=False, - enable_contextual_rag=False, - api_url=None, - ) diff --git a/backend/onyx/setup.py b/backend/onyx/setup.py index 570fbf1112..0a53f6885a 100644 --- a/backend/onyx/setup.py +++ b/backend/onyx/setup.py @@ -7,6 +7,8 @@ from onyx.configs.app_configs import MANAGED_VESPA from onyx.configs.app_configs import VESPA_NUM_ATTEMPTS_ON_STARTUP from onyx.configs.constants import KV_REINDEX_KEY from onyx.configs.constants import KV_SEARCH_SETTINGS +from onyx.configs.embedding_configs import SUPPORTED_EMBEDDING_MODELS +from onyx.configs.embedding_configs import SupportedEmbeddingModel from onyx.configs.model_configs import FAST_GEN_AI_MODEL_VERSION from onyx.configs.model_configs import GEN_AI_API_KEY from onyx.configs.model_configs import GEN_AI_MODEL_VERSION @@ -59,8 +61,6 @@ from shared_configs.configs import ALT_INDEX_SUFFIX from shared_configs.configs import MODEL_SERVER_HOST from shared_configs.configs import MODEL_SERVER_PORT from shared_configs.configs import MULTI_TENANT -from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS -from shared_configs.model_server_models import SupportedEmbeddingModel logger = setup_logger() diff --git a/backend/scripts/debugging/onyx_vespa_schemas.py b/backend/scripts/debugging/onyx_vespa_schemas.py index 5acad609f9..b556c48f1b 100644 --- a/backend/scripts/debugging/onyx_vespa_schemas.py +++ b/backend/scripts/debugging/onyx_vespa_schemas.py @@ -4,21 +4,26 @@ import argparse import jinja2 +from onyx.configs.embedding_configs import SUPPORTED_EMBEDDING_MODELS from onyx.db.enums import EmbeddingPrecision from onyx.utils.logger import setup_logger -from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS logger = setup_logger() -def write_schema(index_name: str, dim: int, template: jinja2.Template) -> None: +def write_schema( + index_name: str, + dim: int, + embedding_precision: EmbeddingPrecision, + template: jinja2.Template, +) -> None: index_filename = index_name + ".sd" schema = template.render( multi_tenant=True, schema_name=index_name, dim=dim, - embedding_precision=EmbeddingPrecision.FLOAT.value, + embedding_precision=embedding_precision.value, ) with open(index_filename, "w", encoding="utf-8") as f: @@ -41,8 +46,13 @@ def main() -> None: num_indexes = 0 for model in SUPPORTED_EMBEDDING_MODELS: - write_schema(model.index_name, model.dim, template) - write_schema(model.index_name + "__danswer_alt_index", model.dim, template) + write_schema(model.index_name, model.dim, model.embedding_precision, template) + write_schema( + model.index_name + "__danswer_alt_index", + model.dim, + model.embedding_precision, + template, + ) num_indexes += 2 logger.info(f"Wrote {num_indexes} indexes.") diff --git a/backend/shared_configs/configs.py b/backend/shared_configs/configs.py index 90661d8231..12d12a4969 100644 --- a/backend/shared_configs/configs.py +++ b/backend/shared_configs/configs.py @@ -3,8 +3,6 @@ from typing import Any from typing import List from urllib.parse import urlparse -from shared_configs.model_server_models import SupportedEmbeddingModel - # Used for logging SLACK_CHANNEL_ID = "channel_id" @@ -170,120 +168,6 @@ IGNORED_SYNCING_TENANT_LIST = ( else None ) -SUPPORTED_EMBEDDING_MODELS = [ - # Cloud-based models - SupportedEmbeddingModel( - name="cohere/embed-english-v3.0", - dim=1024, - index_name="danswer_chunk_cohere_embed_english_v3_0", - ), - SupportedEmbeddingModel( - name="cohere/embed-english-v3.0", - dim=1024, - index_name="danswer_chunk_embed_english_v3_0", - ), - SupportedEmbeddingModel( - name="cohere/embed-english-light-v3.0", - dim=384, - index_name="danswer_chunk_cohere_embed_english_light_v3_0", - ), - SupportedEmbeddingModel( - name="cohere/embed-english-light-v3.0", - dim=384, - index_name="danswer_chunk_embed_english_light_v3_0", - ), - SupportedEmbeddingModel( - name="openai/text-embedding-3-large", - dim=3072, - index_name="danswer_chunk_openai_text_embedding_3_large", - ), - SupportedEmbeddingModel( - name="openai/text-embedding-3-large", - dim=3072, - index_name="danswer_chunk_text_embedding_3_large", - ), - SupportedEmbeddingModel( - name="openai/text-embedding-3-small", - dim=1536, - index_name="danswer_chunk_openai_text_embedding_3_small", - ), - SupportedEmbeddingModel( - name="openai/text-embedding-3-small", - dim=1536, - index_name="danswer_chunk_text_embedding_3_small", - ), - SupportedEmbeddingModel( - name="google/text-embedding-005", - dim=768, - index_name="danswer_chunk_google_text_embedding_004", - ), - SupportedEmbeddingModel( - name="google/text-embedding-005", - dim=768, - index_name="danswer_chunk_text_embedding_004", - ), - SupportedEmbeddingModel( - name="google/textembedding-gecko@003", - dim=768, - index_name="danswer_chunk_google_textembedding_gecko_003", - ), - SupportedEmbeddingModel( - name="google/textembedding-gecko@003", - dim=768, - index_name="danswer_chunk_textembedding_gecko_003", - ), - SupportedEmbeddingModel( - name="voyage/voyage-large-2-instruct", - dim=1024, - index_name="danswer_chunk_voyage_large_2_instruct", - ), - SupportedEmbeddingModel( - name="voyage/voyage-large-2-instruct", - dim=1024, - index_name="danswer_chunk_large_2_instruct", - ), - SupportedEmbeddingModel( - name="voyage/voyage-light-2-instruct", - dim=384, - index_name="danswer_chunk_voyage_light_2_instruct", - ), - SupportedEmbeddingModel( - name="voyage/voyage-light-2-instruct", - dim=384, - index_name="danswer_chunk_light_2_instruct", - ), - # Self-hosted models - SupportedEmbeddingModel( - name="nomic-ai/nomic-embed-text-v1", - dim=768, - index_name="danswer_chunk_nomic_ai_nomic_embed_text_v1", - ), - SupportedEmbeddingModel( - name="nomic-ai/nomic-embed-text-v1", - dim=768, - index_name="danswer_chunk_nomic_embed_text_v1", - ), - SupportedEmbeddingModel( - name="intfloat/e5-base-v2", - dim=768, - index_name="danswer_chunk_intfloat_e5_base_v2", - ), - SupportedEmbeddingModel( - name="intfloat/e5-small-v2", - dim=384, - index_name="danswer_chunk_intfloat_e5_small_v2", - ), - SupportedEmbeddingModel( - name="intfloat/multilingual-e5-base", - dim=768, - index_name="danswer_chunk_intfloat_multilingual_e5_base", - ), - SupportedEmbeddingModel( - name="intfloat/multilingual-e5-small", - dim=384, - index_name="danswer_chunk_intfloat_multilingual_e5_small", - ), -] # Maximum (least severe) downgrade factor for chunks above the cutoff INDEXING_INFORMATION_CONTENT_CLASSIFICATION_MAX = float( os.environ.get("INDEXING_INFORMATION_CONTENT_CLASSIFICATION_MAX") or 1.0 diff --git a/backend/shared_configs/model_server_models.py b/backend/shared_configs/model_server_models.py index 4c9c1be1e3..202ce24e55 100644 --- a/backend/shared_configs/model_server_models.py +++ b/backend/shared_configs/model_server_models.py @@ -78,12 +78,6 @@ class InformationContentClassificationRequests(BaseModel): queries: list[str] -class SupportedEmbeddingModel(BaseModel): - name: str - dim: int - index_name: str - - class ContentClassificationPrediction(BaseModel): predicted_label: int content_boost_factor: float diff --git a/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx b/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx index 706faff7c4..58f508b6b2 100644 --- a/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx +++ b/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx @@ -71,7 +71,7 @@ export default function EmbeddingForm() { disable_rerank_for_streaming: false, api_url: null, num_rerank: 0, - embedding_precision: EmbeddingPrecision.FLOAT, + embedding_precision: EmbeddingPrecision.BFLOAT16, reduced_dimension: null, });