import os
import shutil
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from pathlib import Path

import sentry_sdk
import torch
import uvicorn
from fastapi import FastAPI
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from transformers import logging as transformer_logging # type:ignore

from model_server.custom_models import router as custom_models_router
from model_server.custom_models import warm_up_information_content_model
from model_server.custom_models import warm_up_intent_model
from model_server.encoders import router as encoders_router
from model_server.management_endpoints import router as management_router
from model_server.utils import get_gpu_type
from onyx import __version__
from onyx.utils.logger import setup_logger
from shared_configs.configs import INDEXING_ONLY
from shared_configs.configs import MIN_THREADS_ML_MODELS
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import SENTRY_DSN

# Disable fork-based parallelism in the tokenizers library (it can deadlock
# when the server process forks) and opt out of Hugging Face Hub telemetry.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# HF_CACHE_PATH is the standard Hugging Face cache; TEMP_HF_CACHE_PATH is a
# staging area (presumably populated ahead of time, e.g. during image build)
# whose contents are merged into the real cache on startup in lifespan() below.
HF_CACHE_PATH = Path(os.path.expanduser("~")) / ".cache/huggingface"
TEMP_HF_CACHE_PATH = Path(os.path.expanduser("~")) / ".cache/temp_huggingface"

# Only let the transformers library log errors; suppress its info/warning output.
transformer_logging.set_verbosity_error()

logger = setup_logger()


def _move_files_recursively(source: Path, dest: Path, overwrite: bool = False) -> None:
    """
    Moves files from the temp Hugging Face cache into the Hugging Face cache.

    We have to move each file individually because the directories might
    have the same name but not the same contents, and we don't want to remove
    files in the existing Hugging Face cache that don't exist in the temp
    Hugging Face cache.
    """
    for item in source.iterdir():
        target_path = dest / item.relative_to(source)
        if item.is_dir():
            _move_files_recursively(item, target_path, overwrite)
        else:
            target_path.parent.mkdir(parents=True, exist_ok=True)
            # On a filename collision, keep the existing cached file unless
            # overwrite was requested.
            if target_path.exists() and not overwrite:
                continue
            shutil.move(str(item), str(target_path))
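
# A minimal sketch of the merge semantics above (hypothetical paths, for
# illustration only): files unique to either tree survive the move, and on a
# filename collision the destination wins because overwrite defaults to False.
#
#   _move_files_recursively(
#       Path("/tmp/example_temp_cache"),  # hypothetical staging tree
#       Path("/tmp/example_real_cache"),  # hypothetical destination tree
#   )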


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
    # Detect GPU availability once and expose it on app state for handlers.
    gpu_type = get_gpu_type()
    logger.notice(f"Torch GPU Detection: gpu_type={gpu_type}")

    app.state.gpu_type = gpu_type

    if TEMP_HF_CACHE_PATH.is_dir():
        logger.notice("Moving contents of temp_huggingface to huggingface cache.")
        _move_files_recursively(TEMP_HF_CACHE_PATH, HF_CACHE_PATH)
        shutil.rmtree(TEMP_HF_CACHE_PATH, ignore_errors=True)
        logger.notice("Moved contents of temp_huggingface to huggingface cache.")

    # Respect the configured minimum thread count, but keep torch's default
    # if it is already higher.
    torch.set_num_threads(max(MIN_THREADS_ML_MODELS, torch.get_num_threads()))
    logger.notice(f"Torch Threads: {torch.get_num_threads()}")

    if not INDEXING_ONLY:
        logger.notice(
            "The intent model should run on the model server. The information content model should not run here."
        )
        warm_up_intent_model()
    else:
        logger.notice(
            "The information content model should run on the indexing model server. The intent model should not run here."
        )
        warm_up_information_content_model()

    yield
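
    # Code placed after the `yield` above would run on shutdown; this server
    # currently has no teardown work to do.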


def get_model_app() -> FastAPI:
    application = FastAPI(
        title="Onyx Model Server", version=__version__, lifespan=lifespan
    )
    if SENTRY_DSN:
        sentry_sdk.init(
            dsn=SENTRY_DSN,
            integrations=[StarletteIntegration(), FastApiIntegration()],
            # Sample 10% of transactions for performance tracing.
            traces_sample_rate=0.1,
        )
        logger.info("Sentry initialized")
    else:
        logger.debug("Sentry DSN not provided, skipping Sentry initialization")

    application.include_router(management_router)
    application.include_router(encoders_router)
    application.include_router(custom_models_router)

    return application


app = get_model_app()
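# Module-level ASGI app so external process managers (e.g. uvicorn, gunicorn)
# can import it directly rather than going through the __main__ block below.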


if __name__ == "__main__":
    logger.notice(
        f"Starting Onyx Model Server on http://{MODEL_SERVER_ALLOWED_HOST}:{MODEL_SERVER_PORT}/"
    )
    logger.notice(f"Model Server Version: {__version__}")
    uvicorn.run(app, host=MODEL_SERVER_ALLOWED_HOST, port=MODEL_SERVER_PORT)
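
# A hypothetical equivalent launch via the uvicorn CLI (assuming this module
# is importable as model_server.main):
#
#   uvicorn model_server.main:app --host 0.0.0.0 --port <MODEL_SERVER_PORT>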