import os
import shutil
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from pathlib import Path

import sentry_sdk
import torch
import uvicorn
from fastapi import FastAPI
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from transformers import logging as transformer_logging # type:ignore

from model_server.custom_models import router as custom_models_router
from model_server.custom_models import warm_up_information_content_model
from model_server.custom_models import warm_up_intent_model
from model_server.encoders import router as encoders_router
from model_server.management_endpoints import router as management_router
from model_server.utils import get_gpu_type
from onyx import __version__
from onyx.utils.logger import setup_logger
from shared_configs.configs import INDEXING_ONLY
from shared_configs.configs import MIN_THREADS_ML_MODELS
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import SENTRY_DSN

# Disable fork-based parallelism in the tokenizers library (it can deadlock
# when the server process forks) and opt out of Hugging Face Hub telemetry.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# HF_CACHE_PATH is the standard Hugging Face cache; TEMP_HF_CACHE_PATH is a
# staging area (presumably populated ahead of time, e.g. during image build)
# whose contents are merged into the real cache on startup in lifespan() below.
HF_CACHE_PATH = Path(os.path.expanduser("~")) / ".cache/huggingface"
TEMP_HF_CACHE_PATH = Path(os.path.expanduser("~")) / ".cache/temp_huggingface"

# Only let the transformers library log errors; suppress its info/warning output.
transformer_logging.set_verbosity_error()

logger = setup_logger()


def _move_files_recursively(source: Path, dest: Path, overwrite: bool = False) -> None:
    """
    Moves files from the temp Hugging Face cache into the Hugging Face cache.

    We have to move each file individually because the directories might
    have the same name but not the same contents, and we don't want to remove
    files in the existing Hugging Face cache that don't exist in the temp
    Hugging Face cache.
    """
    for item in source.iterdir():
        target_path = dest / item.relative_to(source)
        if item.is_dir():
            _move_files_recursively(item, target_path, overwrite)
        else:
            target_path.parent.mkdir(parents=True, exist_ok=True)
            # On a filename collision, keep the existing cached file unless
            # overwrite was requested.
            if target_path.exists() and not overwrite:
                continue
            shutil.move(str(item), str(target_path))
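
# A minimal sketch of the merge semantics above (hypothetical paths, for
# illustration only): files unique to either tree survive the move, and on a
# filename collision the destination wins because overwrite defaults to False.
#
#   _move_files_recursively(
#       Path("/tmp/example_temp_cache"),  # hypothetical staging tree
#       Path("/tmp/example_real_cache"),  # hypothetical destination tree
#   )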


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
    # Detect GPU availability once and expose it on app state for handlers.
    gpu_type = get_gpu_type()
    logger.notice(f"Torch GPU Detection: gpu_type={gpu_type}")

    app.state.gpu_type = gpu_type

    if TEMP_HF_CACHE_PATH.is_dir():
        logger.notice("Moving contents of temp_huggingface to huggingface cache.")
        _move_files_recursively(TEMP_HF_CACHE_PATH, HF_CACHE_PATH)
        shutil.rmtree(TEMP_HF_CACHE_PATH, ignore_errors=True)
        logger.notice("Moved contents of temp_huggingface to huggingface cache.")

    # Respect the configured minimum thread count, but keep torch's default
    # if it is already higher.
    torch.set_num_threads(max(MIN_THREADS_ML_MODELS, torch.get_num_threads()))
    logger.notice(f"Torch Threads: {torch.get_num_threads()}")

    if not INDEXING_ONLY:
        logger.notice(
            "The intent model should run on the model server. The information content model should not run here."
        )
        warm_up_intent_model()
    else:
        logger.notice(
            "The information content model should run on the indexing model server. The intent model should not run here."
        )
        warm_up_information_content_model()

    yield
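
    # Code placed after the `yield` above would run on shutdown; this server
    # currently has no teardown work to do.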


def get_model_app() -> FastAPI:
    application = FastAPI(
        title="Onyx Model Server", version=__version__, lifespan=lifespan
    )
    if SENTRY_DSN:
        sentry_sdk.init(
            dsn=SENTRY_DSN,
            integrations=[StarletteIntegration(), FastApiIntegration()],
            # Sample 10% of transactions for performance tracing.
            traces_sample_rate=0.1,
        )
        logger.info("Sentry initialized")
    else:
        logger.debug("Sentry DSN not provided, skipping Sentry initialization")

    application.include_router(management_router)
    application.include_router(encoders_router)
    application.include_router(custom_models_router)

    return application


app = get_model_app()
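# Module-level ASGI app so external process managers (e.g. uvicorn, gunicorn)
# can import it directly rather than going through the __main__ block below.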


if __name__ == "__main__":
    logger.notice(
        f"Starting Onyx Model Server on http://{MODEL_SERVER_ALLOWED_HOST}:{MODEL_SERVER_PORT}/"
    )
    logger.notice(f"Model Server Version: {__version__}")
    uvicorn.run(app, host=MODEL_SERVER_ALLOWED_HOST, port=MODEL_SERVER_PORT)
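
# A hypothetical equivalent launch via the uvicorn CLI (assuming this module
# is importable as model_server.main):
#
#   uvicorn model_server.main:app --host 0.0.0.0 --port <MODEL_SERVER_PORT>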