diff --git a/backend/Dockerfile b/backend/Dockerfile
index 7f9daad94..17e0be8c2 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -68,7 +68,9 @@ RUN apt-get update && \
     rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
 
 # Pre-downloading models for setups with limited egress
-RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('intfloat/e5-base-v2')"
+RUN python -c "from tokenizers import Tokenizer; \
+Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
+
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
diff --git a/backend/Dockerfile.model_server b/backend/Dockerfile.model_server
index 89f24e2ac..21efebca2 100644
--- a/backend/Dockerfile.model_server
+++ b/backend/Dockerfile.model_server
@@ -18,14 +18,17 @@ RUN apt-get remove -y --allow-remove-essential perl-base && \
     apt-get autoremove -y
 
 # Pre-downloading models for setups with limited egress
-RUN python -c "from transformers import AutoModel, AutoTokenizer, TFDistilBertForSequenceClassification; \
+RUN python -c "from transformers import AutoTokenizer; \
+AutoTokenizer.from_pretrained('danswer/intent-model', cache_folder='/root/.cache/temp_huggingface/hub/'); \
+AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_folder='/root/.cache/temp_huggingface/hub/'); \
+from transformers import TFDistilBertForSequenceClassification; \
+TFDistilBertForSequenceClassification.from_pretrained('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
 from huggingface_hub import snapshot_download; \
-AutoTokenizer.from_pretrained('danswer/intent-model'); \
-AutoTokenizer.from_pretrained('intfloat/e5-base-v2'); \
-AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
-snapshot_download('danswer/intent-model'); \
-snapshot_download('intfloat/e5-base-v2'); \
-snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1')"
+snapshot_download('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
+snapshot_download('nomic-ai/nomic-embed-text-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
+snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
+from sentence_transformers import SentenceTransformer; \
+SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True, cache_folder='/root/.cache/temp_huggingface/hub/');"
 
 WORKDIR /app
diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
index b4c0e8cf2..0619757b8 100644
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -12,7 +12,7 @@ import os
 # The useable models configured as below must be SentenceTransformer compatible
 # NOTE: DO NOT CHANGE SET THESE UNLESS YOU KNOW WHAT YOU ARE DOING
 # IDEALLY, YOU SHOULD CHANGE EMBEDDING MODELS VIA THE UI
-DEFAULT_DOCUMENT_ENCODER_MODEL = "intfloat/e5-base-v2"
+DEFAULT_DOCUMENT_ENCODER_MODEL = "nomic-ai/nomic-embed-text-v1"
 DOCUMENT_ENCODER_MODEL = (
     os.environ.get("DOCUMENT_ENCODER_MODEL") or DEFAULT_DOCUMENT_ENCODER_MODEL
 )
@@ -34,8 +34,8 @@ OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS = False
 SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
 SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
 # Certain models like e5, BGE, etc use a prefix for asymmetric retrievals (query generally shorter than docs)
-ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "query: ")
-ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "passage: ")
+ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "search_query: ")
+ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "search_document: ")
 # Purely an optimization, memory limitation consideration
 BATCH_SIZE_ENCODE_CHUNKS = 8
 # For score display purposes, only way is to know the expected ranges
diff --git a/backend/danswer/natural_language_processing/utils.py b/backend/danswer/natural_language_processing/utils.py
index beef56833..30726033f 100644
--- a/backend/danswer/natural_language_processing/utils.py
+++ b/backend/danswer/natural_language_processing/utils.py
@@ -116,8 +116,9 @@ def get_tokenizer(model_name: str | None, provider_type: str | None) -> BaseToke
         if provider_type.lower() == "openai":
             # Used across ada and text-embedding-3 models
             return _check_tokenizer_cache("openai")
+        # If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
+        # this means we are approximating the token count, which may leave some performance on the table
 
-    # If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
     if not model_name:
         raise ValueError("Need to provide a model_name or provider_type")
diff --git a/backend/model_server/custom_models.py b/backend/model_server/custom_models.py
index ee97ded78..c4aaf6ffa 100644
--- a/backend/model_server/custom_models.py
+++ b/backend/model_server/custom_models.py
@@ -6,6 +6,7 @@ from fastapi import APIRouter
 from transformers import AutoTokenizer  # type: ignore
 from transformers import TFDistilBertForSequenceClassification
 
+from danswer.utils.logger import setup_logger
 from model_server.constants import MODEL_WARM_UP_STRING
 from model_server.utils import simple_log_function_time
 from shared_configs.configs import INDEXING_ONLY
@@ -14,6 +15,7 @@ from shared_configs.configs import INTENT_MODEL_VERSION
 from shared_configs.model_server_models import IntentRequest
 from shared_configs.model_server_models import IntentResponse
 
+logger = setup_logger()
 
 router = APIRouter(prefix="/custom")
 
@@ -23,7 +25,7 @@ _INTENT_MODEL: Optional[TFDistilBertForSequenceClassification] = None
 
 def get_intent_model_tokenizer(
     model_name: str = INTENT_MODEL_VERSION,
-) -> "AutoTokenizer":
+) -> AutoTokenizer:
     global _INTENT_TOKENIZER
     if _INTENT_TOKENIZER is None:
         _INTENT_TOKENIZER = AutoTokenizer.from_pretrained(model_name)
@@ -44,6 +46,7 @@ def get_local_intent_model(
 
 
 def warm_up_intent_model() -> None:
+    logger.info(f"Warming up Intent Model: {INTENT_MODEL_VERSION}")
     intent_tokenizer = get_intent_model_tokenizer()
     inputs = intent_tokenizer(
         MODEL_WARM_UP_STRING, return_tensors="tf", truncation=True, padding=True
diff --git a/backend/model_server/encoders.py b/backend/model_server/encoders.py
index fca065280..350d4c222 100644
--- a/backend/model_server/encoders.py
+++ b/backend/model_server/encoders.py
@@ -199,7 +199,14 @@ def get_embedding_model(
 
     if model_name not in _GLOBAL_MODELS_DICT:
         logger.info(f"Loading {model_name}")
-        model = SentenceTransformer(model_name)
+        # Some model architectures that aren't built into Transformers or Sentence
+        # Transformers need their custom code downloaded before they can be loaded locally.
+        # This does not mean data is sent to remote servers for inference; however, the
+        # remote code can be fairly arbitrary, so only use trusted models
+        model = SentenceTransformer(
+            model_name_or_path=model_name,
+            trust_remote_code=True,
+        )
         model.max_seq_length = max_context_length
         _GLOBAL_MODELS_DICT[model_name] = model
     elif max_context_length != _GLOBAL_MODELS_DICT[model_name].max_seq_length:
diff --git a/backend/model_server/main.py b/backend/model_server/main.py
index 1aaf95678..87059d634 100644
--- a/backend/model_server/main.py
+++ b/backend/model_server/main.py
@@ -1,6 +1,9 @@
+import asyncio
 import os
+import shutil
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
+from pathlib import Path
 
 import torch
 import uvicorn
@@ -29,6 +32,24 @@ transformer_logging.set_verbosity_error()
 
 logger = setup_logger()
 
+async def manage_huggingface_cache() -> None:
+    temp_hf_cache = Path("/root/.cache/temp_huggingface")
+    hf_cache = Path("/root/.cache/huggingface")
+    if temp_hf_cache.is_dir() and any(temp_hf_cache.iterdir()):
+        hf_cache.mkdir(parents=True, exist_ok=True)
+        for item in temp_hf_cache.iterdir():
+            if item.is_dir():
+                await asyncio.to_thread(
+                    shutil.copytree, item, hf_cache / item.name, dirs_exist_ok=True
+                )
+            else:
+                await asyncio.to_thread(shutil.copy2, item, hf_cache)
+        await asyncio.to_thread(shutil.rmtree, temp_hf_cache)
+        logger.info("Copied contents of temp_huggingface and deleted the directory.")
+    else:
+        logger.info("Source directory is empty or does not exist. Skipping copy.")
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator:
     if torch.cuda.is_available():
@@ -36,6 +57,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     else:
         logger.info("GPU is not available")
 
+    await manage_huggingface_cache()
+
     torch.set_num_threads(max(MIN_THREADS_ML_MODELS, torch.get_num_threads()))
     logger.info(f"Torch Threads: {torch.get_num_threads()}")
diff --git a/backend/requirements/model_server.txt b/backend/requirements/model_server.txt
index 19cbfcc62..9a291c224 100644
--- a/backend/requirements/model_server.txt
+++ b/backend/requirements/model_server.txt
@@ -1,3 +1,4 @@
+einops==0.8.0
 fastapi==0.109.2
 h5py==3.9.0
 pydantic==1.10.13
diff --git a/deployment/docker_compose/README.md b/deployment/docker_compose/README.md
index a12f22bea..a5f650b53 100644
--- a/deployment/docker_compose/README.md
+++ b/deployment/docker_compose/README.md
@@ -8,7 +8,7 @@ For general information, please read the instructions in this [README](https://g
 This part is elaborated precisely in in this [README](https://github.com/danswer-ai/danswer/blob/main/deployment/README.md) in section *Docker Compose*. If you have any questions, please feel free to open an issue or get in touch in slack for support.
 
 ## Deploy in a system with GPU support
-Running Model servers with GPU support while indexing and querying can result in significant improvements in performance. This is highly recommended if you have access to resources. Currently, Danswer offloads embedding model and tokenizers to the GPU VRAM and the size needed depends on chosen embedding model. Default embedding models `intfloat/e5-base-v2` takes up about 1GB of VRAM and since we need this for inference and embedding pipeline, you would need roughly 2GB of VRAM.
+Running Model servers with GPU support while indexing and querying can result in significant improvements in performance. This is highly recommended if you have access to resources. Currently, Danswer offloads the embedding model and tokenizers to GPU VRAM, and the amount needed depends on the chosen embedding model. For example, the embedding model `nomic-ai/nomic-embed-text-v1` takes up about 1GB of VRAM. Since the model is loaded for both the inference and the embedding pipelines, running it requires roughly 2GB of VRAM.
 
 ### Setup
 To be able to use NVIDIA runtime, following is mandatory:
diff --git a/web/src/app/admin/models/embedding/components/CustomModelForm.tsx b/web/src/app/admin/models/embedding/components/CustomModelForm.tsx
index 78e2129eb..ab8a4a2f3 100644
--- a/web/src/app/admin/models/embedding/components/CustomModelForm.tsx
+++ b/web/src/app/admin/models/embedding/components/CustomModelForm.tsx
@@ -44,7 +44,7 @@ export function CustomModelForm({
             name="model_name"
             label="Name:"
             subtext="The name of the model on Hugging Face"
-            placeholder="E.g. 'intfloat/e5-base-v2'"
+            placeholder="E.g. 'nomic-ai/nomic-embed-text-v1'"
             autoCompleteDisabled={true}
           />
diff --git a/web/src/app/admin/models/embedding/components/types.ts b/web/src/app/admin/models/embedding/components/types.ts
index 5258f69ab..3a97ee400 100644
--- a/web/src/app/admin/models/embedding/components/types.ts
+++ b/web/src/app/admin/models/embedding/components/types.ts
@@ -67,12 +67,22 @@ export interface CloudEmbeddingProviderFull extends CloudEmbeddingProvider {
 
 export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
   {
-    model_name: "intfloat/e5-base-v2",
+    model_name: "nomic-ai/nomic-embed-text-v1",
     model_dim: 768,
     normalize: true,
     description:
       "The recommended default for most situations. If you aren't sure which model to use, this is probably the one.",
     isDefault: true,
+    link: "https://huggingface.co/nomic-ai/nomic-embed-text-v1",
+    query_prefix: "search_query: ",
+    passage_prefix: "search_document: ",
+  },
+  {
+    model_name: "intfloat/e5-base-v2",
+    model_dim: 768,
+    normalize: true,
+    description:
+      "A smaller and faster model than the default. It is around 2x faster than the default model at the cost of lower search quality.",
     link: "https://huggingface.co/intfloat/e5-base-v2",
     query_prefix: "query: ",
     passage_prefix: "passage: ",
@@ -82,7 +92,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
     model_dim: 384,
     normalize: true,
     description:
-      "A smaller / faster version of the default model. If you're running Danswer on a resource constrained system, then this is a good choice.",
+      "The smallest and fastest version of the E5 line of models. If you're running Danswer on a resource-constrained system, then this may be a good choice.",
     link: "https://huggingface.co/intfloat/e5-small-v2",
     query_prefix: "query: ",
     passage_prefix: "passage: ",
@@ -92,7 +102,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
     model_dim: 768,
     normalize: true,
     description:
-      "If you have many documents in other languages besides English, this is the one to go for.",
+      "For corpora in languages other than English, this is the one to choose.",
     link: "https://huggingface.co/intfloat/multilingual-e5-base",
     query_prefix: "query: ",
     passage_prefix: "passage: ",
@@ -102,7 +112,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
     model_dim: 384,
     normalize: true,
     description:
-      "If you have many documents in other languages besides English, and you're running on a resource constrained system, then this is the one to go for.",
+      "For corpora in languages other than English on a resource-constrained system, this is the one to choose.",
     link: "https://huggingface.co/intfloat/multilingual-e5-base",
     query_prefix: "query: ",
     passage_prefix: "passage: ",
diff --git a/web/src/app/admin/models/embedding/page.tsx b/web/src/app/admin/models/embedding/page.tsx
index f36144806..fc7cd9228 100644
--- a/web/src/app/admin/models/embedding/page.tsx
+++ b/web/src/app/admin/models/embedding/page.tsx
@@ -265,8 +265,8 @@ function Main() {
   return (
-        Embedding models are used to generate embeddings for your documents,
-        which then power Danswer's search.
+        These deep learning models are used to generate vector representations
+        of your documents, which then power Danswer's search.
 
       {alreadySelectedModel && (
@@ -359,12 +359,12 @@ function Main() {
           <>
             Switch your Embedding Model
-            If the current model is not working for you, you can update your
-            model choice below. Note that this will require a complete
-            re-indexing of all your documents across every connected source. We
-            will take care of this in the background, but depending on the size
-            of your corpus, this could take hours, day, or even weeks. You can
-            monitor the progress of the re-indexing on this page.
+            Note that updating the backing model will require a complete
+            re-indexing of all documents across every connected source. This is
+            taken care of in the background so that the system can continue to
+            be used, but depending on the size of the corpus, this could take
+            hours or days. You can monitor the progress of the re-indexing on
+            this page while the models are being switched.
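
To see how the pieces above fit together, here is a minimal standalone Python sketch (illustrative only, not part of the patch) that loads the new default model with trust_remote_code=True, mirroring backend/model_server/encoders.py, and applies the new asymmetric prefixes from backend/danswer/configs/model_configs.py. It assumes sentence-transformers and the newly pinned einops are installed and that the model weights can be downloaded; the query and passage strings are made up for illustration.

# Minimal sketch (illustrative, not part of the patch): embed a query and a
# passage with the new default model and the new asymmetric prefixes.
from sentence_transformers import SentenceTransformer

ASYM_QUERY_PREFIX = "search_query: "
ASYM_PASSAGE_PREFIX = "search_document: "

# trust_remote_code=True is needed because nomic-embed-text-v1 ships custom
# modeling code on Hugging Face; only enable this for models you trust.
model = SentenceTransformer(
    model_name_or_path="nomic-ai/nomic-embed-text-v1",
    trust_remote_code=True,
)

# Queries and passages get different prefixes for asymmetric retrieval.
query_embedding = model.encode(
    [ASYM_QUERY_PREFIX + "How much VRAM does the embedding model need?"],
    normalize_embeddings=True,
)
passage_embedding = model.encode(
    [ASYM_PASSAGE_PREFIX + "The embedding model takes up about 1GB of VRAM."],
    normalize_embeddings=True,
)

# With normalized embeddings, cosine similarity reduces to a dot product.
print((query_embedding @ passage_embedding.T)[0, 0])

nomic-embed-text-v1 was trained with these task prefixes, which is why the ASYM_QUERY_PREFIX and ASYM_PASSAGE_PREFIX defaults change along with the model; omitting the prefixes noticeably degrades retrieval quality.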