Changed default local model to nomic (#1943)

hagen-danswer 2024-07-31 18:54:02 -07:00 committed by GitHub
parent 1654378850
commit 1be1959d80
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 78 additions and 28 deletions

View File

@@ -68,7 +68,9 @@ RUN apt-get update && \
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
# Pre-downloading models for setups with limited egress
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('intfloat/e5-base-v2')"
RUN python -c "from tokenizers import Tokenizer; \
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
# Pre-downloading NLTK for setups with limited egress
RUN python -c "import nltk; \

View File

@@ -18,14 +18,17 @@ RUN apt-get remove -y --allow-remove-essential perl-base && \
apt-get autoremove -y
# Pre-downloading models for setups with limited egress
RUN python -c "from transformers import AutoModel, AutoTokenizer, TFDistilBertForSequenceClassification; \
RUN python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('danswer/intent-model', cache_folder='/root/.cache/temp_huggingface/hub/'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_folder='/root/.cache/temp_huggingface/hub/'); \
from transformers import TFDistilBertForSequenceClassification; \
TFDistilBertForSequenceClassification.from_pretrained('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
from huggingface_hub import snapshot_download; \
AutoTokenizer.from_pretrained('danswer/intent-model'); \
AutoTokenizer.from_pretrained('intfloat/e5-base-v2'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
snapshot_download('danswer/intent-model'); \
snapshot_download('intfloat/e5-base-v2'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1')"
snapshot_download('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('nomic-ai/nomic-embed-text-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
from sentence_transformers import SentenceTransformer; \
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True, cache_folder='/root/.cache/temp_huggingface/hub/');"
WORKDIR /app
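Since this image stages its downloads under /root/.cache/temp_huggingface (the model server later copies them into the live cache at startup, see the lifespan change below), a hedged way to verify the build step is to list the staged snapshots. The check below is illustrative only and assumes the standard huggingface_hub cache layout (models--&lt;org&gt;--&lt;name&gt; directories); it is not part of the commit.

# Illustrative verification of the build-time downloads, not Danswer code.
from pathlib import Path

temp_hf_hub = Path("/root/.cache/temp_huggingface/hub")
expected = [
    "models--danswer--intent-model",
    "models--nomic-ai--nomic-embed-text-v1",
    "models--mixedbread-ai--mxbai-rerank-xsmall-v1",
]
staged = {p.name for p in temp_hf_hub.iterdir()} if temp_hf_hub.is_dir() else set()
for name in expected:
    print(name, "ok" if name in staged else "missing")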

View File

@@ -12,7 +12,7 @@ import os
# The usable models configured below must be SentenceTransformer compatible
# NOTE: DO NOT CHANGE THESE UNLESS YOU KNOW WHAT YOU ARE DOING
# IDEALLY, YOU SHOULD CHANGE EMBEDDING MODELS VIA THE UI
DEFAULT_DOCUMENT_ENCODER_MODEL = "intfloat/e5-base-v2"
DEFAULT_DOCUMENT_ENCODER_MODEL = "nomic-ai/nomic-embed-text-v1"
DOCUMENT_ENCODER_MODEL = (
os.environ.get("DOCUMENT_ENCODER_MODEL") or DEFAULT_DOCUMENT_ENCODER_MODEL
)
@@ -34,8 +34,8 @@ OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS = False
SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
# Certain models like e5, BGE, etc. use a prefix for asymmetric retrieval (queries are generally shorter than docs)
ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "query: ")
ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "passage: ")
ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "search_query: ")
ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "search_document: ")
# Purely an optimization, memory limitation consideration
BATCH_SIZE_ENCODE_CHUNKS = 8
# For score display purposes, only way is to know the expected ranges
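The prefix change above follows from how nomic-embed-text-v1 was trained: queries and documents are embedded with different task prefixes, replacing the "query: "/"passage: " markers used by e5. A minimal sketch of how these prefixes are applied before encoding (illustrative only; the sample texts are made up and the real pipeline reads the prefixes from this config module):

# Not Danswer's encoder code; a sketch of asymmetric encoding with the new prefixes.
from sentence_transformers import SentenceTransformer

ASYM_QUERY_PREFIX = "search_query: "
ASYM_PASSAGE_PREFIX = "search_document: "

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

passages = ["Danswer indexes Confluence, Slack, and Google Drive."]  # sample document text
query = "which sources can Danswer index?"

doc_embs = model.encode([ASYM_PASSAGE_PREFIX + p for p in passages], normalize_embeddings=True)
query_emb = model.encode(ASYM_QUERY_PREFIX + query, normalize_embeddings=True)
print(doc_embs @ query_emb)  # dot product equals cosine similarity for normalized embeddings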

View File

@@ -116,8 +116,9 @@ def get_tokenizer(model_name: str | None, provider_type: str | None) -> BaseToke
if provider_type.lower() == "openai":
# Used across ada and text-embedding-3 models
return _check_tokenizer_cache("openai")
# If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
# this means we are approximating the token count which may leave some performance on the table
# If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
if not model_name:
raise ValueError("Need to provide a model_name or provider_type")
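The removed comment spelled out the trade-off: when a non-OpenAI cloud provider is used, the token count is approximated with whatever Hugging Face tokenizer the given model_name resolves to, which may not match the provider's own tokenizer. A hypothetical sketch of that approximation (the helper name and the fallback model are illustrative, not Danswer's actual logic):

# Hypothetical helper: approximate token counts with a locally loadable tokenizer.
# Counts are best-effort when the provider's exact tokenizer is unavailable.
from transformers import AutoTokenizer

def approximate_token_count(text: str, model_name: str) -> int:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return len(tokenizer.encode(text, add_special_tokens=False))

print(approximate_token_count("How do I reset my password?", "intfloat/e5-base-v2"))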

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter
from transformers import AutoTokenizer # type: ignore
from transformers import TFDistilBertForSequenceClassification
from danswer.utils.logger import setup_logger
from model_server.constants import MODEL_WARM_UP_STRING
from model_server.utils import simple_log_function_time
from shared_configs.configs import INDEXING_ONLY
@@ -14,6 +15,7 @@ from shared_configs.configs import INTENT_MODEL_VERSION
from shared_configs.model_server_models import IntentRequest
from shared_configs.model_server_models import IntentResponse
logger = setup_logger()
router = APIRouter(prefix="/custom")
@@ -23,7 +25,7 @@ _INTENT_MODEL: Optional[TFDistilBertForSequenceClassification] = None
def get_intent_model_tokenizer(
model_name: str = INTENT_MODEL_VERSION,
) -> "AutoTokenizer":
) -> AutoTokenizer:
global _INTENT_TOKENIZER
if _INTENT_TOKENIZER is None:
_INTENT_TOKENIZER = AutoTokenizer.from_pretrained(model_name)
@@ -44,6 +46,7 @@ def get_local_intent_model(
def warm_up_intent_model() -> None:
logger.info(f"Warming up Intent Model: {INTENT_MODEL_VERSION}")
intent_tokenizer = get_intent_model_tokenizer()
inputs = intent_tokenizer(
MODEL_WARM_UP_STRING, return_tensors="tf", truncation=True, padding=True

View File

@@ -199,7 +199,14 @@ def get_embedding_model(
if model_name not in _GLOBAL_MODELS_DICT:
logger.info(f"Loading {model_name}")
model = SentenceTransformer(model_name)
# Some model architectures that aren't built into Transformers or Sentence
# Transformers need their code to be downloaded to be loaded locally. This does
# not mean data is sent to remote servers for inference; however, the remote
# code can be fairly arbitrary, so only use trusted models
model = SentenceTransformer(
model_name_or_path=model_name,
trust_remote_code=True,
)
model.max_seq_length = max_context_length
_GLOBAL_MODELS_DICT[model_name] = model
elif max_context_length != _GLOBAL_MODELS_DICT[model_name].max_seq_length:
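The new comment covers the trade-off behind trust_remote_code: nomic-embed-text-v1 is backed by a custom nomic-bert architecture that does not ship with transformers, so its modeling code is pulled from the model repo and executed locally, while inference itself stays on the machine. A hedged sketch of what that means in practice (the error handling and the 512-token cap are illustrative; the server passes its own max_context_length):

# Illustrative only: loading the model without and with trust_remote_code.
from sentence_transformers import SentenceTransformer

try:
    SentenceTransformer("nomic-ai/nomic-embed-text-v1")  # custom architecture, remote code disallowed
except Exception as exc:  # typically an error asking to enable trust_remote_code
    print(f"load refused: {exc}")

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
model.max_seq_length = 512  # hypothetical cap for the sketch
print(model.encode("search_document: hello world").shape)  # (768,)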

View File

@@ -1,6 +1,9 @@
import asyncio
import os
import shutil
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from pathlib import Path
import torch
import uvicorn
@@ -29,6 +32,24 @@ transformer_logging.set_verbosity_error()
logger = setup_logger()
async def manage_huggingface_cache() -> None:
temp_hf_cache = Path("/root/.cache/temp_huggingface")
hf_cache = Path("/root/.cache/huggingface")
if temp_hf_cache.is_dir() and any(temp_hf_cache.iterdir()):
hf_cache.mkdir(parents=True, exist_ok=True)
for item in temp_hf_cache.iterdir():
if item.is_dir():
await asyncio.to_thread(
shutil.copytree, item, hf_cache / item.name, dirs_exist_ok=True
)
else:
await asyncio.to_thread(shutil.copy2, item, hf_cache)
await asyncio.to_thread(shutil.rmtree, temp_hf_cache)
logger.info("Copied contents of temp_huggingface and deleted the directory.")
else:
logger.info("Source directory is empty or does not exist. Skipping copy.")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
if torch.cuda.is_available():
@@ -36,6 +57,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
else:
logger.info("GPU is not available")
await manage_huggingface_cache()
torch.set_num_threads(max(MIN_THREADS_ML_MODELS, torch.get_num_threads()))
logger.info(f"Torch Threads: {torch.get_num_threads()}")

View File

@@ -1,3 +1,4 @@
einops==0.8.0
fastapi==0.109.2
h5py==3.9.0
pydantic==1.10.13

View File

@@ -8,7 +8,7 @@ For general information, please read the instructions in this [README](https://g
This part is elaborated precisely in this [README](https://github.com/danswer-ai/danswer/blob/main/deployment/README.md) in the section *Docker Compose*. If you have any questions, please feel free to open an issue or get in touch on Slack for support.
## Deploy in a system with GPU support
Running Model servers with GPU support while indexing and querying can result in significant improvements in performance. This is highly recommended if you have access to resources. Currently, Danswer offloads embedding model and tokenizers to the GPU VRAM and the size needed depends on chosen embedding model. Default embedding models `intfloat/e5-base-v2` takes up about 1GB of VRAM and since we need this for inference and embedding pipeline, you would need roughly 2GB of VRAM.
Running Model servers with GPU support while indexing and querying can result in significant performance improvements. This is highly recommended if you have access to the resources. Currently, Danswer offloads the embedding model and tokenizers to GPU VRAM, and the size needed depends on the chosen embedding model. For example, the embedding model `nomic-ai/nomic-embed-text-v1` takes up about 1GB of VRAM. Since the model is needed for both the inference and embedding pipelines, running it requires roughly 2GB of VRAM.
### Setup
To be able to use the NVIDIA runtime, the following is mandatory:

View File

@@ -44,7 +44,7 @@ export function CustomModelForm({
name="model_name"
label="Name:"
subtext="The name of the model on Hugging Face"
placeholder="E.g. 'intfloat/e5-base-v2'"
placeholder="E.g. 'nomic-ai/nomic-embed-text-v1'"
autoCompleteDisabled={true}
/>

View File

@@ -67,12 +67,22 @@ export interface CloudEmbeddingProviderFull extends CloudEmbeddingProvider {
export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
{
model_name: "intfloat/e5-base-v2",
model_name: "nomic-ai/nomic-embed-text-v1",
model_dim: 768,
normalize: true,
description:
"The recommended default for most situations. If you aren't sure which model to use, this is probably the one.",
isDefault: true,
link: "https://huggingface.co/nomic-ai/nomic-embed-text-v1",
query_prefix: "search_query: ",
passage_prefix: "search_document: ",
},
{
model_name: "intfloat/e5-base-v2",
model_dim: 768,
normalize: true,
description:
"A smaller and faster model than the default. It is around 2x faster than the default model at the cost of lower search quality.",
link: "https://huggingface.co/intfloat/e5-base-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -82,7 +92,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"A smaller / faster version of the default model. If you're running Danswer on a resource constrained system, then this is a good choice.",
"The smallest and fastest version of the E5 line of models. If you're running Danswer on a resource constrained system, then this may be a good choice.",
link: "https://huggingface.co/intfloat/e5-small-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -92,7 +102,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 768,
normalize: true,
description:
"If you have many documents in other languages besides English, this is the one to go for.",
"For corpora in other languages besides English, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-base",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -102,7 +112,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"If you have many documents in other languages besides English, and you're running on a resource constrained system, then this is the one to go for.",
"For corpora in other languages besides English, as well as being on a resource constrained system, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-base",
query_prefix: "query: ",
passage_prefix: "passage: ",

View File

@@ -265,8 +265,8 @@ function Main() {
return (
<div className="h-screen">
<Text>
Embedding models are used to generate embeddings for your documents,
which then power Danswer&apos;s search.
These deep learning models are used to generate vector representations
of your documents, which then power Danswer&apos;s search.
</Text>
{alreadySelectedModel && (
@@ -359,12 +359,12 @@ function Main() {
<>
<Title className="mt-8">Switch your Embedding Model</Title>
<Text className="mb-4">
If the current model is not working for you, you can update your
model choice below. Note that this will require a complete
re-indexing of all your documents across every connected source. We
will take care of this in the background, but depending on the size
of your corpus, this could take hours, day, or even weeks. You can
monitor the progress of the re-indexing on this page.
Note that updating the backing model will require a complete
re-indexing of all documents across every connected source. This is
taken care of in the background so that the system can continue to
be used, but depending on the size of the corpus, this could take
hours or days. You can monitor the progress of the re-indexing on
this page while the models are being switched.
</Text>
<div className="mt-8 text-sm mr-auto mb-12 divide-x-2 flex">