Changed default local model to nomic (#1943)

hagen-danswer 2024-07-31 18:54:02 -07:00 committed by GitHub
parent 1654378850
commit 1be1959d80
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 78 additions and 28 deletions

View File

@@ -68,7 +68,9 @@ RUN apt-get update && \
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
# Pre-downloading models for setups with limited egress
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('intfloat/e5-base-v2')"
RUN python -c "from tokenizers import Tokenizer; \
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
# Pre-downloading NLTK for setups with limited egress
RUN python -c "import nltk; \

View File

@@ -18,14 +18,17 @@ RUN apt-get remove -y --allow-remove-essential perl-base && \
apt-get autoremove -y
# Pre-downloading models for setups with limited egress
RUN python -c "from transformers import AutoModel, AutoTokenizer, TFDistilBertForSequenceClassification; \
RUN python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('danswer/intent-model', cache_folder='/root/.cache/temp_huggingface/hub/'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_folder='/root/.cache/temp_huggingface/hub/'); \
from transformers import TFDistilBertForSequenceClassification; \
TFDistilBertForSequenceClassification.from_pretrained('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
from huggingface_hub import snapshot_download; \
AutoTokenizer.from_pretrained('danswer/intent-model'); \
AutoTokenizer.from_pretrained('intfloat/e5-base-v2'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
snapshot_download('danswer/intent-model'); \
snapshot_download('intfloat/e5-base-v2'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1')"
snapshot_download('danswer/intent-model', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('nomic-ai/nomic-embed-text-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
from sentence_transformers import SentenceTransformer; \
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True, cache_folder='/root/.cache/temp_huggingface/hub/');"
WORKDIR /app
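Since this image stages its downloads under /root/.cache/temp_huggingface (the model server later copies them into the live cache at startup, see the lifespan change below), a hedged way to verify the build step is to list the staged snapshots. The check below is illustrative only and assumes the standard huggingface_hub cache layout (models--&lt;org&gt;--&lt;name&gt; directories); it is not part of the commit.

# Illustrative verification of the build-time downloads, not Danswer code.
from pathlib import Path

temp_hf_hub = Path("/root/.cache/temp_huggingface/hub")
expected = [
    "models--danswer--intent-model",
    "models--nomic-ai--nomic-embed-text-v1",
    "models--mixedbread-ai--mxbai-rerank-xsmall-v1",
]
staged = {p.name for p in temp_hf_hub.iterdir()} if temp_hf_hub.is_dir() else set()
for name in expected:
    print(name, "ok" if name in staged else "missing")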

View File

@@ -12,7 +12,7 @@ import os
# The usable models configured below must be SentenceTransformer compatible
# NOTE: DO NOT CHANGE THESE UNLESS YOU KNOW WHAT YOU ARE DOING
# IDEALLY, YOU SHOULD CHANGE EMBEDDING MODELS VIA THE UI
DEFAULT_DOCUMENT_ENCODER_MODEL = "intfloat/e5-base-v2"
DEFAULT_DOCUMENT_ENCODER_MODEL = "nomic-ai/nomic-embed-text-v1"
DOCUMENT_ENCODER_MODEL = (
os.environ.get("DOCUMENT_ENCODER_MODEL") or DEFAULT_DOCUMENT_ENCODER_MODEL
)
@@ -34,8 +34,8 @@ OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS = False
SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
# Certain models like e5, BGE, etc. use a prefix for asymmetric retrieval (queries are generally shorter than docs)
ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "query: ")
ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "passage: ")
ASYM_QUERY_PREFIX = os.environ.get("ASYM_QUERY_PREFIX", "search_query: ")
ASYM_PASSAGE_PREFIX = os.environ.get("ASYM_PASSAGE_PREFIX", "search_document: ")
# Purely an optimization, memory limitation consideration
BATCH_SIZE_ENCODE_CHUNKS = 8
# For score display purposes, only way is to know the expected ranges
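The prefix change above follows from how nomic-embed-text-v1 was trained: queries and documents are embedded with different task prefixes, replacing the "query: "/"passage: " markers used by e5. A minimal sketch of how these prefixes are applied before encoding (illustrative only; the sample texts are made up and the real pipeline reads the prefixes from this config module):

# Not Danswer's encoder code; a sketch of asymmetric encoding with the new prefixes.
from sentence_transformers import SentenceTransformer

ASYM_QUERY_PREFIX = "search_query: "
ASYM_PASSAGE_PREFIX = "search_document: "

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

passages = ["Danswer indexes Confluence, Slack, and Google Drive."]  # sample document text
query = "which sources can Danswer index?"

doc_embs = model.encode([ASYM_PASSAGE_PREFIX + p for p in passages], normalize_embeddings=True)
query_emb = model.encode(ASYM_QUERY_PREFIX + query, normalize_embeddings=True)
print(doc_embs @ query_emb)  # dot product equals cosine similarity for normalized embeddings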

View File

@@ -116,8 +116,9 @@ def get_tokenizer(model_name: str | None, provider_type: str | None) -> BaseToke
if provider_type.lower() == "openai":
# Used across ada and text-embedding-3 models
return _check_tokenizer_cache("openai")
# If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
# this means we are approximating the token count which may leave some performance on the table
# If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
if not model_name:
raise ValueError("Need to provide a model_name or provider_type")
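The removed comment spelled out the trade-off: when a non-OpenAI cloud provider is used, the token count is approximated with whatever Hugging Face tokenizer the given model_name resolves to, which may not match the provider's own tokenizer. A hypothetical sketch of that approximation (the helper name and the fallback model are illustrative, not Danswer's actual logic):

# Hypothetical helper: approximate token counts with a locally loadable tokenizer.
# Counts are best-effort when the provider's exact tokenizer is unavailable.
from transformers import AutoTokenizer

def approximate_token_count(text: str, model_name: str) -> int:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return len(tokenizer.encode(text, add_special_tokens=False))

print(approximate_token_count("How do I reset my password?", "intfloat/e5-base-v2"))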

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter
from transformers import AutoTokenizer # type: ignore
from transformers import TFDistilBertForSequenceClassification
from danswer.utils.logger import setup_logger
from model_server.constants import MODEL_WARM_UP_STRING
from model_server.utils import simple_log_function_time
from shared_configs.configs import INDEXING_ONLY
@@ -14,6 +15,7 @@ from shared_configs.configs import INTENT_MODEL_VERSION
from shared_configs.model_server_models import IntentRequest
from shared_configs.model_server_models import IntentResponse
logger = setup_logger()
router = APIRouter(prefix="/custom")
@@ -23,7 +25,7 @@ _INTENT_MODEL: Optional[TFDistilBertForSequenceClassification] = None
def get_intent_model_tokenizer(
model_name: str = INTENT_MODEL_VERSION,
) -> "AutoTokenizer":
) -> AutoTokenizer:
global _INTENT_TOKENIZER
if _INTENT_TOKENIZER is None:
_INTENT_TOKENIZER = AutoTokenizer.from_pretrained(model_name)
@@ -44,6 +46,7 @@ def get_local_intent_model(
def warm_up_intent_model() -> None:
logger.info(f"Warming up Intent Model: {INTENT_MODEL_VERSION}")
intent_tokenizer = get_intent_model_tokenizer()
inputs = intent_tokenizer(
MODEL_WARM_UP_STRING, return_tensors="tf", truncation=True, padding=True

View File

@@ -199,7 +199,14 @@ def get_embedding_model(
if model_name not in _GLOBAL_MODELS_DICT:
logger.info(f"Loading {model_name}")
model = SentenceTransformer(model_name)
# Some model architectures that aren't built into Transformers or Sentence
# Transformers need their code to be downloaded to be loaded locally. This does
# not mean data is sent to remote servers for inference; however, the remote
# code can be fairly arbitrary, so only use trusted models
model = SentenceTransformer(
model_name_or_path=model_name,
trust_remote_code=True,
)
model.max_seq_length = max_context_length
_GLOBAL_MODELS_DICT[model_name] = model
elif max_context_length != _GLOBAL_MODELS_DICT[model_name].max_seq_length:
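The new comment covers the trade-off behind trust_remote_code: nomic-embed-text-v1 is backed by a custom nomic-bert architecture that does not ship with transformers, so its modeling code is pulled from the model repo and executed locally, while inference itself stays on the machine. A hedged sketch of what that means in practice (the error handling and the 512-token cap are illustrative; the server passes its own max_context_length):

# Illustrative only: loading the model without and with trust_remote_code.
from sentence_transformers import SentenceTransformer

try:
    SentenceTransformer("nomic-ai/nomic-embed-text-v1")  # custom architecture, remote code disallowed
except Exception as exc:  # typically an error asking to enable trust_remote_code
    print(f"load refused: {exc}")

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
model.max_seq_length = 512  # hypothetical cap for the sketch
print(model.encode("search_document: hello world").shape)  # (768,)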

View File

@@ -1,6 +1,9 @@
import asyncio
import os
import shutil
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from pathlib import Path
import torch
import uvicorn
@@ -29,6 +32,24 @@ transformer_logging.set_verbosity_error()
logger = setup_logger()
async def manage_huggingface_cache() -> None:
temp_hf_cache = Path("/root/.cache/temp_huggingface")
hf_cache = Path("/root/.cache/huggingface")
if temp_hf_cache.is_dir() and any(temp_hf_cache.iterdir()):
hf_cache.mkdir(parents=True, exist_ok=True)
for item in temp_hf_cache.iterdir():
if item.is_dir():
await asyncio.to_thread(
shutil.copytree, item, hf_cache / item.name, dirs_exist_ok=True
)
else:
await asyncio.to_thread(shutil.copy2, item, hf_cache)
await asyncio.to_thread(shutil.rmtree, temp_hf_cache)
logger.info("Copied contents of temp_huggingface and deleted the directory.")
else:
logger.info("Source directory is empty or does not exist. Skipping copy.")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
if torch.cuda.is_available():
@@ -36,6 +57,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
else:
logger.info("GPU is not available")
await manage_huggingface_cache()
torch.set_num_threads(max(MIN_THREADS_ML_MODELS, torch.get_num_threads()))
logger.info(f"Torch Threads: {torch.get_num_threads()}")

View File

@@ -1,3 +1,4 @@
einops==0.8.0
fastapi==0.109.2
h5py==3.9.0
pydantic==1.10.13

View File

@@ -8,7 +8,7 @@ For general information, please read the instructions in this [README](https://g
This part is elaborated precisely in this [README](https://github.com/danswer-ai/danswer/blob/main/deployment/README.md) in the section *Docker Compose*. If you have any questions, please feel free to open an issue or get in touch on Slack for support.
## Deploy in a system with GPU support
Running Model servers with GPU support while indexing and querying can result in significant improvements in performance. This is highly recommended if you have access to resources. Currently, Danswer offloads embedding model and tokenizers to the GPU VRAM and the size needed depends on chosen embedding model. Default embedding models `intfloat/e5-base-v2` takes up about 1GB of VRAM and since we need this for inference and embedding pipeline, you would need roughly 2GB of VRAM.
Running Model servers with GPU support while indexing and querying can result in significant performance improvements. This is highly recommended if you have access to the resources. Currently, Danswer offloads the embedding model and tokenizers to GPU VRAM, and the size needed depends on the chosen embedding model. For example, the embedding model `nomic-ai/nomic-embed-text-v1` takes up about 1GB of VRAM. Since the model is needed for both the inference and embedding pipelines, running it requires roughly 2GB of VRAM.
### Setup
To be able to use the NVIDIA runtime, the following is mandatory:

View File

@@ -44,7 +44,7 @@ export function CustomModelForm({
name="model_name"
label="Name:"
subtext="The name of the model on Hugging Face"
placeholder="E.g. 'intfloat/e5-base-v2'"
placeholder="E.g. 'nomic-ai/nomic-embed-text-v1'"
autoCompleteDisabled={true}
/>

View File

@@ -67,12 +67,22 @@ export interface CloudEmbeddingProviderFull extends CloudEmbeddingProvider {
export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
{
model_name: "intfloat/e5-base-v2",
model_name: "nomic-ai/nomic-embed-text-v1",
model_dim: 768,
normalize: true,
description:
"The recommended default for most situations. If you aren't sure which model to use, this is probably the one.",
isDefault: true,
link: "https://huggingface.co/nomic-ai/nomic-embed-text-v1",
query_prefix: "search_query: ",
passage_prefix: "search_document: ",
},
{
model_name: "intfloat/e5-base-v2",
model_dim: 768,
normalize: true,
description:
"A smaller and faster model than the default. It is around 2x faster than the default model at the cost of lower search quality.",
link: "https://huggingface.co/intfloat/e5-base-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -82,7 +92,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"A smaller / faster version of the default model. If you're running Danswer on a resource constrained system, then this is a good choice.",
"The smallest and fastest version of the E5 line of models. If you're running Danswer on a resource constrained system, then this may be a good choice.",
link: "https://huggingface.co/intfloat/e5-small-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -92,7 +102,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 768,
normalize: true,
description:
"If you have many documents in other languages besides English, this is the one to go for.",
"For corpora in other languages besides English, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-base",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -102,7 +112,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"If you have many documents in other languages besides English, and you're running on a resource constrained system, then this is the one to go for.",
"For corpora in other languages besides English, as well as being on a resource constrained system, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-base",
query_prefix: "query: ",
passage_prefix: "passage: ",

View File

@@ -265,8 +265,8 @@ function Main() {
return (
<div className="h-screen">
<Text>
Embedding models are used to generate embeddings for your documents,
which then power Danswer&apos;s search.
These deep learning models are used to generate vector representations
of your documents, which then power Danswer&apos;s search.
</Text>
{alreadySelectedModel && (
@@ -359,12 +359,12 @@ function Main() {
<>
<Title className="mt-8">Switch your Embedding Model</Title>
<Text className="mb-4">
If the current model is not working for you, you can update your
model choice below. Note that this will require a complete
re-indexing of all your documents across every connected source. We
will take care of this in the background, but depending on the size
of your corpus, this could take hours, day, or even weeks. You can
monitor the progress of the re-indexing on this page.
Note that updating the backing model will require a complete
re-indexing of all documents across every connected source. This is
taken care of in the background so that the system can continue to
be used, but depending on the size of the corpus, this could take
hours or days. You can monitor the progress of the re-indexing on
this page while the models are being switched.
</Text>
<div className="mt-8 text-sm mr-auto mb-12 divide-x-2 flex">