Changed default local model to nomic (#1943)

This commit is contained in:
hagen-danswer
2024-07-31 18:54:02 -07:00
committed by GitHub
parent 1654378850
commit 1be1959d80
12 changed files with 78 additions and 28 deletions

View File

@@ -44,7 +44,7 @@ export function CustomModelForm({
name="model_name"
label="Name:"
subtext="The name of the model on Hugging Face"
placeholder="E.g. 'intfloat/e5-base-v2'"
placeholder="E.g. 'nomic-ai/nomic-embed-text-v1'"
autoCompleteDisabled={true}
/>

View File

@@ -67,12 +67,22 @@ export interface CloudEmbeddingProviderFull extends CloudEmbeddingProvider {
export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
{
model_name: "intfloat/e5-base-v2",
model_name: "nomic-ai/nomic-embed-text-v1",
model_dim: 768,
normalize: true,
description:
"The recommended default for most situations. If you aren't sure which model to use, this is probably the one.",
isDefault: true,
link: "https://huggingface.co/nomic-ai/nomic-embed-text-v1",
query_prefix: "search_query: ",
passage_prefix: "search_document: ",
},
{
model_name: "intfloat/e5-base-v2",
model_dim: 768,
normalize: true,
description:
"A smaller and faster model than the default. It is around 2x faster than the default model at the cost of lower search quality.",
link: "https://huggingface.co/intfloat/e5-base-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -82,7 +92,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"A smaller / faster version of the default model. If you're running Danswer on a resource constrained system, then this is a good choice.",
"The smallest and fastest version of the E5 line of models. If you're running Danswer on a resource constrained system, then this may be a good choice.",
link: "https://huggingface.co/intfloat/e5-small-v2",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -92,7 +102,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 768,
normalize: true,
description:
"If you have many documents in other languages besides English, this is the one to go for.",
"For corpora in other languages besides English, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-base",
query_prefix: "query: ",
passage_prefix: "passage: ",
@@ -102,7 +112,7 @@ export const AVAILABLE_MODELS: HostedEmbeddingModel[] = [
model_dim: 384,
normalize: true,
description:
"If you have many documents in other languages besides English, and you're running on a resource constrained system, then this is the one to go for.",
"For corpora in other languages besides English, as well as being on a resource constrained system, this is the one to choose.",
link: "https://huggingface.co/intfloat/multilingual-e5-small",
query_prefix: "query: ",
passage_prefix: "passage: ",

View File

@@ -265,8 +265,8 @@ function Main() {
return (
<div className="h-screen">
<Text>
Embedding models are used to generate embeddings for your documents,
which then power Danswer&apos;s search.
These deep learning models are used to generate vector representations
of your documents, which then power Danswer&apos;s search.
</Text>
{alreadySelectedModel && (
@@ -359,12 +359,12 @@ function Main() {
<>
<Title className="mt-8">Switch your Embedding Model</Title>
<Text className="mb-4">
If the current model is not working for you, you can update your
model choice below. Note that this will require a complete
re-indexing of all your documents across every connected source. We
will take care of this in the background, but depending on the size
of your corpus, this could take hours, days, or even weeks. You can
monitor the progress of the re-indexing on this page.
Note that updating the backing model will require a complete
re-indexing of all documents across every connected source. This is
taken care of in the background so that the system can continue to
be used, but depending on the size of the corpus, this could take
hours or days. You can monitor the progress of the re-indexing on
this page while the models are being switched.
</Text>
<div className="mt-8 text-sm mr-auto mb-12 divide-x-2 flex">