tool to generate vespa schema variations for our cloud (#4556)

* tool to generate vespa schema variations for our cloud
* extraneous assign
* float, not double
* back to double

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-05-30 17:50:27 +02:00 · 2025-04-18 13:47:17 -07:00 · 2025-04-18 13:47:17 -07:00 · e5e0944049
commit e5e0944049
parent 356336a842
3 changed files with 68 additions and 16 deletions
--- a/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -111,16 +111,16 @@ schema DANSWER_CHUNK_NAME {
            indexing: summary | attribute
        }
        field primary_owners type array<string> {
-            indexing : summary | attribute
+            indexing: summary | attribute
        }
        field secondary_owners type array<string> {
-            indexing : summary | attribute
+            indexing: summary | attribute
        }
        field access_control_list type weightedset<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
-        } 
+        }
        field document_sets type weightedset<string> {
            indexing: summary | attribute
            rank: filter
@@ -149,7 +149,7 @@ schema DANSWER_CHUNK_NAME {

    rank-profile default_rank {
        inputs {
-            query(decay_factor) float
+            query(decay_factor) double
        }

        function inline document_boost() {
@@ -318,10 +318,4 @@ schema DANSWER_CHUNK_NAME {
            expression: bm25(content) + (5 * bm25(title))
        }
    }
-
-    rank-profile random_ {
-        first-phase {
-            expression: random.match
-        }
-    }
 }
--- a/backend/onyx/document_index/vespa/index.py
+++ b/backend/onyx/document_index/vespa/index.py
@@ -133,6 +133,13 @@ def _replace_template_values_in_schema(
    )


+def _replace_tenant_template_value_in_schema(
+    schema_template: str,
+    tenant_field: str,
+) -> str:
+    return schema_template.replace(TENANT_ID_PAT, tenant_field)
+
+
 def add_ngrams_to_schema(schema_content: str) -> str:
    # Add the match blocks containing gram and gram-size to title and content fields
    schema_content = re.sub(
@@ -242,17 +249,15 @@ class VespaIndex(DocumentIndex):

        with open(schema_file, "r") as schema_f:
            schema_template = schema_f.read()
-        schema_template = schema_template.replace(TENANT_ID_PAT, "")
-
+        schema = _replace_tenant_template_value_in_schema(schema_template, "")
        schema = _replace_template_values_in_schema(
-            schema_template,
+            schema,
            self.index_name,
            primary_embedding_dim,
            primary_embedding_precision,
        )

        schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
-        schema = schema.replace(TENANT_ID_PAT, "")
        zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")

        if self.secondary_index_name:
@@ -352,9 +357,14 @@ class VespaIndex(DocumentIndex):
            schema = _replace_template_values_in_schema(
                schema_template, index_name, embedding_dim, embedding_precision
            )
-            schema = schema.replace(
-                TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
+
+            tenant_id_replacement = ""
+            if MULTI_TENANT:
+                tenant_id_replacement = TENANT_ID_REPLACEMENT
+            schema = _replace_tenant_template_value_in_schema(
+                schema, tenant_id_replacement
            )
+
            schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
            zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")

--- a/backend/scripts/debugging/onyx_vespa_schemas.py
+++ b/backend/scripts/debugging/onyx_vespa_schemas.py
@@ -0,0 +1,48 @@
+"""Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""
+
+import argparse
+
+from onyx.db.enums import EmbeddingPrecision
+from onyx.document_index.vespa.index import _replace_template_values_in_schema
+from onyx.document_index.vespa.index import _replace_tenant_template_value_in_schema
+from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
+from onyx.utils.logger import setup_logger
+from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS
+
+logger = setup_logger()
+
+
+def write_schema(index_name: str, dim: int, template: str) -> None:
+    index_filename = index_name + ".sd"
+    index_rendered_str = _replace_tenant_template_value_in_schema(
+        template, TENANT_ID_REPLACEMENT
+    )
+    index_rendered_str = _replace_template_values_in_schema(
+        index_rendered_str, index_name, dim, EmbeddingPrecision.FLOAT
+    )
+
+    with open(index_filename, "w", encoding="utf-8") as f:
+        f.write(index_rendered_str)
+
+    logger.info(f"Wrote {index_filename}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate multi tenant Vespa schemas")
+    parser.add_argument("--template", help="The schema template to use", required=True)
+    args = parser.parse_args()
+
+    with open(args.template, "r", encoding="utf-8") as f:
+        template_str = f.read()
+
+    num_indexes = 0
+    for model in SUPPORTED_EMBEDDING_MODELS:
+        write_schema(model.index_name, model.dim, template_str)
+        write_schema(model.index_name + "__danswer_alt_index", model.dim, template_str)
+        num_indexes += 2
+
+    logger.info(f"Wrote {num_indexes} indexes.")
+
+
+if __name__ == "__main__":
+    main()