tool to generate vespa schema variations for our cloud (#4556)

* tool to generate vespa schema variations for our cloud

* extraneous assign

* float, not double

* back to double

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer 2025-04-18 13:47:17 -07:00 committed by GitHub
parent 356336a842
commit e5e0944049
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 68 additions and 16 deletions

View File

@ -111,16 +111,16 @@ schema DANSWER_CHUNK_NAME {
indexing: summary | attribute
}
field primary_owners type array<string> {
indexing : summary | attribute
indexing: summary | attribute
}
field secondary_owners type array<string> {
indexing : summary | attribute
indexing: summary | attribute
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
@ -149,7 +149,7 @@ schema DANSWER_CHUNK_NAME {
rank-profile default_rank {
inputs {
query(decay_factor) float
query(decay_factor) double
}
function inline document_boost() {
@ -318,10 +318,4 @@ schema DANSWER_CHUNK_NAME {
expression: bm25(content) + (5 * bm25(title))
}
}
rank-profile random_ {
first-phase {
expression: random.match
}
}
}

View File

@ -133,6 +133,13 @@ def _replace_template_values_in_schema(
)
def _replace_tenant_template_value_in_schema(
    schema_template: str,
    tenant_field: str,
) -> str:
    """Substitute the tenant placeholder in a schema template.

    Args:
        schema_template: Schema text that may contain ``TENANT_ID_PAT``.
        tenant_field: Text inserted in place of every ``TENANT_ID_PAT``
            occurrence; passing an empty string strips the placeholder.

    Returns:
        A new string with all placeholder occurrences replaced.
    """
    rendered_schema = schema_template.replace(TENANT_ID_PAT, tenant_field)
    return rendered_schema
def add_ngrams_to_schema(schema_content: str) -> str:
# Add the match blocks containing gram and gram-size to title and content fields
schema_content = re.sub(
@ -242,17 +249,15 @@ class VespaIndex(DocumentIndex):
with open(schema_file, "r") as schema_f:
schema_template = schema_f.read()
schema_template = schema_template.replace(TENANT_ID_PAT, "")
schema = _replace_tenant_template_value_in_schema(schema_template, "")
schema = _replace_template_values_in_schema(
schema_template,
schema,
self.index_name,
primary_embedding_dim,
primary_embedding_precision,
)
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
schema = schema.replace(TENANT_ID_PAT, "")
zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")
if self.secondary_index_name:
@ -352,9 +357,14 @@ class VespaIndex(DocumentIndex):
schema = _replace_template_values_in_schema(
schema_template, index_name, embedding_dim, embedding_precision
)
schema = schema.replace(
TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
tenant_id_replacement = ""
if MULTI_TENANT:
tenant_id_replacement = TENANT_ID_REPLACEMENT
schema = _replace_tenant_template_value_in_schema(
schema, tenant_id_replacement
)
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")

View File

@ -0,0 +1,48 @@
"""Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""
import argparse
from onyx.db.enums import EmbeddingPrecision
from onyx.document_index.vespa.index import _replace_template_values_in_schema
from onyx.document_index.vespa.index import _replace_tenant_template_value_in_schema
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
from onyx.utils.logger import setup_logger
from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS
logger = setup_logger()
def write_schema(index_name: str, dim: int, template: str) -> None:
    """Render ``template`` for a single index and save it as ``<index_name>.sd``.

    The tenant placeholder is filled with ``TENANT_ID_REPLACEMENT`` first;
    the remaining template values are then filled in from ``index_name``,
    ``dim`` and ``EmbeddingPrecision.FLOAT``. The output path is just the
    file name, so it is resolved by ``open`` as given.

    Args:
        index_name: Name of the index; also used as the output file stem.
        dim: Embedding dimension handed to the template renderer.
        template: Raw schema template text.
    """
    output_path = index_name + ".sd"

    # Step 1: tenant placeholder. Step 2: every other template value.
    with_tenant = _replace_tenant_template_value_in_schema(
        template, TENANT_ID_REPLACEMENT
    )
    rendered = _replace_template_values_in_schema(
        with_tenant, index_name, dim, EmbeddingPrecision.FLOAT
    )

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)

    logger.info(f"Wrote {output_path}")
def main() -> None:
    """Command-line entry point: write schema files for every supported model.

    Reads the template named by the required ``--template`` argument, then
    for each entry in ``SUPPORTED_EMBEDDING_MODELS`` writes two schemas via
    ``write_schema``: one under the model's index name and one under that
    name suffixed with ``__danswer_alt_index``. Logs the total count at the
    end.
    """
    arg_parser = argparse.ArgumentParser(
        description="Generate multi tenant Vespa schemas"
    )
    arg_parser.add_argument(
        "--template", help="The schema template to use", required=True
    )
    cli_args = arg_parser.parse_args()

    with open(cli_args.template, "r", encoding="utf-8") as template_file:
        template_str = template_file.read()

    written = 0
    for model in SUPPORTED_EMBEDDING_MODELS:
        # Primary index name first, then its alternate-index counterpart.
        index_names = (
            model.index_name,
            model.index_name + "__danswer_alt_index",
        )
        for name in index_names:
            write_schema(name, model.dim, template_str)
            written += 1

    logger.info(f"Wrote {written} indexes.")
if __name__ == "__main__":
main()