From 7d78aaed3a2a68943fd60fb17367178a4c599d19 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 1 Jul 2024 15:53:40 -0700
Subject: [PATCH] checkpoint

---
 backend/danswer/configs/app_configs.py    |   7 +
 .../document_index/opensearch/__init__.py |   0
 .../document_index/opensearch/utils.py    | 188 +++++++++++++++++++++
 backend/requirements/default.txt          |   1 +
 4 files changed, 196 insertions(+)
 create mode 100644 backend/danswer/document_index/opensearch/__init__.py
 create mode 100644 backend/danswer/document_index/opensearch/utils.py

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index c40c661b4..eb423973f 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -112,6 +112,13 @@ VESPA_TENANT_PORT = os.environ.get("VESPA_TENANT_PORT") or "19071"
 VESPA_DEPLOYMENT_ZIP = (
     os.environ.get("VESPA_DEPLOYMENT_ZIP") or "/app/danswer/vespa-app.zip"
 )
+
+# OpenSearch
+OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST") or "localhost"
+OPENSEARCH_PORT = os.environ.get("OPENSEARCH_PORT") or "9200"
+OPENSEARCH_USER = os.environ.get("OPENSEARCH_USER") or "admin"
+OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD") or "D@nswer_1ndex"
+
 # Number of documents in a batch during indexing (further batching done by chunks before passing to bi-encoder)
 try:
     INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE", 16))
diff --git a/backend/danswer/document_index/opensearch/__init__.py b/backend/danswer/document_index/opensearch/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/document_index/opensearch/utils.py b/backend/danswer/document_index/opensearch/utils.py
new file mode 100644
index 000000000..dce7900c9
--- /dev/null
+++ b/backend/danswer/document_index/opensearch/utils.py
@@ -0,0 +1,188 @@
+from typing import Any
+
+from opensearchpy import OpenSearch
+
+from danswer.configs.app_configs import OPENSEARCH_HOST
+from danswer.configs.app_configs import OPENSEARCH_PASSWORD
+from danswer.configs.app_configs import OPENSEARCH_PORT
+from danswer.configs.app_configs import OPENSEARCH_USER
+
+
+def get_opensearch_client(
+    host: str = OPENSEARCH_HOST,
+    port: str = OPENSEARCH_PORT,
+    user: str = OPENSEARCH_USER,
+    password: str = OPENSEARCH_PASSWORD,
+) -> OpenSearch:
+    opensearch_client = OpenSearch(
+        hosts=[{"host": host, "port": int(port)}],
+        http_auth=(user, password),
+        use_ssl=True,
+        verify_certs=False,
+        ssl_show_warn=False,
+    )
+    return opensearch_client
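+
+
+# Example usage (sketch): `ping()` is a standard opensearch-py client method
+# and offers a cheap connectivity/credentials check:
+#   client = get_opensearch_client()
+#   assert client.ping()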
+
+
+#####
+# Schema Utils
+#####
+def get_schema_settings(shards: int = 1) -> dict[str, Any]:
+    schema_settings = {"index": {"number_of_shards": shards, "knn": True}}
+    return schema_settings
+
+
+def get_knn_settings(
+    embedding_dim: int, ef_construction: int = 200, m: int = 48
+) -> dict[str, Any]:
+    # TODO explore hyperparameters
+    knn_settings = {
+        "type": "knn_vector",
+        "dimension": embedding_dim,
+        "method": {
+            "name": "hnsw",
+            "space_type": "cosinesimil",
+            "engine": "nmslib",
+            "parameters": {"ef_construction": ef_construction, "m": m},
+        },
+    }
+    return knn_settings
+
+
+def get_chunk_properties(embedding_dim: int) -> dict[str, Any]:
+    chunk_properties = {
+        "type": "nested",
+        "properties": {
+            # In OpenSearch/Elasticsearch, fields are nullable by default
+            "link": {"type": "text", "index": False},
+            # Max token limit for the chunk; marks which granularity level this chunk represents
+            "max_num_tokens": {"type": "integer", "index": False},
+            # Actual number of tokens in the chunk as determined by the tokenizer
+            # This is stored to prevent repeat counting
+            "num_tokens": {"type": "integer", "index": False},
+            # Index of the chunk in the document at the given granularity
+            # For each granularity (that exists for this doc), there will be an index 0 chunk
+            "chunk_index": {"type": "integer", "index": False},
+            "content": {"type": "text"},
+            "embedding": get_knn_settings(embedding_dim=embedding_dim),
+        },
+    }
+    return chunk_properties
+
+
+def get_flat_dict_properties() -> dict[str, Any]:
+    flat_dict_properties = {
+        "type": "nested",
+        "properties": {
+            # Keywords are used for exact match; a key can hold a single term or an "array"
+            # (it is not actually an array, but it behaves the same)
+            "key": {"type": "keyword"},
+            # To store a list of values, the pair simply needs to be indexed multiple times
+            # with the same key and different values
+            "value": {"type": "keyword"},
+        },
+    }
+    return flat_dict_properties
+
+
+def get_danswer_opensearch_schema(embedding_dim: int) -> dict[str, Any]:
+    full_schema = {
+        "settings": get_schema_settings(),
+        "mappings": {
+            "properties": {
+                "document_id": {"type": "keyword"},
+                "semantic_identifier": {"type": "text", "index": False},
+                "title": {"type": "text"},
+                # This is not used for search; all search keywords are stored at the chunk level
+                "content": {"type": "text", "index": False},
+                "title_vector": get_knn_settings(embedding_dim=embedding_dim),
+                "chunks": get_chunk_properties(embedding_dim=embedding_dim),
+                "source_type": {"type": "keyword"},
+                "document_sets": {"type": "keyword"},
+                "access_control_list": {"type": "keyword"},
+                "metadata": get_flat_dict_properties(),
+                "primary_owners": {"type": "keyword"},
+                "secondary_owners": {"type": "keyword"},
+                "last_updated": {"type": "date"},
+                "boost_count": {"type": "integer", "null_value": 0},
+                # Inverted from "hidden" for efficient filtering
+                "not_hidden": {"type": "boolean", "null_value": True},
+            }
+        },
+    }
+    return full_schema
+
+
+#####
+# Query Utils
+#####
+
+
+def get_normalization_search_pipeline_settings(
+    keyword_weighting: float = 0.4,
+    title_vector_boost_weighting: float = 0.1,
+    chunk_vector_weighting: float = 0.5,
+) -> dict[str, Any]:
+    # TODO: Explore hyperparameters
+    # Note: the keyword component is expected to encompass both the title and the chunk
+    # texts, and the title field is expected to be upweighted already by the time it
+    # hits this step. The title is also expected to be included in the chunk text for
+    # vectorizing, so the extra title boost here is ADDITIONAL
+    pipeline_settings = {
+        "description": "Normalization for keyword and vector scores",
+        "phase_results_processors": [
+            {
+                "normalization-processor": {
+                    "normalization": {"technique": "min_max"},
+                    "combination": {
+                        "technique": "arithmetic_mean",
+                        "parameters": {
+                            "weights": [
+                                keyword_weighting,
+                                title_vector_boost_weighting,
+                                chunk_vector_weighting,
+                            ]
+                        },
+                    },
+                }
+            }
+        ],
+    }
+    return pipeline_settings
+
+
+def get_query_base(max_num_results: int) -> dict[str, Any]:
+    query_base = {
+        "size": max_num_results,
+        "query": {"bool": {"must": [], "filter": []}},
+    }
+    return query_base
+
+
+def get_not_hidden_filter() -> dict[str, Any]:
+    not_hidden_filter = {"term": {"not_hidden": True}}
+    return not_hidden_filter
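+
+
+if __name__ == "__main__":
+    # Smoke-test sketch: create the index and register the hybrid-scoring
+    # search pipeline defined above. The index name, pipeline name, and
+    # embedding_dim here are illustrative placeholders only.
+    client = get_opensearch_client()
+    client.indices.create(
+        index="danswer_chunks",
+        body=get_danswer_opensearch_schema(embedding_dim=768),
+    )
+    # Registering via the generic transport call targets the Search
+    # Pipelines REST API directly, independent of client-version helpers
+    client.transport.perform_request(
+        "PUT",
+        "/_search/pipeline/danswer_hybrid_norm",
+        body=get_normalization_search_pipeline_settings(),
+    )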
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 76f4387ae..9df8dad47 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -38,6 +38,7 @@ Office365-REST-Python-Client==2.5.9
 oauthlib==3.2.2
 openai==1.14.3
 openpyxl==3.1.2
+opensearch-py==2.6.0
 playwright==1.41.2
 psutil==5.9.5
 psycopg2-binary==2.9.9