From 7d78aaed3a2a68943fd60fb17367178a4c599d19 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 1 Jul 2024 15:53:40 -0700
Subject: [PATCH] checkpoint

---
 backend/danswer/configs/app_configs.py    |   7 +
 .../document_index/opensearch/__init__.py |   0
 .../document_index/opensearch/utils.py    | 188 +++++++++++++++++++++
 backend/requirements/default.txt          |   1 +
 4 files changed, 196 insertions(+)
 create mode 100644 backend/danswer/document_index/opensearch/__init__.py
 create mode 100644 backend/danswer/document_index/opensearch/utils.py

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index c40c661b4..eb423973f 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -112,6 +112,13 @@ VESPA_TENANT_PORT = os.environ.get("VESPA_TENANT_PORT") or "19071"
 VESPA_DEPLOYMENT_ZIP = (
     os.environ.get("VESPA_DEPLOYMENT_ZIP") or "/app/danswer/vespa-app.zip"
 )
+
+# OpenSearch
+OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST") or "localhost"
+OPENSEARCH_PORT = os.environ.get("OPENSEARCH_PORT") or "9200"
+OPENSEARCH_USER = os.environ.get("OPENSEARCH_USER") or "admin"
+OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD") or "D@nswer_1ndex"
+
 # Number of documents in a batch during indexing (further batching done by chunks before passing to bi-encoder)
 try:
     INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE", 16))
diff --git a/backend/danswer/document_index/opensearch/__init__.py b/backend/danswer/document_index/opensearch/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/document_index/opensearch/utils.py b/backend/danswer/document_index/opensearch/utils.py
new file mode 100644
index 000000000..dce7900c9
--- /dev/null
+++ b/backend/danswer/document_index/opensearch/utils.py
@@ -0,0 +1,188 @@
+from typing import Any
+
+from opensearchpy import OpenSearch
+
+from danswer.configs.app_configs import OPENSEARCH_HOST
+from danswer.configs.app_configs import OPENSEARCH_PASSWORD
+from danswer.configs.app_configs import OPENSEARCH_PORT
+from danswer.configs.app_configs import OPENSEARCH_USER
+
+
+def get_opensearch_client(
+    host: str = OPENSEARCH_HOST,
+    port: str = OPENSEARCH_PORT,
+    user: str = OPENSEARCH_USER,
+    password: str = OPENSEARCH_PASSWORD,
+) -> OpenSearch:
+    opensearch_client = OpenSearch(
+        hosts=[{"host": host, "port": int(port)}],
+        http_auth=(user, password),
+        use_ssl=True,
+        verify_certs=False,
+        ssl_show_warn=False,
+    )
+    return opensearch_client
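+
+
+# Example usage (sketch): `ping()` is a standard opensearch-py client method
+# and offers a cheap connectivity/credentials check:
+#   client = get_opensearch_client()
+#   assert client.ping()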
+
+
+#####
+# Schema Utils
+#####
+def get_schema_settings(shards: int = 1) -> dict[str, Any]:
+    schema_settings = {"index": {"number_of_shards": shards, "knn": True}}
+    return schema_settings
+
+
+def get_knn_settings(
+    embedding_dim: int, ef_construction: int = 200, m: int = 48
+) -> dict[str, Any]:
+    # TODO explore hyperparameters
+    knn_settings = {
+        "type": "knn_vector",
+        "dimension": embedding_dim,
+        "method": {
+            "name": "hnsw",
+            "space_type": "cosinesimil",
+            "engine": "nmslib",
+            "parameters": {"ef_construction": ef_construction, "m": m},
+        },
+    }
+    return knn_settings
+
+
+def get_chunk_properties(embedding_dim: int) -> dict[str, Any]:
+    chunk_properties = {
+        "type": "nested",
+        "properties": {
+            # In OpenSearch/Elasticsearch, fields are nullable by default
+            "link": {"type": "text", "index": False},
+            # Max token limit for the chunk; marks which granularity level this chunk represents
+            "max_num_tokens": {"type": "integer", "index": False},
+            # Actual number of tokens in the chunk as determined by the tokenizer
+            # This is stored to prevent repeat counting
+            "num_tokens": {"type": "integer", "index": False},
+            # Index of the chunk in the document at the given granularity
+            # For each granularity (that exists for this doc), there will be an index 0 chunk
+            "chunk_index": {"type": "integer", "index": False},
+            "content": {"type": "text"},
+            "embedding": get_knn_settings(embedding_dim=embedding_dim),
+        },
+    }
+    return chunk_properties
+
+
+def get_flat_dict_properties() -> dict[str, Any]:
+    flat_dict_properties = {
+        "type": "nested",
+        "properties": {
+            # Keywords are used for exact match; a key can hold a single term or an "array"
+            # (it is not actually an array, but it behaves the same)
+            "key": {"type": "keyword"},
+            # To store a list of values, the pair simply needs to be indexed multiple times
+            # with the same key and different values
+            "value": {"type": "keyword"},
+        },
+    }
+    return flat_dict_properties
+
+
+def get_danswer_opensearch_schema(embedding_dim: int) -> dict[str, Any]:
+    full_schema = {
+        "settings": get_schema_settings(),
+        "mappings": {
+            "properties": {
+                "document_id": {"type": "keyword"},
+                "semantic_identifier": {"type": "text", "index": False},
+                "title": {"type": "text"},
+                # This is not used for search; all search keywords are stored at the chunk level
+                "content": {"type": "text", "index": False},
+                "title_vector": get_knn_settings(embedding_dim=embedding_dim),
+                "chunks": get_chunk_properties(embedding_dim=embedding_dim),
+                "source_type": {"type": "keyword"},
+                "document_sets": {"type": "keyword"},
+                "access_control_list": {"type": "keyword"},
+                "metadata": get_flat_dict_properties(),
+                "primary_owners": {"type": "keyword"},
+                "secondary_owners": {"type": "keyword"},
+                "last_updated": {"type": "date"},
+                "boost_count": {"type": "integer", "null_value": 0},
+                # Inverted from "hidden" for efficient filtering
+                "not_hidden": {"type": "boolean", "null_value": True},
+            }
+        },
+    }
+    return full_schema
+
+
+#####
+# Query Utils
+#####
+
+
+def get_normalization_search_pipeline_settings(
+    keyword_weighting: float = 0.4,
+    title_vector_boost_weighting: float = 0.1,
+    chunk_vector_weighting: float = 0.5,
+) -> dict[str, Any]:
+    # TODO: Explore hyperparameters
+    # Note: the keyword component is expected to encompass both the title and the chunk
+    # texts, and the title field is expected to be upweighted already by the time it
+    # hits this step. The title is also expected to be included in the chunk text for
+    # vectorizing, so the extra title boost here is ADDITIONAL
+    pipeline_settings = {
+        "description": "Normalization for keyword and vector scores",
+        "phase_results_processors": [
+            {
+                "normalization-processor": {
+                    "normalization": {"technique": "min_max"},
+                    "combination": {
+                        "technique": "arithmetic_mean",
+                        "parameters": {
+                            "weights": [
+                                keyword_weighting,
+                                title_vector_boost_weighting,
+                                chunk_vector_weighting,
+                            ]
+                        },
+                    },
+                }
+            }
+        ],
+    }
+    return pipeline_settings
+
+
+def get_query_base(max_num_results: int) -> dict[str, Any]:
+    query_base = {
+        "size": max_num_results,
+        "query": {"bool": {"must": [], "filter": []}},
+    }
+    return query_base
+
+
+def get_not_hidden_filter() -> dict[str, Any]:
+    not_hidden_filter = {"term": {"not_hidden": True}}
+    return not_hidden_filter
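+
+
+if __name__ == "__main__":
+    # Smoke-test sketch: create the index and register the hybrid-scoring
+    # search pipeline defined above. The index name, pipeline name, and
+    # embedding_dim here are illustrative placeholders only.
+    client = get_opensearch_client()
+    client.indices.create(
+        index="danswer_chunks",
+        body=get_danswer_opensearch_schema(embedding_dim=768),
+    )
+    # Registering via the generic transport call targets the Search
+    # Pipelines REST API directly, independent of client-version helpers
+    client.transport.perform_request(
+        "PUT",
+        "/_search/pipeline/danswer_hybrid_norm",
+        body=get_normalization_search_pipeline_settings(),
+    )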
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 76f4387ae..9df8dad47 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -38,6 +38,7 @@ Office365-REST-Python-Client==2.5.9
 oauthlib==3.2.2
 openai==1.14.3
 openpyxl==3.1.2
+opensearch-py==2.6.0
 playwright==1.41.2
 psutil==5.9.5
 psycopg2-binary==2.9.9