DAN-81 Improve search round 2 (#82)

Includes: - Multi vector indexing/search - Ensemble model reranking - Keyword Search backend
2025-09-27 12:29:41 +02:00 · 2023-06-04 20:02:32 -07:00
parent 7cc64efc3a
commit c4e8afe4d2
35 changed files with 1223 additions and 863 deletions
--- a/backend/scripts/reset_indexes.py
+++ b/backend/scripts/reset_indexes.py
@@ -0,0 +1,39 @@
+# This file is purely for development use, not included in any builds
+from danswer.configs.model_configs import DOC_EMBEDDING_DIM
+from danswer.datastores.typesense.store import create_typesense_collection
+from danswer.utils.clients import get_qdrant_client
+from danswer.utils.clients import get_typesense_client
+from danswer.utils.logging import setup_logger
+from qdrant_client.http.models import Distance
+from qdrant_client.http.models import VectorParams
+from typesense.exceptions import ObjectNotFound  # type: ignore
+
+logger = setup_logger()
+
+
+def recreate_qdrant_collection(
+    collection_name: str, embedding_dim: int = DOC_EMBEDDING_DIM
+) -> None:
+    """Recreate the Qdrant collection `collection_name` via the shared client.
+
+    The collection is configured with vectors of size `embedding_dim`
+    (defaults to DOC_EMBEDDING_DIM) compared by cosine distance.
+
+    NOTE(review): `recreate_collection` presumably drops any existing
+    collection with the same name before creating it - confirm against the
+    qdrant-client documentation before running on data you want to keep.
+
+    Raises:
+        RuntimeError: if the client call returns a falsy result.
+    """
+    logger.info(f"Attempting to recreate Qdrant collection {collection_name}")
+    result = get_qdrant_client().recreate_collection(
+        collection_name=collection_name,
+        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
+    )
+    # A falsy result is treated as a failed (re)creation.
+    if not result:
+        raise RuntimeError("Could not create Qdrant collection")
+
+
+def recreate_typesense_collection(collection_name: str) -> None:
+    """Delete the Typesense collection `collection_name` if present, then create it.
+
+    A missing collection is not an error: ObjectNotFound from the delete call
+    is caught and logged at debug level. Creation is delegated to
+    `create_typesense_collection`; any other exception propagates to the caller.
+    """
+    logger.info(f"Attempting to recreate Typesense collection {collection_name}")
+    ts_client = get_typesense_client()
+    try:
+        ts_client.collections[collection_name].delete()
+    except ObjectNotFound:
+        # Nothing to delete; fall through and create the collection.
+        logger.debug(f"Collection {collection_name} does not already exist")
+
+    create_typesense_collection(collection_name)
+
+
+if __name__ == "__main__":
+    # Reset the Qdrant collection and the Typesense collection, both of which
+    # are addressed here by the hard-coded name "danswer_index".
+    recreate_qdrant_collection("danswer_index")
+    recreate_typesense_collection("danswer_index")
--- a/backend/scripts/simulate_frontend.py
+++ b/backend/scripts/simulate_frontend.py
@@ -1,13 +1,12 @@
+# This file is purely for development use, not included in any builds
 import argparse
 import json
 import urllib
+from pprint import pprint

 import requests
 from danswer.configs.app_configs import APP_PORT
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
-from danswer.configs.constants import BLURB
-from danswer.configs.constants import SEMANTIC_IDENTIFIER
-from danswer.configs.constants import SOURCE_LINK
 from danswer.configs.constants import SOURCE_TYPE


@@ -16,35 +15,44 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
-        "-k",
-        "--keyword-search",
-        action="store_true",
-        help="Use keyword search if set, semantic search otherwise",
+        "-f",
+        "--flow",
+        type=str,
+        default="QA",
+        help='"Search" or "QA", defaults to "QA"',
    )

    parser.add_argument(
        "-t",
-        "--source-types",
+        "--type",
        type=str,
-        help="Comma separated list of source types to filter by",
+        default="Semantic",
+        help='"Semantic" or "Keyword", defaults to "Semantic"',
    )

    parser.add_argument(
        "-s",
        "--stream",
        action="store_true",
-        help="Enable streaming response",
+        help='Enable streaming response, only for flow="QA"',
+    )
+
+    parser.add_argument(
+        "--filters",
+        type=str,
+        help="Comma separated list of source types to filter by (no spaces)",
    )

    parser.add_argument("query", nargs="*", help="The query to process")

+    previous_input = None
    while True:
        try:
            user_input = input(
                "\n\nAsk any question:\n"
-                "  - prefix with -t to add a filter by source type(s)\n"
-                "  - prefix with -s to stream answer\n"
-                "  - input an empty string to rerun last query\n\t"
+                "  - Use -f (QA/Search) and -t (Semantic/Keyword) flags to set endpoint.\n"
+                "  - prefix with -s to stream answer, --filters web,slack etc. for filters.\n"
+                "  - input an empty string to rerun last query.\n\t"
            )

            if user_input:
@@ -58,62 +66,51 @@ if __name__ == "__main__":

            args = parser.parse_args(user_input.split())

-            keyword_search = args.keyword_search
-            source_types = args.source_types.split(",") if args.source_types else None
+            flow = str(args.flow).lower()
+            flow_type = str(args.type).lower()
+            stream = args.stream
+            source_types = args.filters.split(",") if args.filters else None
            if source_types and len(source_types) == 1:
                source_types = source_types[0]
            query = " ".join(args.query)

-            endpoint = (
-                f"http://127.0.0.1:{APP_PORT}/direct-qa"
-                if not args.stream
-                else f"http://127.0.0.1:{APP_PORT}/stream-direct-qa"
-            )
-            if args.keyword_search:
-                endpoint = f"http://127.0.0.1:{APP_PORT}/keyword-search"
-                raise NotImplementedError("keyword search is not supported for now")
+            if flow not in ["qa", "search"]:
+                raise ValueError("Flow value must be QA or Search")
+            if flow_type not in ["keyword", "semantic"]:
+                raise ValueError("Type value must be keyword or semantic")
+            if flow != "qa" and stream:
+                raise ValueError("Can only stream results for QA")
+
+            if (flow, flow_type) == ("search", "keyword"):
+                path = "keyword-search"
+            elif (flow, flow_type) == ("search", "semantic"):
+                path = "semantic-search"
+            elif stream:
+                path = "stream-direct-qa"
+            else:
+                path = "direct-qa"
+
+            endpoint = f"http://127.0.0.1:{APP_PORT}/{path}"

            query_json = {
                "query": query,
                "collection": QDRANT_DEFAULT_COLLECTION,
-                "filters": [{SOURCE_TYPE: source_types}],
+                "use_keyword": flow_type == "keyword",  # Ignore if not QA Endpoints
+                "filters": json.dumps([{SOURCE_TYPE: source_types}]),
            }
-            if not args.stream:
-                response = requests.get(
-                    endpoint, params=urllib.parse.urlencode(query_json)
-                )
-                contents = json.loads(response.content)
-                if keyword_search:
-                    if contents["results"]:
-                        for link in contents["results"]:
-                            print(link)
-                    else:
-                        print("No matches found")
-                else:
-                    answer = contents.get("answer")
-                    if answer:
-                        print("Answer: " + answer)
-                    else:
-                        print("Answer: ?")
-                    if contents.get("quotes"):
-                        for ind, (quote, quote_info) in enumerate(
-                            contents["quotes"].items()
-                        ):
-                            print(f"Quote {str(ind + 1)}:\n{quote}")
-                            print(
-                                f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}"
-                            )
-                            print(f"Blurb: {quote_info[BLURB]}")
-                            print(f"Link: {quote_info[SOURCE_LINK]}")
-                            print(f"Source: {quote_info[SOURCE_TYPE]}")
-                    else:
-                        print("No quotes found")
-            else:
+
+            if args.stream:
                with requests.get(
                    endpoint, params=urllib.parse.urlencode(query_json), stream=True
                ) as r:
                    for json_response in r.iter_lines():
-                        print(json.loads(json_response.decode()))
+                        pprint(json.loads(json_response.decode()))
+            else:
+                response = requests.get(
+                    endpoint, params=urllib.parse.urlencode(query_json)
+                )
+                contents = json.loads(response.content)
+                pprint(contents)

        except Exception as e:
            print(f"Failed due to {e}, retrying")