From f2d3d8269d50f9d3d749a227a09ffb9a33120cce Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Sat, 29 Apr 2023 15:22:16 -0700
Subject: [PATCH] Prompt Tuning and minor QOA changes (#2)

---
 backend/danswer/direct_qa/qa_prompts.py      |  6 +-
 backend/danswer/direct_qa/semantic_search.py | 25 ++++--
 backend/danswer/utils/text_processing.py     | 17 ++--
 backend/scripts/ingestion.py                 |  4 +-
 backend/scripts/simulate_frontend.py         | 91 +++++++++++---------
 5 files changed, 81 insertions(+), 62 deletions(-)

diff --git a/backend/danswer/direct_qa/qa_prompts.py b/backend/danswer/direct_qa/qa_prompts.py
index 7fbd90898..2a36624b2 100644
--- a/backend/danswer/direct_qa/qa_prompts.py
+++ b/backend/danswer/direct_qa/qa_prompts.py
@@ -7,9 +7,9 @@ QUOTE_PAT = "Quote:"
 
 def generic_prompt_processor(question: str, documents: list[str]) -> str:
     prompt = (
-        f"Answer the query based on the documents below and quote the documents sections containing "
-        f'the answer. Respond with one "{ANSWER_PAT}" section and one or more "{QUOTE_PAT}" sections. '
-        f"For each quote, only include text exactly from the documents, don't include the source. "
+        f"Answer the query based on the documents below and quote the documents segments containing the answer. "
+        f'Respond with one "{ANSWER_PAT}" section and as many "{QUOTE_PAT}" sections as is relevant. '
+        f'Start each quote with "{QUOTE_PAT}". Each quote should be a single continuous segment from a document. '
         f'If the query cannot be answered based on the documents, say "{UNCERTAINTY_PAT}". '
         f'Each document is prefixed with "{DOC_SEP_PAT}".\n\n'
     )
diff --git a/backend/danswer/direct_qa/semantic_search.py b/backend/danswer/direct_qa/semantic_search.py
index f630121dc..689d6a524 100644
--- a/backend/danswer/direct_qa/semantic_search.py
+++ b/backend/danswer/direct_qa/semantic_search.py
@@ -13,6 +13,8 @@ from danswer.configs.model_configs import QUERY_EMBEDDING_CONTEXT_SIZE
 from danswer.utils.clients import get_qdrant_client
 from danswer.utils.logging import setup_logger
 from danswer.utils.timing import build_timing_wrapper
+from qdrant_client.http.exceptions import ResponseHandlingException
+from qdrant_client.http.exceptions import UnexpectedResponse
 from sentence_transformers import CrossEncoder  # type: ignore
 from sentence_transformers import SentenceTransformer  # type: ignore
 
@@ -43,14 +45,21 @@ def semantic_retrival(
         )["data"][0]["embedding"]
     else:
         query_embedding = embedding_model.encode(query)
-    hits = get_qdrant_client().search(
-        collection_name=qdrant_collection,
-        query_vector=query_embedding
-        if isinstance(query_embedding, list)
-        else query_embedding.tolist(),
-        query_filter=None,
-        limit=num_hits,
-    )
+    try:
+        hits = get_qdrant_client().search(
+            collection_name=qdrant_collection,
+            query_vector=query_embedding
+            if isinstance(query_embedding, list)
+            else query_embedding.tolist(),
+            query_filter=None,
+            limit=num_hits,
+        )
+    except ResponseHandlingException as e:
+        logger.exception(f'Qdrant querying failed due to: "{e}", is Qdrant set up?')
+    except UnexpectedResponse as e:
+        logger.exception(
+            f'Qdrant querying failed due to: "{e}", has ingestion been run?'
+        )
 
     retrieved_chunks = []
     for hit in hits:
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index 86ce262dc..685d073ae 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -11,9 +11,14 @@ def clean_model_quote(quote: str, trim_length: int) -> str:
 
 def shared_precompare_cleanup(text: str) -> str:
     text = text.lower()
-    text = "".join(
-        text.split()
-    )  # GPT models like to return cleaner spacing, not good for quote matching
-    return text.replace(
-        "*", ""
-    )  # GPT models sometimes like to cleanup bulletpoints represented by *
+
+    # GPT models like to return cleaner spacing, not good for quote matching
+    text = "".join(text.split())
+
+    # GPT models sometimes like to clean up bulletpoints represented by *
+    text = text.replace("*", "")
+
+    # GPT models sometimes like to edit the quoting, ie "Title: Contents" becomes Title: "Contents"
+    text = text.replace('"', "")
+
+    return text
diff --git a/backend/scripts/ingestion.py b/backend/scripts/ingestion.py
index bbcde17d6..6db5faf80 100644
--- a/backend/scripts/ingestion.py
+++ b/backend/scripts/ingestion.py
@@ -109,6 +109,6 @@ if __name__ == "__main__":
     if args.rebuild_index:
         recreate_collection(args.qdrant_collection)
 
-    #load_slack_batch(args.slack_export_dir, args.qdrant_collection)
+    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
     load_web_batch(args.website_url, args.qdrant_collection)
-    #load_google_drive_batch(args.qdrant_collection)
+    # load_google_drive_batch(args.qdrant_collection)
diff --git a/backend/scripts/simulate_frontend.py b/backend/scripts/simulate_frontend.py
index 438fadb5b..bc28ae5fb 100644
--- a/backend/scripts/simulate_frontend.py
+++ b/backend/scripts/simulate_frontend.py
@@ -9,50 +9,55 @@ from danswer.configs.constants import SOURCE_TYPE
 if __name__ == "__main__":
     previous_query = None
     while True:
-        keyword_search = False
-        query = input(
-            "\n\nAsk any question:\n - prefix with -k for keyword search\n - input an empty string to "
-            "rerun last query\n\t"
-        )
+        try:
+            keyword_search = False
+            query = input(
+                "\n\nAsk any question:\n - prefix with -k for keyword search\n - input an empty string to "
+                "rerun last query\n\t"
+            )
 
-        if query.lower() in ["q", "quit", "exit", "exit()"]:
-            break
+            if query.lower() in ["q", "quit", "exit", "exit()"]:
+                break
 
-        if query:
-            previous_query = query
-        else:
-            if not previous_query:
-                print("No previous query")
-                continue
-            print(f"Re-executing previous question:\n\t{previous_query}")
-            query = previous_query
-
-        endpoint = f"http://127.0.0.1:{APP_PORT}/direct-qa"
-        if query.startswith("-k "):
-            keyword_search = True
-            query = query[2:]
-            endpoint = f"http://127.0.0.1:{APP_PORT}/keyword-search"
-
-        response = requests.post(
-            endpoint, json={"query": query, "collection": QDRANT_DEFAULT_COLLECTION}
-        )
-        contents = json.loads(response.content)
-        if keyword_search:
-            if contents["results"]:
-                for link in contents["results"]:
-                    print(link)
+            if query:
+                previous_query = query
             else:
-                print("No matches found")
-        else:
-            answer = contents.get("answer")
-            if answer:
-                print("Answer: " + answer)
+                if not previous_query:
+                    print("No previous query")
+                    continue
+                print(f"Re-executing previous question:\n\t{previous_query}")
+                query = previous_query
+
+            endpoint = f"http://127.0.0.1:{APP_PORT}/direct-qa"
+            if query.startswith("-k "):
+                keyword_search = True
+                query = query[2:]
+                endpoint = f"http://127.0.0.1:{APP_PORT}/keyword-search"
+
+            response = requests.post(
+                endpoint, json={"query": query, "collection": QDRANT_DEFAULT_COLLECTION}
+            )
+            contents = json.loads(response.content)
+            if keyword_search:
+                if contents["results"]:
+                    for link in contents["results"]:
+                        print(link)
+                else:
+                    print("No matches found")
             else:
-                print("Answer: ?")
-            if contents.get("quotes"):
-                for ind, (quote, quote_info) in enumerate(contents["quotes"].items()):
-                    print(f"Quote {str(ind)}:\n{quote}")
-                    print(f"Link: {quote_info['link']}")
-                    print(f"Source: {quote_info[SOURCE_TYPE]}")
-            else:
-                print("No quotes found")
+                answer = contents.get("answer")
+                if answer:
+                    print("Answer: " + answer)
+                else:
+                    print("Answer: ?")
+                if contents.get("quotes"):
+                    for ind, (quote, quote_info) in enumerate(
+                        contents["quotes"].items()
+                    ):
+                        print(f"Quote {str(ind + 1)}:\n{quote}")
+                        print(f"Link: {quote_info['link']}")
+                        print(f"Source: {quote_info[SOURCE_TYPE]}")
+                else:
+                    print("No quotes found")
+        except Exception as e:
+            print(f"Failed due to {e}, retrying")