From b5fc2a5775e2357b2a5851df0f9ef3b60f5b5633 Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Mon, 11 Sep 2023 19:06:01 -0700
Subject: [PATCH] Regression Test (#434)

---
 backend/danswer/direct_qa/qa_block.py         |   7 +-
 backend/danswer/utils/text_processing.py      |   4 +
 backend/requirements/dev.txt                  |   1 +
 .../answer_quality/eval_direct_qa.py          | 117 ++++++++++++++++++
 .../answer_quality/sample_questions.yaml      |  96 ++++++++++++++
 5 files changed, 223 insertions(+), 2 deletions(-)
 create mode 100644 backend/tests/regression/answer_quality/eval_direct_qa.py
 create mode 100644 backend/tests/regression/answer_quality/sample_questions.yaml

diff --git a/backend/danswer/direct_qa/qa_block.py b/backend/danswer/direct_qa/qa_block.py
index dcc6e6fc59..fd3fe45f5f 100644
--- a/backend/danswer/direct_qa/qa_block.py
+++ b/backend/danswer/direct_qa/qa_block.py
@@ -30,6 +30,7 @@ from danswer.llm.llm import LLM
 from danswer.llm.utils import dict_based_prompt_to_langchain_prompt
 from danswer.llm.utils import str_prompt_to_langchain_prompt
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_up_code_blocks
 from danswer.utils.text_processing import escape_newlines
 
 logger = setup_logger()
@@ -155,12 +156,14 @@ class SingleMessageScratchpadHandler(QAHandler):
     ) -> tuple[DanswerAnswer, DanswerQuotes]:
         logger.debug(model_output)
 
-        answer_start = model_output.find('{"answer":')
+        model_clean = clean_up_code_blocks(model_output)
+
+        answer_start = model_clean.find('{"answer":')
         # Only found thoughts, no final answer
         if answer_start == -1:
             return DanswerAnswer(answer=None), DanswerQuotes(quotes=[])
 
-        final_json = escape_newlines(model_output[answer_start:])
+        final_json = escape_newlines(model_clean[answer_start:])
 
         return process_answer(
             final_json, context_chunks, is_json_prompt=self.is_json_output
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index fbd4885121..696f13391e 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -8,6 +8,10 @@ def escape_newlines(s: str) -> str:
     return re.sub(r"(?<!\\)\n", "\\\\n", s)
 
 
+def clean_up_code_blocks(model_out_raw: str) -> str:
+    return model_out_raw.strip().strip("`").strip()  # strip() takes a set of chars
+
+
 def clean_model_quote(quote: str, trim_length: int) -> str:
     quote_clean = quote.strip()
     if quote_clean[0] == '"':
diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt
index d5755e63d0..45d0f1f736 100644
--- a/backend/requirements/dev.txt
+++ b/backend/requirements/dev.txt
@@ -4,6 +4,7 @@ mypy==1.1.1
 pre-commit==3.2.2
 reorder-python-imports==3.9.0
 ruff==0.0.286
+types-PyYAML==6.0.12.11
 types-beautifulsoup4==4.12.0.3
 types-html5lib==1.1.11.13
 types-oauthlib==3.2.0.9
diff --git a/backend/tests/regression/answer_quality/eval_direct_qa.py b/backend/tests/regression/answer_quality/eval_direct_qa.py
new file mode 100644
index 0000000000..0ed9813091
--- /dev/null
+++ b/backend/tests/regression/answer_quality/eval_direct_qa.py
@@ -0,0 +1,117 @@
+import argparse
+from datetime import datetime
+
+import yaml
+from sqlalchemy.orm import Session
+
+from danswer.db.engine import get_sqlalchemy_engine
+from danswer.direct_qa.answer_question import answer_qa_query
+from danswer.server.models import QuestionRequest
+
+
+engine = get_sqlalchemy_engine()  # bound to the Session opened under __main__
+
+
+def load_yaml(filepath: str) -> dict:
+    """Parse the UTF-8 YAML file at `filepath` with the safe loader."""
+    with open(filepath, "r", encoding="utf-8") as file:
+        return yaml.safe_load(file)
+
+
+def word_wrap(s: str, max_line_size: int = 120) -> str:
+    """Re-flow `s` into lines of at most `max_line_size` characters.
+
+    Whitespace runs (newlines included) collapse to single spaces. A word longer
+    than the limit is never split; it is placed on a line of its own.
+    """
+    wrapped: list[str] = []
+    pending: list[str] = []
+    pending_chars = 0  # characters in `pending`, not counting joining spaces
+
+    def flush() -> None:
+        nonlocal pending, pending_chars
+        if pending:
+            wrapped.append(" ".join(pending))
+        pending, pending_chars = [], 0
+
+    for token in s.split():
+        if len(token) > max_line_size:
+            flush()
+            wrapped.append(token)
+            continue
+        # len(pending) is the number of spaces the line needs once token joins it
+        if pending_chars + len(token) + len(pending) > max_line_size:
+            flush()
+        pending.append(token)
+        pending_chars += len(token)
+    flush()
+    return "\n".join(wrapped)
+
+
+def get_answer_for_question(
+    query: str, db_session: Session, real_time_flow: bool = False
+) -> str | None:
+    """Answer `query` via answer_qa_query, forwarding real_time_flow; may be None."""
+    question = QuestionRequest(
+        query=query,
+        collection="danswer_index",
+        use_keyword=None,
+        filters=None,
+        offset=None,
+    )
+
+    return answer_qa_query(
+        question=question,
+        user=None,
+        db_session=db_session,
+        answer_generation_timeout=100,
+        real_time_flow=real_time_flow,
+    ).answer
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "regression_yaml",
+        type=str,
+        help="Path to the Questions YAML file.",
+        default="./tests/regression/answer_quality/sample_questions.yaml",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--real-time", action="store_true", help="Set to use the real-time flow."
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="Path to the output results file.",
+        default="./tests/regression/answer_quality/regression_results.txt",
+    )
+    args = parser.parse_args()
+
+    questions_data = load_yaml(args.regression_yaml)
+
+    with open(args.output, "w", encoding="utf-8") as outfile:
+        print("Running Question Answering Flow", file=outfile)
+
+        with Session(engine, expire_on_commit=False) as db_session:
+            for sample in questions_data["questions"]:
+                question = sample["question"]
+                # This line goes to stdout to track progress
+                print(f"Running Test for Question {sample['id']}: {question}")
+
+                start_time = datetime.now()
+                answer = get_answer_for_question(question, db_session, args.real_time)
+                end_time = datetime.now()
+
+                print(f"====Duration: {end_time - start_time}====", file=outfile)
+                print(f"Question {sample['id']}:\n{question}", file=outfile)
+                print(f"\nExpected Answer:\n{sample['expected_answer']}", file=outfile)
+                print("\nActual Answer:", file=outfile)
+                print(
+                    word_wrap(answer)
+                    if answer
+                    else "Failed, either crashed or refused to answer.",
+                    file=outfile,
+                )
+                print("\n\n", file=outfile, flush=True)
diff --git a/backend/tests/regression/answer_quality/sample_questions.yaml b/backend/tests/regression/answer_quality/sample_questions.yaml
new file mode 100644
index 0000000000..33bedbe9cf
--- /dev/null
+++ b/backend/tests/regression/answer_quality/sample_questions.yaml
@@ -0,0 +1,96 @@
+# This YAML file contains regression questions for Danswer.
+# The sources mentioned are the same ones used to power the DanswerBot for the community's use
+# The regression flow assumes the data from the sources listed is already indexed
+
+metadata:
+  version: v0.0.1
+  date: 2023-09-10
+  sources:
+    - name: web
+      detail: https://www.danswer.ai/
+    - name: web
+      detail: https://docs.danswer.dev/
+    - name: github issues
+      detail: danswer-ai/danswer
+    - name: github pull-requests
+      detail: danswer-ai/danswer
+    - name: slack
+      workspace: danswer.slack.com
+    - name: file
+      detail: Markdown files from Danswer repo
+
+questions:
+  - id: 1
+    question: "What is Danswer?"
+    expected_answer: "Danswer is an open source question-answering system."
+    notes: "This comes directly from the docs, the actual answer should be more informative"
+
+  - id: 2
+    question: "What is Danswer licensed under?"
+    expected_answer: "Danswer is MIT licensed"
+    notes: "This info can be found in many places"
+
+  - id: 3
+    question: "What are the required variables to set to use GPT-4?"
+    expected_answer: "Set the environment variables INTERNAL_MODEL_VERSION=openai-chat-completion and GEN_AI_MODEL_VERSION=gpt-4"
+    notes: "Two env vars are must-haves; the third (the key) is optional"
+
+  - id: 4
+    question: "Why might I want to use the deberta model for QnA?"
+    expected_answer: "This kind of model can run on CPU and is less likely to produce hallucinations"
+    notes: "https://docs.danswer.dev/gen_ai_configs/transformers, this is a pretty hard question"
+
+  - id: 5
+    question: "What auth related tokens do I need for BookStack?"
+    expected_answer: "You will need the API Token ID and the API Token Secret"
+    notes: "https://docs.danswer.dev/connectors/bookstack"
+
+  - id: 6
+    question: "ValueError: invalid literal for int() with base 10"
+    expected_answer: "This was a bug that was fixed shortly after the issue was filed. Try updating the code."
+    notes: "This question is in Github Issue #290"
+
+  - id: 7
+    question: "Is there support for knowledge sets or document sets?"
+    expected_answer: "This was requested and approved however it is not clear if the feature is implemented yet."
+    notes: "This question is in Github Issue #338"
+
+  - id: 8
+    question: "nginx returning 502"
+    expected_answer: "Google OAuth must be configured for Danswer backend to work. A PR was created to fix it"
+    notes: "This question is in Github Issue #260"
+
+  - id: 9
+    question: "Why isn't GPT4All enabled by default"
+    expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac."
+    notes: "This question is in Github Issue #232 but also mentioned in several other places"
+
+  - id: 10
+    question: "Why isn't GPT4All enabled by default"
+    expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac."
+    notes: "This question is in Github Issue #232 but also mentioned in several other places"
+
+  - id: 11
+    question: "Why are the models warmed up on server start"
+    expected_answer: "This ensures that the first indexing isn't really slow."
+    notes: "This is in Github PR #333"
+
+  - id: 12
+    question: "Why are the models warmed up on server start"
+    expected_answer: "This ensures that the first indexing isn't really slow."
+    notes: "This is in Github PR #333"
+
+  - id: 13
+    question: "What text from the Alation Connector is used to generate the docs?"
+    expected_answer: "Articles are used with the body contents. Schemas, Tables, and Columns use Description"
+    notes: "This is in Github PR #161"
+
+  - id: 14
+    question: "Does Danswer support PDFs in Google Drive?"
+    expected_answer: "Yes"
+    notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well"
+
+  - id: 15
+    question: "I deleted a connector in Danswer but some deleted docs are still showing in search"
+    expected_answer: "The issue was fixed via a code change, it should go away after pulling the latest code"
+    notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well"