From b5fc2a5775e2357b2a5851df0f9ef3b60f5b5633 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 11 Sep 2023 19:06:01 -0700 Subject: [PATCH] Regression Test (#434) --- backend/danswer/direct_qa/qa_block.py | 7 +- backend/danswer/utils/text_processing.py | 4 + backend/requirements/dev.txt | 1 + .../answer_quality/eval_direct_qa.py | 117 ++++++++++++++++++ .../answer_quality/sample_questions.yaml | 96 ++++++++++++++ 5 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 backend/tests/regression/answer_quality/eval_direct_qa.py create mode 100644 backend/tests/regression/answer_quality/sample_questions.yaml diff --git a/backend/danswer/direct_qa/qa_block.py b/backend/danswer/direct_qa/qa_block.py index dcc6e6fc5..fd3fe45f5 100644 --- a/backend/danswer/direct_qa/qa_block.py +++ b/backend/danswer/direct_qa/qa_block.py @@ -30,6 +30,7 @@ from danswer.llm.llm import LLM from danswer.llm.utils import dict_based_prompt_to_langchain_prompt from danswer.llm.utils import str_prompt_to_langchain_prompt from danswer.utils.logger import setup_logger +from danswer.utils.text_processing import clean_up_code_blocks from danswer.utils.text_processing import escape_newlines logger = setup_logger() @@ -155,12 +156,14 @@ class SingleMessageScratchpadHandler(QAHandler): ) -> tuple[DanswerAnswer, DanswerQuotes]: logger.debug(model_output) - answer_start = model_output.find('{"answer":') + model_clean = clean_up_code_blocks(model_output) + + answer_start = model_clean.find('{"answer":') # Only found thoughts, no final answer if answer_start == -1: return DanswerAnswer(answer=None), DanswerQuotes(quotes=[]) - final_json = escape_newlines(model_output[answer_start:]) + final_json = escape_newlines(model_clean[answer_start:]) return process_answer( final_json, context_chunks, is_json_prompt=self.is_json_output diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py index fbd488512..696f13391 100644 --- a/backend/danswer/utils/text_processing.py +++ b/backend/danswer/utils/text_processing.py @@ -8,6 +8,10 @@ def escape_newlines(s: str) -> str: return re.sub(r"(? str: + return model_out_raw.strip().strip("```").strip() + + def clean_model_quote(quote: str, trim_length: int) -> str: quote_clean = quote.strip() if quote_clean[0] == '"': diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt index d5755e63d..45d0f1f73 100644 --- a/backend/requirements/dev.txt +++ b/backend/requirements/dev.txt @@ -4,6 +4,7 @@ mypy==1.1.1 pre-commit==3.2.2 reorder-python-imports==3.9.0 ruff==0.0.286 +types-PyYAML==6.0.12.11 types-beautifulsoup4==4.12.0.3 types-html5lib==1.1.11.13 types-oauthlib==3.2.0.9 diff --git a/backend/tests/regression/answer_quality/eval_direct_qa.py b/backend/tests/regression/answer_quality/eval_direct_qa.py new file mode 100644 index 000000000..0ed981309 --- /dev/null +++ b/backend/tests/regression/answer_quality/eval_direct_qa.py @@ -0,0 +1,117 @@ +import argparse +from datetime import datetime + +import yaml +from sqlalchemy.orm import Session + +from danswer.db.engine import get_sqlalchemy_engine +from danswer.direct_qa.answer_question import answer_qa_query +from danswer.server.models import QuestionRequest + + +engine = get_sqlalchemy_engine() + + +def load_yaml(filepath: str) -> dict: + with open(filepath, "r") as file: + data = yaml.safe_load(file) + return data + + +def word_wrap(s: str, max_line_size: int = 120) -> str: + words = s.split() + + current_line: list[str] = [] + result_lines: list[str] = [] + current_length = 0 + for word in words: + if len(word) > max_line_size: + if current_line: + result_lines.append(" ".join(current_line)) + current_line = [] + current_length = 0 + + result_lines.append(word) + continue + + if current_length + len(word) + len(current_line) > max_line_size: + result_lines.append(" ".join(current_line)) + current_line = [] + current_length = 0 + + current_line.append(word) + current_length += len(word) + + if current_line: + result_lines.append(" ".join(current_line)) + + return "\n".join(result_lines) + + +def get_answer_for_question(query: str, db_session: Session) -> str | None: + question = QuestionRequest( + query=query, + collection="danswer_index", + use_keyword=None, + filters=None, + offset=None, + ) + + answer = answer_qa_query( + question=question, + user=None, + db_session=db_session, + answer_generation_timeout=100, + real_time_flow=False, + ) + + return answer.answer + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "regression_yaml", + type=str, + help="Path to the Questions YAML file.", + default="./tests/regression/answer_quality/sample_questions.yaml", + nargs="?", + ) + parser.add_argument( + "--real-time", action="store_true", help="Set to use the real-time flow." + ) + parser.add_argument( + "--output", + type=str, + help="Path to the output results file.", + default="./tests/regression/answer_quality/regression_results.txt", + ) + args = parser.parse_args() + + questions_data = load_yaml(args.regression_yaml) + + with open(args.output, "w") as outfile: + print("Running Question Answering Flow", file=outfile) + + with Session(engine, expire_on_commit=False) as db_session: + for sample in questions_data["questions"]: + # This line goes to stdout to track progress + print(f"Running Test for Question {sample['id']}: {sample['question']}") + + start_time = datetime.now() + answer = get_answer_for_question(sample["question"], db_session) + end_time = datetime.now() + + print(f"====Duration: {end_time - start_time}====", file=outfile) + print(f"Question {sample['id']}:", file=outfile) + print(sample["question"], file=outfile) + print("\nExpected Answer:", file=outfile) + print(sample["expected_answer"], file=outfile) + print("\nActual Answer:", file=outfile) + print( + word_wrap(answer) + if answer + else "Failed, either crashed or refused to answer.", + file=outfile, + ) + print("\n\n", file=outfile, flush=True) diff --git a/backend/tests/regression/answer_quality/sample_questions.yaml b/backend/tests/regression/answer_quality/sample_questions.yaml new file mode 100644 index 000000000..33bedbe9c --- /dev/null +++ b/backend/tests/regression/answer_quality/sample_questions.yaml @@ -0,0 +1,96 @@ +# This YAML file contains regression questions for Danswer. +# The sources mentioned are the same ones to power the DanswerBot for the community's use +# The regression flow assumes the data from the sources listed are already indexed + +metadata: + version: v0.0.1 + date: 2023-09-10 + sources: + - name: web + detail: https://www.danswer.ai/ + - name: web + detail: https://docs.danswer.dev/ + - name: github issues + detail: danswer-ai/danswer + - name: github pull-requests + detail: danswer-ai/danswer + - name: slack + workspace: danswer.slack.com + - name: file + detail: Markdown files from Danswer repo + +questions: + - id: 1 + question: "What is Danswer?" + expected_answer: "Danswer is an open source question-answering system." + notes: "This comes directly from the docs, the actual answer should be more informative" + + - id: 2 + question: "What is Danswer licensed under?" + expected_answer: "Danswer is MIT licensed" + notes: "This info can be found in many places" + + - id: 3 + question: "What are the required variables to set to use GPT-4?" + expected_answer: "Set the environment variables INTERNAL_MODEL_VERSION=openai-chat-completion and GEN_AI_MODEL_VERSION=gpt-4" + notes: "Two env vars are must have, the third (the key) is optional" + + - id: 4 + question: "Why might I want to use the deberta model for QnA?" + expected_answer: "This kind of model can run on CPU and are less likely to produce hallucinations" + notes: "https://docs.danswer.dev/gen_ai_configs/transformers, this is a pretty hard question" + + - id: 5 + question: "What auth related tokens do I need for BookStack?" + expected_answer: "You will need the API Token ID and the API Token Secret" + notes: "https://docs.danswer.dev/connectors/bookstack" + + - id: 6 + question: "ValueError: invalid literal for int() with base 10" + expected_answer: "This was a bug that was fixed shortly after the issue was filed. Try updating the code." + notes: "This question is in Github Issue #290" + + - id: 7 + question: "Is there support for knowledge sets or document sets?" + expected_answer: "This was requested and approved however it is not clear if the feature is implemented yet." + notes: "This question is in Github Issue #338" + + - id: 8 + question: "nginx returning 502" + expected_answer: "Google OAuth must be configured for Danswer backend to work. A PR was created to fix it" + notes: "This question is in Github Issue #260" + + - id: 9 + question: "Why isn't GPT4All enabled by default" + expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac." + notes: "This question is in Github Issue #232 but also mentioned in several other places" + + - id: 10 + question: "Why isn't GPT4All enabled by default" + expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac." + notes: "This question is in Github Issue #232 but also mentioned in several other places" + + - id: 11 + question: "Why are the models warmed up on server start" + expected_answer: "This ensures that the first indexing isn't really slow." + notes: "This is in Github PR #333" + + - id: 12 + question: "Why are the models warmed up on server start" + expected_answer: "This ensures that the first indexing isn't really slow." + notes: "This is in Github PR #333" + + - id: 13 + question: "What text from the Alation Connector is used to generate the docs?" + expected_answer: "Articles are used with the body contents. Schemas, Tables, and Columns use Description" + notes: "This is in Github PR #161" + + - id: 14 + question: "Does Danswer support PDFs in Google Drive?" + expected_answer: "Yes" + notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well" + + - id: 15 + question: "I deleted a connector in Danswer but some deleted docs are still showing in search" + expected_answer: "The issue was fixed via a code change, it should go away after pulling the latest code" + notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well"