mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-08 11:58:34 +02:00
Regression Test (#434)
This commit is contained in:
parent
101ff2f392
commit
b5fc2a5775
@ -30,6 +30,7 @@ from danswer.llm.llm import LLM
|
||||
from danswer.llm.utils import dict_based_prompt_to_langchain_prompt
|
||||
from danswer.llm.utils import str_prompt_to_langchain_prompt
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.text_processing import clean_up_code_blocks
|
||||
from danswer.utils.text_processing import escape_newlines
|
||||
|
||||
logger = setup_logger()
|
||||
@ -155,12 +156,14 @@ class SingleMessageScratchpadHandler(QAHandler):
|
||||
) -> tuple[DanswerAnswer, DanswerQuotes]:
|
||||
logger.debug(model_output)
|
||||
|
||||
answer_start = model_output.find('{"answer":')
|
||||
model_clean = clean_up_code_blocks(model_output)
|
||||
|
||||
answer_start = model_clean.find('{"answer":')
|
||||
# Only found thoughts, no final answer
|
||||
if answer_start == -1:
|
||||
return DanswerAnswer(answer=None), DanswerQuotes(quotes=[])
|
||||
|
||||
final_json = escape_newlines(model_output[answer_start:])
|
||||
final_json = escape_newlines(model_clean[answer_start:])
|
||||
|
||||
return process_answer(
|
||||
final_json, context_chunks, is_json_prompt=self.is_json_output
|
||||
|
@ -8,6 +8,10 @@ def escape_newlines(s: str) -> str:
|
||||
return re.sub(r"(?<!\\)\n", "\\\\n", s)
|
||||
|
||||
|
||||
def clean_up_code_blocks(model_out_raw: str) -> str:
    """Trim surrounding whitespace and backticks from raw model output.

    Whitespace is removed first, then any run of leading/trailing backtick
    characters, then whitespace again. Anything after an opening fence
    (such as a language tag) is left in place.
    """
    # str.strip treats its argument as a set of characters, so a single
    # backtick removes fence runs of any length.
    without_outer_space = model_out_raw.strip()
    without_fences = without_outer_space.strip("`")
    return without_fences.strip()
|
||||
|
||||
|
||||
def clean_model_quote(quote: str, trim_length: int) -> str:
|
||||
quote_clean = quote.strip()
|
||||
if quote_clean[0] == '"':
|
||||
|
@ -4,6 +4,7 @@ mypy==1.1.1
|
||||
pre-commit==3.2.2
|
||||
reorder-python-imports==3.9.0
|
||||
ruff==0.0.286
|
||||
types-PyYAML==6.0.12.11
|
||||
types-beautifulsoup4==4.12.0.3
|
||||
types-html5lib==1.1.11.13
|
||||
types-oauthlib==3.2.0.9
|
||||
|
117
backend/tests/regression/answer_quality/eval_direct_qa.py
Normal file
117
backend/tests/regression/answer_quality/eval_direct_qa.py
Normal file
@ -0,0 +1,117 @@
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import yaml
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.direct_qa.answer_question import answer_qa_query
|
||||
from danswer.server.models import QuestionRequest
|
||||
|
||||
|
||||
# Module-level engine, created when this file is loaded; the __main__ block
# below binds its Session to it.
engine = get_sqlalchemy_engine()
|
||||
|
||||
|
||||
def load_yaml(filepath: str) -> dict:
    """Parse the YAML file at *filepath* with ``yaml.safe_load``.

    The file is opened explicitly as UTF-8 so that parsing does not depend
    on the platform's locale default encoding.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document. The caller
        in this file indexes it as a mapping.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
|
||||
|
||||
|
||||
def word_wrap(s: str, max_line_size: int = 120) -> str:
    """Greedily re-flow *s* into lines of at most *max_line_size* characters.

    The text is split on any whitespace, so original spacing and line breaks
    are discarded and words are re-joined with single spaces. A word longer
    than *max_line_size* is never broken; it is emitted on a line of its own.

    Args:
        s: Text to wrap.
        max_line_size: Maximum width of a line built from several words.

    Returns:
        The wrapped lines joined with newline characters.
    """
    lines: list[str] = []
    pending: list[str] = []
    pending_chars = 0  # letters only; separating spaces are counted below

    for token in s.split():
        token_len = len(token)
        oversized = token_len > max_line_size
        # Joined width = letters so far + this word + one space for each
        # word already waiting on the line.
        overflows = pending_chars + token_len + len(pending) > max_line_size

        if pending and (oversized or overflows):
            lines.append(" ".join(pending))
            pending, pending_chars = [], 0

        if oversized:
            lines.append(token)
        else:
            pending.append(token)
            pending_chars += token_len

    if pending:
        lines.append(" ".join(pending))

    return "\n".join(lines)
|
||||
|
||||
|
||||
def get_answer_for_question(
    query: str, db_session: Session, real_time_flow: bool = False
) -> str | None:
    """Run one query through ``answer_qa_query`` and return the answer text.

    Args:
        query: The question text to ask.
        db_session: Session passed through to ``answer_qa_query``.
        real_time_flow: Forwarded to ``answer_qa_query``. Defaults to
            ``False``, the value this function previously hard-coded.

    Returns:
        The ``answer`` attribute of the QA response, which may be ``None``.
    """
    question = QuestionRequest(
        query=query,
        collection="danswer_index",
        use_keyword=None,
        filters=None,
        offset=None,
    )

    answer = answer_qa_query(
        question=question,
        user=None,
        db_session=db_session,
        answer_generation_timeout=100,
        real_time_flow=real_time_flow,
    )

    return answer.answer


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "regression_yaml",
        type=str,
        help="Path to the Questions YAML file.",
        default="./tests/regression/answer_quality/sample_questions.yaml",
        nargs="?",
    )
    parser.add_argument(
        "--real-time", action="store_true", help="Set to use the real-time flow."
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Path to the output results file.",
        default="./tests/regression/answer_quality/regression_results.txt",
    )
    args = parser.parse_args()

    questions_data = load_yaml(args.regression_yaml)

    # Explicit UTF-8 so answers containing non-ASCII text can always be
    # written, whatever the platform's default encoding is.
    with open(args.output, "w", encoding="utf-8") as outfile:
        print("Running Question Answering Flow", file=outfile)

        with Session(engine, expire_on_commit=False) as db_session:
            for sample in questions_data["questions"]:
                # This line goes to stdout to track progress
                print(f"Running Test for Question {sample['id']}: {sample['question']}")

                start_time = datetime.now()
                # Honor --real-time; the flag used to be parsed and ignored.
                answer = get_answer_for_question(
                    sample["question"], db_session, real_time_flow=args.real_time
                )
                end_time = datetime.now()

                print(f"====Duration: {end_time - start_time}====", file=outfile)
                print(f"Question {sample['id']}:", file=outfile)
                print(sample["question"], file=outfile)
                print("\nExpected Answer:", file=outfile)
                print(sample["expected_answer"], file=outfile)
                print("\nActual Answer:", file=outfile)
                print(
                    word_wrap(answer)
                    if answer
                    else "Failed, either crashed or refused to answer.",
                    file=outfile,
                )
                # Flush after each question so partial results survive a crash.
                print("\n\n", file=outfile, flush=True)
|
@ -0,0 +1,96 @@
|
||||
# This YAML file contains regression questions for Danswer.
|
||||
# The sources listed below are the same ones that power the DanswerBot for the community's use
|
||||
# The regression flow assumes the data from the sources listed are already indexed
|
||||
|
||||
metadata:
|
||||
version: v0.0.1
|
||||
date: 2023-09-10
|
||||
sources:
|
||||
- name: web
|
||||
detail: https://www.danswer.ai/
|
||||
- name: web
|
||||
detail: https://docs.danswer.dev/
|
||||
- name: github issues
|
||||
detail: danswer-ai/danswer
|
||||
- name: github pull-requests
|
||||
detail: danswer-ai/danswer
|
||||
- name: slack
|
||||
workspace: danswer.slack.com
|
||||
- name: file
|
||||
detail: Markdown files from Danswer repo
|
||||
|
||||
questions:
|
||||
- id: 1
|
||||
question: "What is Danswer?"
|
||||
expected_answer: "Danswer is an open source question-answering system."
|
||||
notes: "This comes directly from the docs, the actual answer should be more informative"
|
||||
|
||||
- id: 2
|
||||
question: "What is Danswer licensed under?"
|
||||
expected_answer: "Danswer is MIT licensed"
|
||||
notes: "This info can be found in many places"
|
||||
|
||||
- id: 3
|
||||
question: "What are the required variables to set to use GPT-4?"
|
||||
expected_answer: "Set the environment variables INTERNAL_MODEL_VERSION=openai-chat-completion and GEN_AI_MODEL_VERSION=gpt-4"
|
||||
notes: "Two env vars are must have, the third (the key) is optional"
|
||||
|
||||
- id: 4
|
||||
question: "Why might I want to use the deberta model for QnA?"
|
||||
expected_answer: "This kind of model can run on CPU and are less likely to produce hallucinations"
|
||||
notes: "https://docs.danswer.dev/gen_ai_configs/transformers, this is a pretty hard question"
|
||||
|
||||
- id: 5
|
||||
question: "What auth related tokens do I need for BookStack?"
|
||||
expected_answer: "You will need the API Token ID and the API Token Secret"
|
||||
notes: "https://docs.danswer.dev/connectors/bookstack"
|
||||
|
||||
- id: 6
|
||||
question: "ValueError: invalid literal for int() with base 10"
|
||||
expected_answer: "This was a bug that was fixed shortly after the issue was filed. Try updating the code."
|
||||
notes: "This question is in Github Issue #290"
|
||||
|
||||
- id: 7
|
||||
question: "Is there support for knowledge sets or document sets?"
|
||||
expected_answer: "This was requested and approved however it is not clear if the feature is implemented yet."
|
||||
notes: "This question is in Github Issue #338"
|
||||
|
||||
- id: 8
|
||||
question: "nginx returning 502"
|
||||
expected_answer: "Google OAuth must be configured for Danswer backend to work. A PR was created to fix it"
|
||||
notes: "This question is in Github Issue #260"
|
||||
|
||||
- id: 9
|
||||
question: "Why isn't GPT4All enabled by default"
|
||||
expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac."
|
||||
notes: "This question is in Github Issue #232 but also mentioned in several other places"
|
||||
|
||||
- id: 10
|
||||
question: "Why isn't GPT4All enabled by default"
|
||||
expected_answer: "There is no recent version of GPT4All that is compatible with M1 Mac."
|
||||
notes: "This question is in Github Issue #232 but also mentioned in several other places"
|
||||
|
||||
- id: 11
|
||||
question: "Why are the models warmed up on server start"
|
||||
expected_answer: "This ensures that the first indexing isn't really slow."
|
||||
notes: "This is in Github PR #333"
|
||||
|
||||
- id: 12
|
||||
question: "Why are the models warmed up on server start"
|
||||
expected_answer: "This ensures that the first indexing isn't really slow."
|
||||
notes: "This is in Github PR #333"
|
||||
|
||||
- id: 13
|
||||
question: "What text from the Alation Connector is used to generate the docs?"
|
||||
expected_answer: "Articles are used with the body contents. Schemas, Tables, and Columns use Description"
|
||||
notes: "This is in Github PR #161"
|
||||
|
||||
- id: 14
|
||||
question: "Does Danswer support PDFs in Google Drive?"
|
||||
expected_answer: "Yes"
|
||||
notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well"
|
||||
|
||||
- id: 15
|
||||
question: "I deleted a connector in Danswer but some deleted docs are still showing in search"
|
||||
expected_answer: "The issue was fixed via a code change, it should go away after pulling the latest code"
|
||||
notes: "This question is in Slack, if the message expires due to using free slack version, the info may be gone as well"
|
Loading…
x
Reference in New Issue
Block a user