From 742a016175d5de22860f34354fa0cffa0948f224 Mon Sep 17 00:00:00 2001
From: Weves
Date: Mon, 4 Sep 2023 11:02:52 -0700
Subject: [PATCH] Remove empty files from QA

---
 backend/danswer/configs/constants.py             |  4 ++++
 .../danswer/connectors/google_drive/connector.py |  3 ++-
 backend/danswer/direct_qa/answer_question.py     | 14 ++++++++++++--
 backend/danswer/server/search_backend.py         | 12 ++++++++++--
 4 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 2038272e2..d3d52a7a9 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -13,6 +13,10 @@ EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
+# stored in the `metadata` of a chunk. Used to signify that this chunk should
+# not be used for QA. For example, Google Drive file types which can't be parsed
+# are still useful as a search result but not for QA.
+IGNORE_FOR_QA = "ignore_for_qa"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index c13281ca5..910f11ad8 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -19,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
 from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.connectors.google_drive.connector_auth import (
     get_google_drive_creds_for_authorized_user,
 )
@@ -467,7 +468,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                         ],
                         source=DocumentSource.GOOGLE_DRIVE,
                         semantic_identifier=file["name"],
-                        metadata={},
+                        metadata={} if text_contents else {IGNORE_FOR_QA: True},
                     )
                 )
             except Exception as e:
diff --git a/backend/danswer/direct_qa/answer_question.py b/backend/danswer/direct_qa/answer_question.py
index d810f323a..36c505198 100644
--- a/backend/danswer/direct_qa/answer_question.py
+++ b/backend/danswer/direct_qa/answer_question.py
@@ -4,6 +4,7 @@ from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
 from danswer.configs.app_configs import QA_TIMEOUT
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
 from danswer.db.feedback import create_query_event
 from danswer.db.models import User
@@ -99,15 +100,24 @@ def answer_qa_query(
         query_event_id=query_event_id,
     )
 
+    # remove chunks marked as not applicable for QA (e.g. Google Drive file
+    # types which can't be parsed). These chunks are useful to show in the
+    # search results, but not for QA.
+    filtered_ranked_chunks = [
+        chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
+    ]
+
     chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-    if chunk_offset >= len(ranked_chunks):
+    if chunk_offset >= len(filtered_ranked_chunks):
         raise ValueError("Chunks offset too large, should not retry this many times")
 
     error_msg = None
     try:
         answer, quotes = qa_model.answer_question(
             query,
-            ranked_chunks[chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS],
+            filtered_ranked_chunks[
+                chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
+            ],
         )
     except Exception as e:
         # exception is logged in the answer_question method, no need to re-log
diff --git a/backend/danswer/server/search_backend.py b/backend/danswer/server/search_backend.py
index c2402bebe..193eff0ff 100644
--- a/backend/danswer/server/search_backend.py
+++ b/backend/danswer/server/search_backend.py
@@ -10,6 +10,7 @@ from danswer.auth.users import current_user
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
 from danswer.db.engine import get_session
 from danswer.db.feedback import create_doc_retrieval_feedback
@@ -239,15 +240,22 @@ def stream_direct_qa(
             yield get_json_line({"error": str(e)})
             return
 
+        # remove chunks marked as not applicable for QA (e.g. Google Drive file
+        # types which can't be parsed). These chunks are useful to show in the
+        # search results, but not for QA.
+        filtered_ranked_chunks = [
+            chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
+        ]
+
         chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-        if chunk_offset >= len(ranked_chunks):
+        if chunk_offset >= len(filtered_ranked_chunks):
             raise ValueError(
                 "Chunks offset too large, should not retry this many times"
             )
         try:
             for response_packet in qa_model.answer_question_stream(
                 query,
-                ranked_chunks[
+                filtered_ranked_chunks[
                     chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
                 ],
             ):
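Note (reviewer illustration, not part of the patch): below is a minimal, self-contained sketch of the filter-then-page behavior the patch applies in both QA code paths. `Chunk` is a hypothetical stand-in for danswer's `InferenceChunk`, and `NUM_GENERATIVE_AI_INPUT_DOCS` is given an assumed value for the example.

    from dataclasses import dataclass, field

    # assumed values for the illustration; the real constants live in
    # danswer.configs.constants and danswer.configs.app_configs
    IGNORE_FOR_QA = "ignore_for_qa"
    NUM_GENERATIVE_AI_INPUT_DOCS = 2


    @dataclass
    class Chunk:
        # hypothetical stand-in for danswer's InferenceChunk
        content: str
        metadata: dict = field(default_factory=dict)


    def chunks_for_qa(ranked_chunks: list[Chunk], offset_count: int) -> list[Chunk]:
        # drop chunks flagged at indexing time (e.g. unparsable Drive files);
        # they still appear in search results but never reach the QA model
        filtered = [c for c in ranked_chunks if not c.metadata.get(IGNORE_FOR_QA)]
        chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
        if chunk_offset >= len(filtered):
            raise ValueError("Chunks offset too large, should not retry this many times")
        # page through the filtered list, not the raw ranked list, so retries
        # advance past usable chunks only
        return filtered[chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS]


    ranked = [
        Chunk("parsed doc"),
        Chunk("", metadata={IGNORE_FOR_QA: True}),  # tagged by the Drive connector
        Chunk("another parsed doc"),
    ]
    assert [c.content for c in chunks_for_qa(ranked, offset_count=0)] == [
        "parsed doc",
        "another parsed doc",
    ]

The design choice worth noting: the flag is written once at indexing time by the connector, so query-time filtering is a cheap metadata lookup, and the same chunk can still serve search results without ever being fed to the generative model.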