Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-09-26 20:08:38 +02:00)
Remove empty files from QA
@@ -13,6 +13,10 @@ EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
+# stored in the `metadata` of a chunk. Used to signify that this chunk should
+# not be used for QA. For example, Google Drive file types which can't be parsed
+# are still useful as a search result but not for QA.
+IGNORE_FOR_QA = "ignore_for_qa"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
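
For context, a minimal sketch of the flag's intended lifecycle: the constant is written into a chunk's metadata at indexing time and checked at query time. The `Chunk` dataclass and helper functions below are simplified stand-ins for illustration, not danswer's actual models:

from dataclasses import dataclass, field

IGNORE_FOR_QA = "ignore_for_qa"

@dataclass
class Chunk:
    content: str
    metadata: dict = field(default_factory=dict)

def mark_search_only(chunk: Chunk) -> Chunk:
    # indexing side: flag chunks whose source file produced no usable text
    chunk.metadata[IGNORE_FOR_QA] = True
    return chunk

def usable_for_qa(chunk: Chunk) -> bool:
    # query side: drop flagged chunks before building the QA prompt
    return not chunk.metadata.get(IGNORE_FOR_QA)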
@@ -19,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
 from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.connectors.google_drive.connector_auth import (
     get_google_drive_creds_for_authorized_user,
 )
@@ -467,7 +468,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                         ],
                         source=DocumentSource.GOOGLE_DRIVE,
                         semantic_identifier=file["name"],
-                        metadata={},
+                        metadata={} if text_contents else {IGNORE_FOR_QA: True},
                     )
                 )
             except Exception as e:
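
The connector change hinges on one conditional expression: a file whose text extraction came back empty is still indexed, so it shows up in search, but with the flag set so QA skips it. A hedged illustration of the expression's behavior (the sample strings are made up):

IGNORE_FOR_QA = "ignore_for_qa"

for text_contents in ["Q3 planning notes ...", ""]:
    # an empty extraction result is falsy, so the flag gets set
    metadata = {} if text_contents else {IGNORE_FOR_QA: True}
    print(metadata)
# {}                        <- parsed file, eligible for QA
# {'ignore_for_qa': True}   <- empty/unparseable file, search-only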
@@ -4,6 +4,7 @@ from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
 from danswer.configs.app_configs import QA_TIMEOUT
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
 from danswer.db.feedback import create_query_event
 from danswer.db.models import User
@@ -99,15 +100,24 @@ def answer_qa_query(
         query_event_id=query_event_id,
     )

+    # remove chunks marked as not applicable for QA (e.g. Google Drive file
+    # types which can't be parsed). These chunks are useful to show in the
+    # search results, but not for QA.
+    filtered_ranked_chunks = [
+        chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
+    ]
+
     chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-    if chunk_offset >= len(ranked_chunks):
+    if chunk_offset >= len(filtered_ranked_chunks):
         raise ValueError("Chunks offset too large, should not retry this many times")

     error_msg = None
     try:
         answer, quotes = qa_model.answer_question(
             query,
-            ranked_chunks[chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS],
+            filtered_ranked_chunks[
+                chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
+            ],
         )
     except Exception as e:
         # exception is logged in the answer_question method, no need to re-log
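
Note that filtering happens before the retry offset is applied, so each NUM_GENERATIVE_AI_INPUT_DOCS-sized window is taken over usable chunks only. A small worked example under assumed values (chunks simplified to dicts; the window size of 2 is arbitrary):

NUM_GENERATIVE_AI_INPUT_DOCS = 2  # arbitrary window size for the example
IGNORE_FOR_QA = "ignore_for_qa"

ranked_chunks = [
    {"id": 1, "metadata": {}},
    {"id": 2, "metadata": {IGNORE_FOR_QA: True}},  # search-only chunk
    {"id": 3, "metadata": {}},
    {"id": 4, "metadata": {}},
]
filtered = [c for c in ranked_chunks if not c["metadata"].get(IGNORE_FOR_QA)]

for offset_count in range(2):  # each retry advances one window
    start = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
    print([c["id"] for c in filtered[start : start + NUM_GENERATIVE_AI_INPUT_DOCS]])
# [1, 3]  first attempt
# [4]     retry window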
@@ -10,6 +10,7 @@ from danswer.auth.users import current_user
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
+from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
 from danswer.db.engine import get_session
 from danswer.db.feedback import create_doc_retrieval_feedback
@@ -239,15 +240,22 @@ def stream_direct_qa(
             yield get_json_line({"error": str(e)})
             return

+        # remove chunks marked as not applicable for QA (e.g. Google Drive file
+        # types which can't be parsed). These chunks are useful to show in the
+        # search results, but not for QA.
+        filtered_ranked_chunks = [
+            chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
+        ]
+
         chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-        if chunk_offset >= len(ranked_chunks):
+        if chunk_offset >= len(filtered_ranked_chunks):
             raise ValueError(
                 "Chunks offset too large, should not retry this many times"
             )
         try:
             for response_packet in qa_model.answer_question_stream(
                 query,
-                ranked_chunks[
+                filtered_ranked_chunks[
                     chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
                 ],
             ):
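
Both endpoints keep the explicit bounds check before slicing. Python slicing past the end of a list silently returns [], so without the check a retry loop would keep issuing empty QA calls instead of failing fast. A sketch of the guard in isolation (the function name and signature are illustrative):

def qa_window(filtered_chunks: list, offset_count: int, window: int) -> list:
    chunk_offset = offset_count * window
    if chunk_offset >= len(filtered_chunks):
        # stop retrying once every usable chunk has been tried
        raise ValueError("Chunks offset too large, should not retry this many times")
    return filtered_chunks[chunk_offset : chunk_offset + window]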