From 9217243e3e295cc68d6c3fad1bfdb2db0c2fffab Mon Sep 17 00:00:00 2001 From: rkuo-danswer Date: Fri, 7 Mar 2025 11:52:30 -0800 Subject: [PATCH] Bugfix/query history notes (#4204) * early work in progress * rename utility script * move actual data seeding to a shareable function * add test * make the test pass with the fix * fix comment * slight improvements and notes to query history and seeding * update test --------- Co-authored-by: Richard Kuo (Danswer) --- backend/ee/onyx/db/usage_export.py | 2 ++ backend/ee/onyx/server/query_history/api.py | 7 ++++ .../onyx/db/seeding/chat_history_seeding.py | 36 ++++++++++++++++--- .../tests/query_history/test_usage_reports.py | 4 ++- 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/backend/ee/onyx/db/usage_export.py b/backend/ee/onyx/db/usage_export.py index 9cd9756ce..17b981579 100644 --- a/backend/ee/onyx/db/usage_export.py +++ b/backend/ee/onyx/db/usage_export.py @@ -27,6 +27,8 @@ def get_empty_chat_messages_entries__paginated( first element is the most recent timestamp out of the sessions iterated - this timestamp can be used to paginate forward in time second element is a list of messages belonging to all the sessions iterated + + Only messages of type USER are returned """ chat_sessions = fetch_chat_sessions_eagerly_by_time( start=period[0], diff --git a/backend/ee/onyx/server/query_history/api.py b/backend/ee/onyx/server/query_history/api.py index 7f1e96a71..8e31ecd81 100644 --- a/backend/ee/onyx/server/query_history/api.py +++ b/backend/ee/onyx/server/query_history/api.py @@ -48,10 +48,15 @@ def fetch_and_process_chat_session_history( feedback_type: QAFeedbackType | None, limit: int | None = 500, ) -> list[ChatSessionSnapshot]: + # observed to be slow at a scale of 8192 sessions and 4 messages per session + + # this is a little slow (5 seconds) chat_sessions = fetch_chat_sessions_eagerly_by_time( start=start, end=end, db_session=db_session, limit=limit ) + # this is VERY slow (80 seconds) due to 
create_chat_chain being called + # for each session. Needs optimizing. chat_session_snapshots = [ snapshot_from_chat_session(chat_session=chat_session, db_session=db_session) for chat_session in chat_sessions @@ -246,6 +251,8 @@ def get_query_history_as_csv( detail="Query history has been disabled by the administrator.", ) + # this call is very expensive and is timing out via endpoint + # TODO: optimize call and/or generate via background task complete_chat_session_history = fetch_and_process_chat_session_history( db_session=db_session, start=start or datetime.fromtimestamp(0, tz=timezone.utc), diff --git a/backend/onyx/db/seeding/chat_history_seeding.py b/backend/onyx/db/seeding/chat_history_seeding.py index ee0c558ae..f813802cd 100644 --- a/backend/onyx/db/seeding/chat_history_seeding.py +++ b/backend/onyx/db/seeding/chat_history_seeding.py @@ -1,6 +1,7 @@ import random from datetime import datetime from datetime import timedelta +from logging import getLogger from onyx.configs.constants import MessageType from onyx.db.chat import create_chat_session @@ -9,6 +10,8 @@ from onyx.db.chat import get_or_create_root_message from onyx.db.engine import get_session_with_current_tenant from onyx.db.models import ChatSession +logger = getLogger(__name__) + def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None: """Utility function to seed chat history for testing. @@ -19,12 +22,18 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None: the times. 
""" with get_session_with_current_tenant() as db_session: + logger.info(f"Seeding {num_sessions} sessions.") for y in range(0, num_sessions): create_chat_session(db_session, f"pytest_session_{y}", None, None) # randomize all session times + logger.info(f"Seeding {num_messages} messages per session.") rows = db_session.query(ChatSession).all() - for row in rows: + for x in range(0, len(rows)): + if x % 1024 == 0: + logger.info(f"Seeded messages for {x} sessions so far.") + + row = rows[x] row.time_created = datetime.utcnow() - timedelta( days=random.randint(0, days) ) @@ -34,20 +43,37 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None: root_message = get_or_create_root_message(row.id, db_session) + current_message_type = MessageType.USER + parent_message = root_message for x in range(0, num_messages): + if current_message_type == MessageType.USER: + msg = f"pytest_message_user_{x}" + else: + msg = f"pytest_message_assistant_{x}" + chat_message = create_new_chat_message( row.id, - root_message, - f"pytest_message_{x}", + parent_message, + msg, None, 0, - MessageType.USER, + current_message_type, db_session, ) chat_message.time_sent = row.time_created + timedelta( minutes=random.randint(0, 10) ) - db_session.commit() + + db_session.commit() + + current_message_type = ( + MessageType.ASSISTANT + if current_message_type == MessageType.USER + else MessageType.USER + ) + parent_message = chat_message db_session.commit() + + logger.info(f"Seeded messages for {len(rows)} sessions. 
Finished.") diff --git a/backend/tests/integration/tests/query_history/test_usage_reports.py b/backend/tests/integration/tests/query_history/test_usage_reports.py index 3fbe70e9c..5903ae6ec 100644 --- a/backend/tests/integration/tests/query_history/test_usage_reports.py +++ b/backend/tests/integration/tests/query_history/test_usage_reports.py @@ -10,7 +10,9 @@ from onyx.db.seeding.chat_history_seeding import seed_chat_history def test_usage_reports(reset: None) -> None: EXPECTED_SESSIONS = 2048 MESSAGES_PER_SESSION = 4 - EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION + + # divide by 2 because only messages of type USER are returned + EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION / 2 seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)