Bugfix/query history notes (#4204)

* early work in progress

* rename utility script

* move actual data seeding to a shareable function

* add test

* make the test pass with the fix

* fix comment

* slight improvements and notes to query history and seeding

* update test

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer 2025-03-07 11:52:30 -08:00 committed by GitHub
parent 61ccba82a9
commit 9217243e3e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 43 additions and 6 deletions

View File

@ -27,6 +27,8 @@ def get_empty_chat_messages_entries__paginated(
first element is the most recent timestamp out of the sessions iterated first element is the most recent timestamp out of the sessions iterated
- this timestamp can be used to paginate forward in time - this timestamp can be used to paginate forward in time
second element is a list of messages belonging to all the sessions iterated second element is a list of messages belonging to all the sessions iterated
Only messages of type USER are returned
""" """
chat_sessions = fetch_chat_sessions_eagerly_by_time( chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=period[0], start=period[0],

View File

@ -48,10 +48,15 @@ def fetch_and_process_chat_session_history(
feedback_type: QAFeedbackType | None, feedback_type: QAFeedbackType | None,
limit: int | None = 500, limit: int | None = 500,
) -> list[ChatSessionSnapshot]: ) -> list[ChatSessionSnapshot]:
# observed to be slow at a scale of 8192 sessions and 4 messages per session
# this is a little slow (5 seconds)
chat_sessions = fetch_chat_sessions_eagerly_by_time( chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=start, end=end, db_session=db_session, limit=limit start=start, end=end, db_session=db_session, limit=limit
) )
# this is VERY slow (80 seconds) due to create_chat_chain being called
# for each session. Needs optimizing.
chat_session_snapshots = [ chat_session_snapshots = [
snapshot_from_chat_session(chat_session=chat_session, db_session=db_session) snapshot_from_chat_session(chat_session=chat_session, db_session=db_session)
for chat_session in chat_sessions for chat_session in chat_sessions
@ -246,6 +251,8 @@ def get_query_history_as_csv(
detail="Query history has been disabled by the administrator.", detail="Query history has been disabled by the administrator.",
) )
# this call is very expensive and times out when invoked via the endpoint
# TODO: optimize call and/or generate via background task
complete_chat_session_history = fetch_and_process_chat_session_history( complete_chat_session_history = fetch_and_process_chat_session_history(
db_session=db_session, db_session=db_session,
start=start or datetime.fromtimestamp(0, tz=timezone.utc), start=start or datetime.fromtimestamp(0, tz=timezone.utc),

View File

@ -1,6 +1,7 @@
import random import random
from datetime import datetime from datetime import datetime
from datetime import timedelta from datetime import timedelta
from logging import getLogger
from onyx.configs.constants import MessageType from onyx.configs.constants import MessageType
from onyx.db.chat import create_chat_session from onyx.db.chat import create_chat_session
@ -9,6 +10,8 @@ from onyx.db.chat import get_or_create_root_message
from onyx.db.engine import get_session_with_current_tenant from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ChatSession from onyx.db.models import ChatSession
logger = getLogger(__name__)
def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None: def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
"""Utility function to seed chat history for testing. """Utility function to seed chat history for testing.
@ -19,12 +22,18 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
the times. the times.
""" """
with get_session_with_current_tenant() as db_session: with get_session_with_current_tenant() as db_session:
logger.info(f"Seeding {num_sessions} sessions.")
for y in range(0, num_sessions): for y in range(0, num_sessions):
create_chat_session(db_session, f"pytest_session_{y}", None, None) create_chat_session(db_session, f"pytest_session_{y}", None, None)
# randomize all session times # randomize all session times
logger.info(f"Seeding {num_messages} messages per session.")
rows = db_session.query(ChatSession).all() rows = db_session.query(ChatSession).all()
for row in rows: for x in range(0, len(rows)):
if x % 1024 == 0:
logger.info(f"Seeded messages for {x} sessions so far.")
row = rows[x]
row.time_created = datetime.utcnow() - timedelta( row.time_created = datetime.utcnow() - timedelta(
days=random.randint(0, days) days=random.randint(0, days)
) )
@ -34,20 +43,37 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
root_message = get_or_create_root_message(row.id, db_session) root_message = get_or_create_root_message(row.id, db_session)
current_message_type = MessageType.USER
parent_message = root_message
for x in range(0, num_messages): for x in range(0, num_messages):
if current_message_type == MessageType.USER:
msg = f"pytest_message_user_{x}"
else:
msg = f"pytest_message_assistant_{x}"
chat_message = create_new_chat_message( chat_message = create_new_chat_message(
row.id, row.id,
root_message, parent_message,
f"pytest_message_{x}", msg,
None, None,
0, 0,
MessageType.USER, current_message_type,
db_session, db_session,
) )
chat_message.time_sent = row.time_created + timedelta( chat_message.time_sent = row.time_created + timedelta(
minutes=random.randint(0, 10) minutes=random.randint(0, 10)
) )
db_session.commit()
db_session.commit()
current_message_type = (
MessageType.ASSISTANT
if current_message_type == MessageType.USER
else MessageType.USER
)
parent_message = chat_message
db_session.commit() db_session.commit()
logger.info(f"Seeded messages for {len(rows)} sessions. Finished.")

View File

@ -10,7 +10,9 @@ from onyx.db.seeding.chat_history_seeding import seed_chat_history
def test_usage_reports(reset: None) -> None: def test_usage_reports(reset: None) -> None:
EXPECTED_SESSIONS = 2048 EXPECTED_SESSIONS = 2048
MESSAGES_PER_SESSION = 4 MESSAGES_PER_SESSION = 4
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION
# divide by 2 because only messages of type USER are returned
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION / 2
seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90) seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)