Bugfix/query history notes (#4204)

* early work in progress

* rename utility script

* move actual data seeding to a shareable function

* add test

* make the test pass with the fix

* fix comment

* slight improvements and notes to query history and seeding

* update test

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer 2025-03-07 11:52:30 -08:00 committed by GitHub
parent 61ccba82a9
commit 9217243e3e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 43 additions and 6 deletions

View File

@ -27,6 +27,8 @@ def get_empty_chat_messages_entries__paginated(
first element is the most recent timestamp out of the sessions iterated
- this timestamp can be used to paginate forward in time
second element is a list of messages belonging to all the sessions iterated
Only messages of type USER are returned
"""
chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=period[0],

View File

@ -48,10 +48,15 @@ def fetch_and_process_chat_session_history(
feedback_type: QAFeedbackType | None,
limit: int | None = 500,
) -> list[ChatSessionSnapshot]:
# observed to be slow a scale of 8192 sessions and 4 messages per session
# this is a little slow (5 seconds)
chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=start, end=end, db_session=db_session, limit=limit
)
# this is VERY slow (80 seconds) due to create_chat_chain being called
# for each session. Needs optimizing.
chat_session_snapshots = [
snapshot_from_chat_session(chat_session=chat_session, db_session=db_session)
for chat_session in chat_sessions
@ -246,6 +251,8 @@ def get_query_history_as_csv(
detail="Query history has been disabled by the administrator.",
)
# this call is very expensive and is timing out via endpoint
# TODO: optimize call and/or generate via background task
complete_chat_session_history = fetch_and_process_chat_session_history(
db_session=db_session,
start=start or datetime.fromtimestamp(0, tz=timezone.utc),

View File

@ -1,6 +1,7 @@
import random
from datetime import datetime
from datetime import timedelta
from logging import getLogger
from onyx.configs.constants import MessageType
from onyx.db.chat import create_chat_session
@ -9,6 +10,8 @@ from onyx.db.chat import get_or_create_root_message
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ChatSession
logger = getLogger(__name__)
def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
"""Utility function to seed chat history for testing.
@ -19,12 +22,18 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
the times.
"""
with get_session_with_current_tenant() as db_session:
logger.info(f"Seeding {num_sessions} sessions.")
for y in range(0, num_sessions):
create_chat_session(db_session, f"pytest_session_{y}", None, None)
# randomize all session times
logger.info(f"Seeding {num_messages} messages per session.")
rows = db_session.query(ChatSession).all()
for row in rows:
for x in range(0, len(rows)):
if x % 1024 == 0:
logger.info(f"Seeded messages for {x} sessions so far.")
row = rows[x]
row.time_created = datetime.utcnow() - timedelta(
days=random.randint(0, days)
)
@ -34,20 +43,37 @@ def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
root_message = get_or_create_root_message(row.id, db_session)
current_message_type = MessageType.USER
parent_message = root_message
for x in range(0, num_messages):
if current_message_type == MessageType.USER:
msg = f"pytest_message_user_{x}"
else:
msg = f"pytest_message_assistant_{x}"
chat_message = create_new_chat_message(
row.id,
root_message,
f"pytest_message_{x}",
parent_message,
msg,
None,
0,
MessageType.USER,
current_message_type,
db_session,
)
chat_message.time_sent = row.time_created + timedelta(
minutes=random.randint(0, 10)
)
db_session.commit()
db_session.commit()
current_message_type = (
MessageType.ASSISTANT
if current_message_type == MessageType.USER
else MessageType.USER
)
parent_message = chat_message
db_session.commit()
logger.info(f"Seeded messages for {len(rows)} sessions. Finished.")

View File

@ -10,7 +10,9 @@ from onyx.db.seeding.chat_history_seeding import seed_chat_history
def test_usage_reports(reset: None) -> None:
EXPECTED_SESSIONS = 2048
MESSAGES_PER_SESSION = 4
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION
# divide by 2 because only messages of type USER are returned
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION / 2
seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)