diff --git a/.vscode/launch.template.jsonc b/.vscode/launch.template.jsonc index 499d1b8ab2..b04c3e70be 100644 --- a/.vscode/launch.template.jsonc +++ b/.vscode/launch.template.jsonc @@ -428,6 +428,29 @@ "--filename", "generated/openapi.json", ] + }, + { + // script to debug multi tenant db issues + "name": "Onyx DB Manager (Top Chunks)", + "type": "debugpy", + "request": "launch", + "program": "scripts/debugging/onyx_db.py", + "cwd": "${workspaceFolder}/backend", + "envFile": "${workspaceFolder}/.env", + "env": { + "PYTHONUNBUFFERED": "1", + "PYTHONPATH": "." + }, + "args": [ + "--password", + "your_password_here", + "--port", + "5433", + "--report", + "top-chunks", + "--filename", + "generated/tenants_by_num_docs.csv" + ] }, { "name": "Debug React Web App in Chrome", diff --git a/backend/scripts/debugging/onyx_db.py b/backend/scripts/debugging/onyx_db.py index 4023449582..e4d42c2d81 100644 --- a/backend/scripts/debugging/onyx_db.py +++ b/backend/scripts/debugging/onyx_db.py @@ -21,6 +21,7 @@ if True: # noqa: E402 from onyx.db.engine import get_session_with_tenant from onyx.db.engine import SqlEngine from onyx.db.models import Document + from onyx.db.models import User from onyx.utils.logger import setup_logger from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR @@ -30,6 +31,8 @@ if True: # noqa: E402 class TenantMetadata(BaseModel): + first_email: str | None + user_count: int num_docs: int num_chunks: int @@ -39,7 +42,7 @@ class SQLAlchemyDebugging: def __init__(self) -> None: pass - def top_chunks(self, k: int = 10) -> None: + def top_chunks(self, filename: str, k: int = 10) -> None: tenants_to_total_chunks: dict[str, TenantMetadata] = {} logger.info("Fetching all tenant id's.") @@ -56,6 +59,14 @@ class SQLAlchemyDebugging: try: with get_session_with_tenant(tenant_id=tenant_id) as db_session: + first_email = None + + first_user = db_session.query(User).first() + if first_user: + first_email = first_user.email + + user_count = db_session.query(User).count() + # Calculate the total number of document rows for the current tenant total_documents = db_session.query(Document).count() # marginally useful to skip some tenants ... maybe we can improve on this @@ -69,15 +80,20 @@ class SQLAlchemyDebugging: total_chunks = db_session.query( func.sum(Document.chunk_count) ).scalar() + total_chunks = total_chunks or 0 logger.info( f"{num_processed} of {num_tenant_ids}: Tenant '{tenant_id}': " + f"first_email={first_email} user_count={user_count} " f"docs={total_documents} chunks={total_chunks}" ) tenants_to_total_chunks[tenant_id] = TenantMetadata( - num_docs=total_documents, num_chunks=total_chunks + first_email=first_email, + user_count=user_count, + num_docs=total_documents, + num_chunks=total_chunks, ) except Exception as e: logger.error(f"Error processing tenant '{tenant_id}': {e}") @@ -91,14 +107,23 @@ class SQLAlchemyDebugging: reverse=True, ) - csv_filename = "tenants_by_num_docs.csv" - with open(csv_filename, "w") as csvfile: + with open(filename, "w") as csvfile: writer = csv.writer(csvfile) - writer.writerow(["tenant_id", "num_docs", "num_chunks"]) # Write header + writer.writerow( + ["tenant_id", "first_user_email", "num_user", "num_docs", "num_chunks"] + ) # Write header # Write data rows (using the sorted list) for tenant_id, metadata in sorted_tenants: - writer.writerow([tenant_id, metadata.num_docs, metadata.num_chunks]) - logger.info(f"Successfully wrote statistics to {csv_filename}") + writer.writerow( + [ + tenant_id, + metadata.first_email, + metadata.user_count, + metadata.num_docs, + metadata.num_chunks, + ] + ) + logger.info(f"Successfully wrote statistics to {filename}") # output top k by chunks top_k_tenants = heapq.nlargest( @@ -118,6 +143,14 @@ def main() -> None: parser.add_argument("--report", help="Generate the given report") + parser.add_argument( + "--filename", + type=str, + default="tenants_by_num_docs.csv", + help="Generate the given report", + required=False, + ) + args = parser.parse_args() logger.info(f"{args}") @@ -140,7 +173,7 @@ def main() -> None: debugger = SQLAlchemyDebugging() if args.report == "top-chunks": - debugger.top_chunks(10) + debugger.top_chunks(args.filename, 10) else: logger.info("No action.")