Feature/tenant reporting 2 (#4750)

* add more info * fix headers * add filename as param (merge) * db manager entry in launch template --------- Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-07-07 21:20:39 +02:00 · 2025-05-27 16:24:47 -07:00
parent baaf31513c
commit 16a14bac89
2 changed files with 64 additions and 8 deletions
--- a/.vscode/launch.template.jsonc
+++ b/.vscode/launch.template.jsonc
@ -428,6 +428,29 @@
        "--filename",
        "generated/openapi.json",
      ]
+    },
+    {
+      // script to debug multi tenant db issues
+      "name": "Onyx DB Manager (Top Chunks)",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "scripts/debugging/onyx_db.py",
+      "cwd": "${workspaceFolder}/backend",
+      "envFile": "${workspaceFolder}/.env",
+      "env": {
+        "PYTHONUNBUFFERED": "1",
+        "PYTHONPATH": "."
+      },
+      "args": [
+        "--password",
+        "your_password_here",
+        "--port",
+        "5433",
+        "--report",
+        "top-chunks",
+        "--filename",
+        "generated/tenants_by_num_docs.csv"
+      ]
    },
      {
        "name": "Debug React Web App in Chrome",
--- a/backend/scripts/debugging/onyx_db.py
+++ b/backend/scripts/debugging/onyx_db.py
@ -21,6 +21,7 @@ if True:  # noqa: E402
    from onyx.db.engine import get_session_with_tenant
    from onyx.db.engine import SqlEngine
    from onyx.db.models import Document
+    from onyx.db.models import User
    from onyx.utils.logger import setup_logger
    from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR

@ -30,6 +31,8 @@ if True:  # noqa: E402


 class TenantMetadata(BaseModel):
+    first_email: str | None
+    user_count: int
    num_docs: int
    num_chunks: int

@ -39,7 +42,7 @@ class SQLAlchemyDebugging:
    def __init__(self) -> None:
        pass

-    def top_chunks(self, k: int = 10) -> None:
+    def top_chunks(self, filename: str, k: int = 10) -> None:
        tenants_to_total_chunks: dict[str, TenantMetadata] = {}

        logger.info("Fetching all tenant id's.")
@ -56,6 +59,14 @@ class SQLAlchemyDebugging:

            try:
                with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+                    first_email = None
+
+                    first_user = db_session.query(User).first()
+                    if first_user:
+                        first_email = first_user.email
+
+                    user_count = db_session.query(User).count()
+
                    # Calculate the total number of document rows for the current tenant
                    total_documents = db_session.query(Document).count()
                    # marginally useful to skip some tenants ... maybe we can improve on this
@ -69,15 +80,20 @@ class SQLAlchemyDebugging:
                    total_chunks = db_session.query(
                        func.sum(Document.chunk_count)
                    ).scalar()
+
                    total_chunks = total_chunks or 0

                    logger.info(
                        f"{num_processed} of {num_tenant_ids}: Tenant '{tenant_id}': "
+                        f"first_email={first_email} user_count={user_count} "
                        f"docs={total_documents} chunks={total_chunks}"
                    )

                tenants_to_total_chunks[tenant_id] = TenantMetadata(
-                    num_docs=total_documents, num_chunks=total_chunks
+                    first_email=first_email,
+                    user_count=user_count,
+                    num_docs=total_documents,
+                    num_chunks=total_chunks,
                )
            except Exception as e:
                logger.error(f"Error processing tenant '{tenant_id}': {e}")
@ -91,14 +107,23 @@ class SQLAlchemyDebugging:
            reverse=True,
        )

-        csv_filename = "tenants_by_num_docs.csv"
-        with open(csv_filename, "w") as csvfile:
+        with open(filename, "w") as csvfile:
            writer = csv.writer(csvfile)
-            writer.writerow(["tenant_id", "num_docs", "num_chunks"])  # Write header
+            writer.writerow(
+                ["tenant_id", "first_user_email", "num_user", "num_docs", "num_chunks"]
+            )  # Write header
            # Write data rows (using the sorted list)
            for tenant_id, metadata in sorted_tenants:
-                writer.writerow([tenant_id, metadata.num_docs, metadata.num_chunks])
-            logger.info(f"Successfully wrote statistics to {csv_filename}")
+                writer.writerow(
+                    [
+                        tenant_id,
+                        metadata.first_email,
+                        metadata.user_count,
+                        metadata.num_docs,
+                        metadata.num_chunks,
+                    ]
+                )
+            logger.info(f"Successfully wrote statistics to {filename}")

        # output top k by chunks
        top_k_tenants = heapq.nlargest(
@ -118,6 +143,14 @@ def main() -> None:

    parser.add_argument("--report", help="Generate the given report")

+    parser.add_argument(
+        "--filename",
+        type=str,
+        default="tenants_by_num_docs.csv",
+        help="Generate the given report",
+        required=False,
+    )
+
    args = parser.parse_args()

    logger.info(f"{args}")
@ -140,7 +173,7 @@ def main() -> None:
    debugger = SQLAlchemyDebugging()

    if args.report == "top-chunks":
-        debugger.top_chunks(10)
+        debugger.top_chunks(args.filename, 10)
    else:
        logger.info("No action.")