Feature/tenant reporting 2 (#4750)

* add more info

* fix headers

* add filename as param (merge)

* db manager entry in launch template

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer
2025-05-27 16:24:47 -07:00
committed by GitHub
parent baaf31513c
commit 16a14bac89
2 changed files with 64 additions and 8 deletions

View File

@ -428,6 +428,29 @@
"--filename",
"generated/openapi.json",
]
},
{
// Script to debug multi-tenant DB issues: runs the "top-chunks" report and writes it to the CSV passed via --filename
"name": "Onyx DB Manager (Top Chunks)",
"type": "debugpy",
"request": "launch",
"program": "scripts/debugging/onyx_db.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.env",
"env": {
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"--password",
"your_password_here",
"--port",
"5433",
"--report",
"top-chunks",
"--filename",
"generated/tenants_by_num_docs.csv"
]
},
{
"name": "Debug React Web App in Chrome",

View File

@ -21,6 +21,7 @@ if True: # noqa: E402
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine import SqlEngine
from onyx.db.models import Document
from onyx.db.models import User
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
@ -30,6 +31,8 @@ if True: # noqa: E402
# Per-tenant statistics collected by SQLAlchemyDebugging.top_chunks; one instance
# is built per tenant and later written out as one CSV row.
class TenantMetadata(BaseModel):
# Email of the row returned by an unordered query(User).first() for the tenant;
# None when the tenant has no User rows.
first_email: str | None
# Number of User rows in the tenant (query(User).count()).
user_count: int
# Number of Document rows in the tenant (query(Document).count()).
num_docs: int
# Sum of Document.chunk_count over the tenant's documents; 0 when the sum is NULL.
num_chunks: int
@ -39,7 +42,7 @@ class SQLAlchemyDebugging:
def __init__(self) -> None:
pass
def top_chunks(self, k: int = 10) -> None:
def top_chunks(self, filename: str, k: int = 10) -> None:
tenants_to_total_chunks: dict[str, TenantMetadata] = {}
logger.info("Fetching all tenant id's.")
@ -56,6 +59,14 @@ class SQLAlchemyDebugging:
try:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
first_email = None
first_user = db_session.query(User).first()
if first_user:
first_email = first_user.email
user_count = db_session.query(User).count()
# Calculate the total number of document rows for the current tenant
total_documents = db_session.query(Document).count()
# marginally useful to skip some tenants ... maybe we can improve on this
@ -69,15 +80,20 @@ class SQLAlchemyDebugging:
total_chunks = db_session.query(
func.sum(Document.chunk_count)
).scalar()
total_chunks = total_chunks or 0
logger.info(
f"{num_processed} of {num_tenant_ids}: Tenant '{tenant_id}': "
f"first_email={first_email} user_count={user_count} "
f"docs={total_documents} chunks={total_chunks}"
)
tenants_to_total_chunks[tenant_id] = TenantMetadata(
num_docs=total_documents, num_chunks=total_chunks
first_email=first_email,
user_count=user_count,
num_docs=total_documents,
num_chunks=total_chunks,
)
except Exception as e:
logger.error(f"Error processing tenant '{tenant_id}': {e}")
@ -91,14 +107,23 @@ class SQLAlchemyDebugging:
reverse=True,
)
csv_filename = "tenants_by_num_docs.csv"
with open(csv_filename, "w") as csvfile:
with open(filename, "w") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["tenant_id", "num_docs", "num_chunks"]) # Write header
writer.writerow(
["tenant_id", "first_user_email", "num_user", "num_docs", "num_chunks"]
) # Write header
# Write data rows (using the sorted list)
for tenant_id, metadata in sorted_tenants:
writer.writerow([tenant_id, metadata.num_docs, metadata.num_chunks])
logger.info(f"Successfully wrote statistics to {csv_filename}")
writer.writerow(
[
tenant_id,
metadata.first_email,
metadata.user_count,
metadata.num_docs,
metadata.num_chunks,
]
)
logger.info(f"Successfully wrote statistics to {filename}")
# output top k by chunks
top_k_tenants = heapq.nlargest(
@ -118,6 +143,14 @@ def main() -> None:
parser.add_argument("--report", help="Generate the given report")
parser.add_argument(
"--filename",
type=str,
default="tenants_by_num_docs.csv",
help="Generate the given report",
required=False,
)
args = parser.parse_args()
logger.info(f"{args}")
@ -140,7 +173,7 @@ def main() -> None:
debugger = SQLAlchemyDebugging()
if args.report == "top-chunks":
debugger.top_chunks(10)
debugger.top_chunks(args.filename, 10)
else:
logger.info("No action.")