mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-07 21:20:39 +02:00
Feature/tenant reporting 2 (#4750)
* add more info * fix headers * add filename as param (merge) * db manager entry in launch template --------- Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
23
.vscode/launch.template.jsonc
vendored
23
.vscode/launch.template.jsonc
vendored
@ -428,6 +428,29 @@
|
||||
"--filename",
|
||||
"generated/openapi.json",
|
||||
]
|
||||
},
|
||||
{
|
||||
// script to debug multi tenant db issues
|
||||
"name": "Onyx DB Manager (Top Chunks)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/debugging/onyx_db.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"env": {
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--password",
|
||||
"your_password_here",
|
||||
"--port",
|
||||
"5433",
|
||||
"--report",
|
||||
"top-chunks",
|
||||
"--filename",
|
||||
"generated/tenants_by_num_docs.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Debug React Web App in Chrome",
|
||||
|
@ -21,6 +21,7 @@ if True: # noqa: E402
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.engine import SqlEngine
|
||||
from onyx.db.models import Document
|
||||
from onyx.db.models import User
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
|
||||
@ -30,6 +31,8 @@ if True: # noqa: E402
|
||||
|
||||
|
||||
class TenantMetadata(BaseModel):
|
||||
first_email: str | None
|
||||
user_count: int
|
||||
num_docs: int
|
||||
num_chunks: int
|
||||
|
||||
@ -39,7 +42,7 @@ class SQLAlchemyDebugging:
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def top_chunks(self, k: int = 10) -> None:
|
||||
def top_chunks(self, filename: str, k: int = 10) -> None:
|
||||
tenants_to_total_chunks: dict[str, TenantMetadata] = {}
|
||||
|
||||
logger.info("Fetching all tenant id's.")
|
||||
@ -56,6 +59,14 @@ class SQLAlchemyDebugging:
|
||||
|
||||
try:
|
||||
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
|
||||
first_email = None
|
||||
|
||||
first_user = db_session.query(User).first()
|
||||
if first_user:
|
||||
first_email = first_user.email
|
||||
|
||||
user_count = db_session.query(User).count()
|
||||
|
||||
# Calculate the total number of document rows for the current tenant
|
||||
total_documents = db_session.query(Document).count()
|
||||
# marginally useful to skip some tenants ... maybe we can improve on this
|
||||
@ -69,15 +80,20 @@ class SQLAlchemyDebugging:
|
||||
total_chunks = db_session.query(
|
||||
func.sum(Document.chunk_count)
|
||||
).scalar()
|
||||
|
||||
total_chunks = total_chunks or 0
|
||||
|
||||
logger.info(
|
||||
f"{num_processed} of {num_tenant_ids}: Tenant '{tenant_id}': "
|
||||
f"first_email={first_email} user_count={user_count} "
|
||||
f"docs={total_documents} chunks={total_chunks}"
|
||||
)
|
||||
|
||||
tenants_to_total_chunks[tenant_id] = TenantMetadata(
|
||||
num_docs=total_documents, num_chunks=total_chunks
|
||||
first_email=first_email,
|
||||
user_count=user_count,
|
||||
num_docs=total_documents,
|
||||
num_chunks=total_chunks,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing tenant '{tenant_id}': {e}")
|
||||
@ -91,14 +107,23 @@ class SQLAlchemyDebugging:
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
csv_filename = "tenants_by_num_docs.csv"
|
||||
with open(csv_filename, "w") as csvfile:
|
||||
with open(filename, "w") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(["tenant_id", "num_docs", "num_chunks"]) # Write header
|
||||
writer.writerow(
|
||||
["tenant_id", "first_user_email", "num_user", "num_docs", "num_chunks"]
|
||||
) # Write header
|
||||
# Write data rows (using the sorted list)
|
||||
for tenant_id, metadata in sorted_tenants:
|
||||
writer.writerow([tenant_id, metadata.num_docs, metadata.num_chunks])
|
||||
logger.info(f"Successfully wrote statistics to {csv_filename}")
|
||||
writer.writerow(
|
||||
[
|
||||
tenant_id,
|
||||
metadata.first_email,
|
||||
metadata.user_count,
|
||||
metadata.num_docs,
|
||||
metadata.num_chunks,
|
||||
]
|
||||
)
|
||||
logger.info(f"Successfully wrote statistics to {filename}")
|
||||
|
||||
# output top k by chunks
|
||||
top_k_tenants = heapq.nlargest(
|
||||
@ -118,6 +143,14 @@ def main() -> None:
|
||||
|
||||
parser.add_argument("--report", help="Generate the given report")
|
||||
|
||||
parser.add_argument(
|
||||
"--filename",
|
||||
type=str,
|
||||
default="tenants_by_num_docs.csv",
|
||||
help="Generate the given report",
|
||||
required=False,
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.info(f"{args}")
|
||||
@ -140,7 +173,7 @@ def main() -> None:
|
||||
debugger = SQLAlchemyDebugging()
|
||||
|
||||
if args.report == "top-chunks":
|
||||
debugger.top_chunks(10)
|
||||
debugger.top_chunks(args.filename, 10)
|
||||
else:
|
||||
logger.info("No action.")
|
||||
|
||||
|
Reference in New Issue
Block a user