add user files (#4152)
.vscode/launch.template.jsonc
@@ -6,396 +6,419 @@
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    // The template is reformatted (trailing commas removed) and gains entries for the new
    // "Celery user files indexing" worker. The "Run All Onyx Services" and "Celery (all)"
    // compounds now include it:
    {
      "name": "Run All Onyx Services",
      "configurations": [
        "Web Server",
        "Model Server",
        "API Server",
        "Slack Bot",
        "Celery primary",
        "Celery light",
        "Celery heavy",
        "Celery indexing",
        "Celery user files indexing",
        "Celery beat",
        "Celery monitoring"
      ],
      "presentation": {
        "group": "1"
      }
    },
    {
      "name": "Celery (all)",
      "configurations": [
        "Celery primary",
        "Celery light",
        "Celery heavy",
        "Celery indexing",
        "Celery user files indexing",
        "Celery beat",
        "Celery monitoring"
      ],
      "presentation": {
        "group": "1"
      }
    }
    // The individual configurations (Web Server, Model Server, API Server, Slack Bot, the
    // existing Celery workers, Pytest, and the task entries) carry over unchanged apart from
    // the formatting cleanup, except that "Celery light" drops checkpoint_cleanup from its
    // -Q queue list. The new worker configuration added alongside them:
    {
      "name": "Celery user files indexing",
      "type": "debugpy",
      "request": "launch",
      "module": "celery",
      "cwd": "${workspaceFolder}/backend",
      "envFile": "${workspaceFolder}/.vscode/.env",
      "env": {
        "LOG_LEVEL": "DEBUG",
        "PYTHONUNBUFFERED": "1",
        "PYTHONPATH": "."
      },
      "args": [
        "-A",
        "onyx.background.celery.versioned_apps.indexing",
        "worker",
        "--pool=threads",
        "--concurrency=1",
        "--prefetch-multiplier=1",
        "--loglevel=INFO",
        "--hostname=user_files_indexing@%n",
        "-Q",
        "user_files_indexing"
      ],
      "presentation": {
        "group": "2"
      },
      "consoleTitle": "Celery user files indexing Console"
    }
@@ -1,7 +1,7 @@
 """enable contextual retrieval

 Revision ID: 8e1ac4f39a9f
-Revises: 3781a5eb12cb
+Revises: 9aadf32dfeb4
 Create Date: 2024-12-20 13:29:09.918661

 """
@@ -11,7 +11,7 @@ import sqlalchemy as sa

 # revision identifiers, used by Alembic.
 revision = "8e1ac4f39a9f"
-down_revision = "3781a5eb12cb"
+down_revision = "9aadf32dfeb4"
 branch_labels = None
 depends_on = None
backend/alembic/versions/9aadf32dfeb4_add_user_files.py (new file)
@@ -0,0 +1,113 @@
"""add user files

Revision ID: 9aadf32dfeb4
Revises: 3781a5eb12cb
Create Date: 2025-01-26 16:08:21.551022

"""
from alembic import op
import sqlalchemy as sa
import datetime


# revision identifiers, used by Alembic.
revision = "9aadf32dfeb4"
down_revision = "3781a5eb12cb"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Create user_folder table without parent_id
    op.create_table(
        "user_folder",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
        sa.Column("name", sa.String(length=255), nullable=True),
        sa.Column("description", sa.String(length=255), nullable=True),
        sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
        sa.Column(
            "created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
        ),
    )

    # Create user_file table with folder_id instead of parent_folder_id
    op.create_table(
        "user_file",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
        sa.Column(
            "folder_id",
            sa.Integer(),
            sa.ForeignKey("user_folder.id"),
            nullable=True,
        ),
        sa.Column("link_url", sa.String(), nullable=True),
        sa.Column("token_count", sa.Integer(), nullable=True),
        sa.Column("file_type", sa.String(), nullable=True),
        sa.Column("file_id", sa.String(length=255), nullable=False),
        sa.Column("document_id", sa.String(length=255), nullable=False),
        sa.Column("name", sa.String(length=255), nullable=False),
        sa.Column(
            "created_at",
            sa.DateTime(),
            default=datetime.datetime.utcnow,
        ),
        sa.Column(
            "cc_pair_id",
            sa.Integer(),
            sa.ForeignKey("connector_credential_pair.id"),
            nullable=True,
            unique=True,
        ),
    )

    # Create persona__user_file table
    op.create_table(
        "persona__user_file",
        sa.Column(
            "persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
        ),
        sa.Column(
            "user_file_id",
            sa.Integer(),
            sa.ForeignKey("user_file.id"),
            primary_key=True,
        ),
    )

    # Create persona__user_folder table
    op.create_table(
        "persona__user_folder",
        sa.Column(
            "persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
        ),
        sa.Column(
            "user_folder_id",
            sa.Integer(),
            sa.ForeignKey("user_folder.id"),
            primary_key=True,
        ),
    )

    op.add_column(
        "connector_credential_pair",
        sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
    )

    # Update existing records to have is_user_file=False instead of NULL
    op.execute(
        "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
    )


def downgrade() -> None:
    # Drop the persona__user_folder table
    op.drop_table("persona__user_folder")
    # Drop the persona__user_file table
    op.drop_table("persona__user_file")
    # Drop the user_file table
    op.drop_table("user_file")
    # Drop the user_folder table
    op.drop_table("user_folder")
    op.drop_column("connector_credential_pair", "is_user_file")
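For local verification, a minimal sketch of applying and rolling back this revision through Alembic's Python command API. The alembic.ini location is an assumption; adjust it to your checkout.

# Hypothetical helper for exercising the "add user files" migration locally.
# Assumes the standard Alembic setup with alembic.ini in backend/.
from alembic import command
from alembic.config import Config


def apply_user_files_migration(ini_path: str = "backend/alembic.ini") -> None:
    cfg = Config(ini_path)
    # Creates user_folder, user_file, persona__user_file, persona__user_folder
    # and adds connector_credential_pair.is_user_file.
    command.upgrade(cfg, "9aadf32dfeb4")


def rollback_user_files_migration(ini_path: str = "backend/alembic.ini") -> None:
    cfg = Config(ini_path)
    # Steps back one revision, dropping the tables created above.
    command.downgrade(cfg, "-1")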
BIN backend/hello-vmlinux.bin (new binary file)
@@ -57,8 +57,9 @@ def _get_access_for_documents(
         db_session=db_session,
         document_ids=document_ids,
     )
-    doc_access = {
-        document_id: DocumentAccess.build(
+    doc_access = {}
+    for document_id, user_emails, is_public in document_access_info:
+        doc_access[document_id] = DocumentAccess.build(
             user_emails=[email for email in user_emails if email],
             # MIT version will wipe all groups and external groups on update
             user_groups=[],
@@ -66,8 +67,6 @@ def _get_access_for_documents(
             external_user_emails=[],
             external_user_group_ids=[],
         )
-        for document_id, user_emails, is_public in document_access_info
-    }

    # Sometimes the document has not been indexed by the indexing job yet, in those cases
    # the document does not exist and so we use least permissive. Specifically the EE version
@@ -321,8 +321,10 @@ def dispatch_separated(
     sep: str = DISPATCH_SEP_CHAR,
 ) -> list[BaseMessage_Content]:
     num = 1
+    accumulated_tokens = ""
     streamed_tokens: list[BaseMessage_Content] = []
     for token in tokens:
+        accumulated_tokens += cast(str, token.content)
         content = cast(str, token.content)
         if sep in content:
             sub_question_parts = content.split(sep)
@@ -111,6 +111,7 @@ celery_app.autodiscover_tasks(
         "onyx.background.celery.tasks.vespa",
         "onyx.background.celery.tasks.connector_deletion",
         "onyx.background.celery.tasks.doc_permission_syncing",
+        "onyx.background.celery.tasks.user_file_folder_sync",
         "onyx.background.celery.tasks.indexing",
         "onyx.background.celery.tasks.tenant_provisioning",
     ]
@@ -174,6 +174,9 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
                     f"search_settings={attempt.search_settings_id}"
                 )
                 logger.warning(failure_reason)
+                logger.exception(
+                    f"Marking attempt {attempt.id} as canceled due to validation error 2"
+                )
                 mark_attempt_canceled(attempt.id, db_session, failure_reason)

@@ -285,5 +288,6 @@ celery_app.autodiscover_tasks(
         "onyx.background.celery.tasks.shared",
         "onyx.background.celery.tasks.vespa",
         "onyx.background.celery.tasks.llm_model_update",
+        "onyx.background.celery.tasks.user_file_folder_sync",
     ]
 )
@@ -64,6 +64,15 @@ beat_task_templates.extend(
             "expires": BEAT_EXPIRES_DEFAULT,
         },
     },
+    {
+        "name": "check-for-user-file-folder-sync",
+        "task": OnyxCeleryTask.CHECK_FOR_USER_FILE_FOLDER_SYNC,
+        "schedule": timedelta(seconds=30),
+        "options": {
+            "priority": OnyxCeleryPriority.MEDIUM,
+            "expires": BEAT_EXPIRES_DEFAULT,
+        },
+    },
     {
         "name": "check-for-pruning",
         "task": OnyxCeleryTask.CHECK_FOR_PRUNING,
@ -365,6 +365,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
Occasionally does some validation of existing state to clear up error conditions"""
|
||||
|
||||
time_start = time.monotonic()
|
||||
task_logger.warning("check_for_indexing - Starting")
|
||||
|
||||
tasks_created = 0
|
||||
locked = False
|
||||
@ -433,7 +434,9 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
lock_beat.reacquire()
|
||||
cc_pair_ids: list[int] = []
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
cc_pairs = fetch_connector_credential_pairs(db_session)
|
||||
cc_pairs = fetch_connector_credential_pairs(
|
||||
db_session, include_user_files=True
|
||||
)
|
||||
for cc_pair_entry in cc_pairs:
|
||||
cc_pair_ids.append(cc_pair_entry.id)
|
||||
|
||||
@ -452,12 +455,18 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
not search_settings_instance.status.is_current()
|
||||
and not search_settings_instance.background_reindex_enabled
|
||||
):
|
||||
task_logger.warning("SKIPPING DUE TO NON-LIVE SEARCH SETTINGS")
|
||||
|
||||
continue
|
||||
|
||||
redis_connector_index = redis_connector.new_index(
|
||||
search_settings_instance.id
|
||||
)
|
||||
if redis_connector_index.fenced:
|
||||
task_logger.info(
|
||||
f"check_for_indexing - Skipping fenced connector: "
|
||||
f"cc_pair={cc_pair_id} search_settings={search_settings_instance.id}"
|
||||
)
|
||||
continue
|
||||
|
||||
cc_pair = get_connector_credential_pair_from_id(
|
||||
@ -465,6 +474,9 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
cc_pair_id=cc_pair_id,
|
||||
)
|
||||
if not cc_pair:
|
||||
task_logger.warning(
|
||||
f"check_for_indexing - CC pair not found: cc_pair={cc_pair_id}"
|
||||
)
|
||||
continue
|
||||
|
||||
last_attempt = get_last_attempt_for_cc_pair(
|
||||
@ -478,7 +490,20 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
secondary_index_building=len(search_settings_list) > 1,
|
||||
db_session=db_session,
|
||||
):
|
||||
task_logger.info(
|
||||
f"check_for_indexing - Not indexing cc_pair_id: {cc_pair_id} "
|
||||
f"search_settings={search_settings_instance.id}, "
|
||||
f"last_attempt={last_attempt.id if last_attempt else None}, "
|
||||
f"secondary_index_building={len(search_settings_list) > 1}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
task_logger.info(
|
||||
f"check_for_indexing - Will index cc_pair_id: {cc_pair_id} "
|
||||
f"search_settings={search_settings_instance.id}, "
|
||||
f"last_attempt={last_attempt.id if last_attempt else None}, "
|
||||
f"secondary_index_building={len(search_settings_list) > 1}"
|
||||
)
|
||||
|
||||
reindex = False
|
||||
if search_settings_instance.status.is_current():
|
||||
@ -517,6 +542,12 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
f"search_settings={search_settings_instance.id}"
|
||||
)
|
||||
tasks_created += 1
|
||||
else:
|
||||
task_logger.info(
|
||||
f"Failed to create indexing task: "
|
||||
f"cc_pair={cc_pair.id} "
|
||||
f"search_settings={search_settings_instance.id}"
|
||||
)
|
||||
|
||||
lock_beat.reacquire()
|
||||
|
||||
@ -1149,6 +1180,9 @@ def connector_indexing_proxy_task(
|
||||
if result.status == IndexingWatchdogTerminalStatus.TERMINATED_BY_SIGNAL:
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
logger.exception(
|
||||
f"Marking attempt {index_attempt_id} as canceled due to termination signal"
|
||||
)
|
||||
mark_attempt_canceled(
|
||||
index_attempt_id,
|
||||
db_session,
|
||||
|
@ -371,6 +371,7 @@ def should_index(
|
||||
|
||||
# don't kick off indexing for `NOT_APPLICABLE` sources
|
||||
if connector.source == DocumentSource.NOT_APPLICABLE:
|
||||
print(f"Not indexing cc_pair={cc_pair.id}: NOT_APPLICABLE source")
|
||||
return False
|
||||
|
||||
# User can still manually create single indexing attempts via the UI for the
|
||||
@ -380,6 +381,9 @@ def should_index(
|
||||
search_settings_instance.status == IndexModelStatus.PRESENT
|
||||
and secondary_index_building
|
||||
):
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: DISABLE_INDEX_UPDATE_ON_SWAP is True and secondary index building"
|
||||
)
|
||||
return False
|
||||
|
||||
# When switching over models, always index at least once
|
||||
@ -388,19 +392,31 @@ def should_index(
|
||||
# No new index if the last index attempt succeeded
|
||||
# Once is enough. The model will never be able to swap otherwise.
|
||||
if last_index.status == IndexingStatus.SUCCESS:
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with successful last index attempt={last_index.id}"
|
||||
)
|
||||
return False
|
||||
|
||||
# No new index if the last index attempt is waiting to start
|
||||
if last_index.status == IndexingStatus.NOT_STARTED:
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with NOT_STARTED last index attempt={last_index.id}"
|
||||
)
|
||||
return False
|
||||
|
||||
# No new index if the last index attempt is running
|
||||
if last_index.status == IndexingStatus.IN_PROGRESS:
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with IN_PROGRESS last index attempt={last_index.id}"
|
||||
)
|
||||
return False
|
||||
else:
|
||||
if (
|
||||
connector.id == 0 or connector.source == DocumentSource.INGESTION_API
|
||||
): # Ingestion API
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with Ingestion API source"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
@ -412,6 +428,9 @@ def should_index(
|
||||
or connector.id == 0
|
||||
or connector.source == DocumentSource.INGESTION_API
|
||||
):
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: Connector is paused or is Ingestion API"
|
||||
)
|
||||
return False
|
||||
|
||||
if search_settings_instance.status.is_current():
|
||||
@ -424,11 +443,16 @@ def should_index(
|
||||
return True
|
||||
|
||||
if connector.refresh_freq is None:
|
||||
print(f"Not indexing cc_pair={cc_pair.id}: refresh_freq is None")
|
||||
return False
|
||||
|
||||
current_db_time = get_db_current_time(db_session)
|
||||
time_since_index = current_db_time - last_index.time_updated
|
||||
if time_since_index.total_seconds() < connector.refresh_freq:
|
||||
print(
|
||||
f"Not indexing cc_pair={cc_pair.id}: Last index attempt={last_index.id} "
|
||||
f"too recent ({time_since_index.total_seconds()}s < {connector.refresh_freq}s)"
|
||||
)
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -508,6 +532,13 @@ def try_creating_indexing_task(

     custom_task_id = redis_connector_index.generate_generator_task_id()

+    # Determine which queue to use based on whether this is a user file
+    queue = (
+        OnyxCeleryQueues.USER_FILES_INDEXING
+        if cc_pair.is_user_file
+        else OnyxCeleryQueues.CONNECTOR_INDEXING
+    )
+
     # when the task is sent, we have yet to finish setting up the fence
     # therefore, the task must contain code that blocks until the fence is ready
     result = celery_app.send_task(
@@ -518,7 +549,7 @@ def try_creating_indexing_task(
             search_settings_id=search_settings.id,
             tenant_id=tenant_id,
         ),
-        queue=OnyxCeleryQueues.CONNECTOR_INDEXING,
+        queue=queue,
         task_id=custom_task_id,
         priority=OnyxCeleryPriority.MEDIUM,
     )
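For local debugging outside VS Code, a rough sketch of a worker dedicated to the new user files queue, mirroring the "Celery user files indexing" launch configuration. The `app` attribute name is an assumption inferred from the -A target used above.

# Hypothetical local runner for the user files indexing queue.
# Assumes onyx.background.celery.versioned_apps.indexing exposes its Celery app as `app`.
from onyx.background.celery.versioned_apps.indexing import app as indexing_app

if __name__ == "__main__":
    indexing_app.worker_main(
        [
            "worker",
            "--pool=threads",
            "--concurrency=1",
            "--prefetch-multiplier=1",
            "--loglevel=INFO",
            "--hostname=user_files_indexing@%n",
            "-Q",
            "user_files_indexing",
        ]
    )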
@@ -6,6 +6,7 @@ from tenacity import wait_random_exponential

 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import VespaDocumentFields
+from onyx.document_index.interfaces import VespaDocumentUserFields


 class RetryDocumentIndex:
@@ -52,11 +53,13 @@ class RetryDocumentIndex:
         *,
         tenant_id: str,
         chunk_count: int | None,
-        fields: VespaDocumentFields,
+        fields: VespaDocumentFields | None,
+        user_fields: VespaDocumentUserFields | None,
     ) -> int:
         return self.index.update_single(
             doc_id,
             tenant_id=tenant_id,
             chunk_count=chunk_count,
             fields=fields,
+            user_fields=user_fields,
         )
@@ -164,6 +164,7 @@ def document_by_cc_pair_cleanup_task(
                 tenant_id=tenant_id,
                 chunk_count=doc.chunk_count,
                 fields=fields,
+                user_fields=None,
             )

             # there are still other cc_pair references to the doc, so just resync to Vespa
@ -0,0 +1,266 @@
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from redis.lock import Lock as RedisLock
|
||||
from sqlalchemy.orm import Session
|
||||
from tenacity import RetryError
|
||||
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
|
||||
from onyx.background.celery.tasks.shared.tasks import LIGHT_SOFT_TIME_LIMIT
|
||||
from onyx.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT
|
||||
from onyx.background.celery.tasks.shared.tasks import OnyxCeleryTaskCompletionStatus
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.connector_credential_pair import (
|
||||
get_connector_credential_pairs_with_user_files,
|
||||
)
|
||||
from onyx.db.document import get_document
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.models import Document
|
||||
from onyx.db.models import DocumentByConnectorCredentialPair
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.user_documents import fetch_user_files_for_documents
|
||||
from onyx.db.user_documents import fetch_user_folders_for_documents
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentUserFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CHECK_FOR_USER_FILE_FOLDER_SYNC,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
trail=False,
|
||||
bind=True,
|
||||
)
|
||||
def check_for_user_file_folder_sync(self: Task, *, tenant_id: str) -> bool | None:
|
||||
"""Runs periodically to check for documents that need user file folder metadata updates.
|
||||
This task fetches all connector credential pairs with user files, gets the documents
|
||||
associated with them, and updates the user file and folder metadata in Vespa.
|
||||
"""
|
||||
|
||||
time_start = time.monotonic()
|
||||
|
||||
r = get_redis_client()
|
||||
|
||||
lock_beat: RedisLock = r.lock(
|
||||
OnyxRedisLocks.CHECK_USER_FILE_FOLDER_SYNC_BEAT_LOCK,
|
||||
timeout=CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT,
|
||||
)
|
||||
|
||||
# these tasks should never overlap
|
||||
if not lock_beat.acquire(blocking=False):
|
||||
return None
|
||||
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# Get all connector credential pairs that have user files
|
||||
cc_pairs = get_connector_credential_pairs_with_user_files(db_session)
|
||||
|
||||
if not cc_pairs:
|
||||
task_logger.info("No connector credential pairs with user files found")
|
||||
return True
|
||||
|
||||
# Get all documents associated with these cc_pairs
|
||||
document_ids = get_documents_for_cc_pairs(cc_pairs, db_session)
|
||||
|
||||
if not document_ids:
|
||||
task_logger.info(
|
||||
"No documents found for connector credential pairs with user files"
|
||||
)
|
||||
return True
|
||||
|
||||
# Fetch current user file and folder IDs for these documents
|
||||
doc_id_to_user_file_id = fetch_user_files_for_documents(
|
||||
document_ids=document_ids, db_session=db_session
|
||||
)
|
||||
doc_id_to_user_folder_id = fetch_user_folders_for_documents(
|
||||
document_ids=document_ids, db_session=db_session
|
||||
)
|
||||
|
||||
# Update Vespa metadata for each document
|
||||
for doc_id in document_ids:
|
||||
user_file_id = doc_id_to_user_file_id.get(doc_id)
|
||||
user_folder_id = doc_id_to_user_folder_id.get(doc_id)
|
||||
|
||||
if user_file_id is not None or user_folder_id is not None:
|
||||
# Schedule a task to update the document metadata
|
||||
update_user_file_folder_metadata.apply_async(
|
||||
args=(doc_id,), # Use tuple instead of list for args
|
||||
kwargs={
|
||||
"tenant_id": tenant_id,
|
||||
"user_file_id": user_file_id,
|
||||
"user_folder_id": user_folder_id,
|
||||
},
|
||||
queue="vespa_metadata_sync",
|
||||
)
|
||||
|
||||
task_logger.info(
|
||||
f"Scheduled metadata updates for {len(document_ids)} documents. "
|
||||
f"Elapsed time: {time.monotonic() - time_start:.2f}s"
|
||||
)
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
task_logger.exception(f"Error in check_for_user_file_folder_sync: {e}")
|
||||
return False
|
||||
finally:
|
||||
lock_beat.release()
|
||||
|
||||
|
||||
def get_documents_for_cc_pairs(
|
||||
cc_pairs: List[ConnectorCredentialPair], db_session: Session
|
||||
) -> List[str]:
|
||||
"""Get all document IDs associated with the given connector credential pairs."""
|
||||
if not cc_pairs:
|
||||
return []
|
||||
|
||||
cc_pair_ids = [cc_pair.id for cc_pair in cc_pairs]
|
||||
|
||||
# Query to get document IDs from DocumentByConnectorCredentialPair
|
||||
# Note: DocumentByConnectorCredentialPair uses connector_id and credential_id, not cc_pair_id
|
||||
doc_cc_pairs = (
|
||||
db_session.query(Document.id)
|
||||
.join(
|
||||
DocumentByConnectorCredentialPair,
|
||||
Document.id == DocumentByConnectorCredentialPair.id,
|
||||
)
|
||||
.filter(
|
||||
db_session.query(ConnectorCredentialPair)
|
||||
.filter(
|
||||
ConnectorCredentialPair.id.in_(cc_pair_ids),
|
||||
ConnectorCredentialPair.connector_id
|
||||
== DocumentByConnectorCredentialPair.connector_id,
|
||||
ConnectorCredentialPair.credential_id
|
||||
== DocumentByConnectorCredentialPair.credential_id,
|
||||
)
|
||||
.exists()
|
||||
)
|
||||
.all()
|
||||
)
|
||||
|
||||
return [doc_id for (doc_id,) in doc_cc_pairs]
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.UPDATE_USER_FILE_FOLDER_METADATA,
|
||||
bind=True,
|
||||
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
|
||||
time_limit=LIGHT_TIME_LIMIT,
|
||||
max_retries=3,
|
||||
)
|
||||
def update_user_file_folder_metadata(
|
||||
self: Task,
|
||||
document_id: str,
|
||||
*,
|
||||
tenant_id: str,
|
||||
user_file_id: int | None,
|
||||
user_folder_id: int | None,
|
||||
) -> bool:
|
||||
"""Updates the user file and folder metadata for a document in Vespa."""
|
||||
start = time.monotonic()
|
||||
completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED
|
||||
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
doc_index = get_default_document_index(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
doc = get_document(document_id, db_session)
|
||||
if not doc:
|
||||
elapsed = time.monotonic() - start
|
||||
task_logger.info(
|
||||
f"doc={document_id} "
|
||||
f"action=no_operation "
|
||||
f"elapsed={elapsed:.2f}"
|
||||
)
|
||||
completion_status = OnyxCeleryTaskCompletionStatus.SKIPPED
|
||||
return False
|
||||
|
||||
# Create user fields object with file and folder IDs
|
||||
user_fields = VespaDocumentUserFields(
|
||||
user_file_id=str(user_file_id) if user_file_id is not None else None,
|
||||
user_folder_id=str(user_folder_id)
|
||||
if user_folder_id is not None
|
||||
else None,
|
||||
)
|
||||
|
||||
# Update Vespa. OK if doc doesn't exist. Raises exception otherwise.
|
||||
chunks_affected = retry_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=None, # We're only updating user fields
|
||||
user_fields=user_fields,
|
||||
)
|
||||
|
||||
elapsed = time.monotonic() - start
|
||||
task_logger.info(
|
||||
f"doc={document_id} "
|
||||
f"action=user_file_folder_sync "
|
||||
f"user_file_id={user_file_id} "
|
||||
f"user_folder_id={user_folder_id} "
|
||||
f"chunks={chunks_affected} "
|
||||
f"elapsed={elapsed:.2f}"
|
||||
)
|
||||
completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
|
||||
return True
|
||||
|
||||
except SoftTimeLimitExceeded:
|
||||
task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}")
|
||||
completion_status = OnyxCeleryTaskCompletionStatus.SOFT_TIME_LIMIT
|
||||
except Exception as ex:
|
||||
e: Exception | None = None
|
||||
while True:
|
||||
if isinstance(ex, RetryError):
|
||||
task_logger.warning(
|
||||
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
|
||||
)
|
||||
|
||||
# only set the inner exception if it is of type Exception
|
||||
e_temp = ex.last_attempt.exception()
|
||||
if isinstance(e_temp, Exception):
|
||||
e = e_temp
|
||||
else:
|
||||
e = ex
|
||||
|
||||
task_logger.exception(
|
||||
f"update_user_file_folder_metadata exceptioned: doc={document_id}"
|
||||
)
|
||||
|
||||
completion_status = OnyxCeleryTaskCompletionStatus.RETRYABLE_EXCEPTION
|
||||
if (
|
||||
self.max_retries is not None
|
||||
and self.request.retries >= self.max_retries
|
||||
):
|
||||
completion_status = (
|
||||
OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
|
||||
)
|
||||
|
||||
# Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64
|
||||
countdown = 2 ** (self.request.retries + 4)
|
||||
self.retry(exc=e, countdown=countdown) # this will raise a celery exception
|
||||
break # we won't hit this, but it looks weird not to have it
|
||||
finally:
|
||||
task_logger.info(
|
||||
f"update_user_file_folder_metadata completed: status={completion_status.value} doc={document_id}"
|
||||
)
|
||||
|
||||
return False
|
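A small sketch for triggering the periodic user file folder sync check by hand while debugging, rather than waiting for the 30-second beat schedule. The app import path and the single-tenant id value are assumptions.

# Hypothetical manual trigger for check_for_user_file_folder_sync.
# Assumes the primary versioned app module exposes its Celery app as `app` and a
# single-tenant deployment where the tenant id is the default "public" schema.
from onyx.background.celery.versioned_apps.primary import app as primary_app
from onyx.configs.constants import OnyxCeleryTask

primary_app.send_task(
    OnyxCeleryTask.CHECK_FOR_USER_FILE_FOLDER_SYNC,
    kwargs={"tenant_id": "public"},
)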
@@ -573,6 +573,7 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
                 tenant_id=tenant_id,
                 chunk_count=doc.chunk_count,
                 fields=fields,
+                user_fields=None,
             )

             # update db last. Worst case = we crash right before this and
@@ -274,7 +274,6 @@ def _run_indexing(
            "Search settings must be set for indexing. This should not be possible."
        )

-    # search_settings = index_attempt_start.search_settings
     db_connector = index_attempt_start.connector_credential_pair.connector
     db_credential = index_attempt_start.connector_credential_pair.credential
     ctx = RunIndexingContext(
@@ -638,6 +637,9 @@ def _run_indexing(
            # and mark the CCPair as invalid. This prevents the connector from being
            # used in the future until the credentials are updated.
            with get_session_with_current_tenant() as db_session_temp:
+                logger.exception(
+                    f"Marking attempt {index_attempt_id} as canceled due to validation error."
+                )
                mark_attempt_canceled(
                    index_attempt_id,
                    db_session_temp,
@@ -684,6 +686,9 @@ def _run_indexing(

        elif isinstance(e, ConnectorStopSignal):
            with get_session_with_current_tenant() as db_session_temp:
+                logger.exception(
+                    f"Marking attempt {index_attempt_id} as canceled due to stop signal."
+                )
                mark_attempt_canceled(
                    index_attempt_id,
                    db_session_temp,
@@ -746,6 +751,7 @@ def _run_indexing(
                    f"Connector succeeded: "
                    f"docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
                )
+
            else:
                mark_attempt_partially_succeeded(index_attempt_id, db_session_temp)
                logger.info(
@@ -127,6 +127,10 @@ class StreamStopInfo(SubQuestionIdentifier):
         return data


+class UserKnowledgeFilePacket(BaseModel):
+    user_files: list[FileDescriptor]
+
+
 class LLMRelevanceFilterResponse(BaseModel):
     llm_selected_doc_indices: list[int]

@ -36,6 +36,7 @@ from onyx.chat.models import StreamingError
|
||||
from onyx.chat.models import StreamStopInfo
|
||||
from onyx.chat.models import StreamStopReason
|
||||
from onyx.chat.models import SubQuestionKey
|
||||
from onyx.chat.models import UserKnowledgeFilePacket
|
||||
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
|
||||
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message
|
||||
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message
|
||||
@ -51,6 +52,7 @@ from onyx.context.search.enums import LLMEvaluationType
|
||||
from onyx.context.search.enums import OptionalSearchSetting
|
||||
from onyx.context.search.enums import QueryFlow
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.context.search.models import SearchRequest
|
||||
@ -64,6 +66,7 @@ from onyx.context.search.utils import relevant_sections_to_indices
|
||||
from onyx.db.chat import attach_files_to_chat_message
|
||||
from onyx.db.chat import create_db_search_doc
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import create_search_doc_from_user_file
|
||||
from onyx.db.chat import get_chat_message
|
||||
from onyx.db.chat import get_chat_session_by_id
|
||||
from onyx.db.chat import get_db_search_doc_by_id
|
||||
@ -80,12 +83,16 @@ from onyx.db.milestone import update_user_assistant_milestone
|
||||
from onyx.db.models import SearchDoc as DbSearchDoc
|
||||
from onyx.db.models import ToolCall
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.persona import get_persona_by_id
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.file_store.models import ChatFileType
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.file_store.models import InMemoryChatFile
|
||||
from onyx.file_store.utils import load_all_chat_files
|
||||
from onyx.file_store.utils import load_all_user_file_files
|
||||
from onyx.file_store.utils import load_all_user_files
|
||||
from onyx.file_store.utils import save_files
|
||||
from onyx.llm.exceptions import GenAIDisabledException
|
||||
from onyx.llm.factory import get_llms_for_persona
|
||||
@ -98,6 +105,7 @@ from onyx.server.query_and_chat.models import ChatMessageDetail
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.utils import get_json_line
|
||||
from onyx.tools.force import ForceUseTool
|
||||
from onyx.tools.models import SearchToolOverrideKwargs
|
||||
from onyx.tools.models import ToolResponse
|
||||
from onyx.tools.tool import Tool
|
||||
from onyx.tools.tool_constructor import construct_tools
|
||||
@ -175,11 +183,14 @@ def _handle_search_tool_response_summary(
|
||||
db_session: Session,
|
||||
selected_search_docs: list[DbSearchDoc] | None,
|
||||
dedupe_docs: bool = False,
|
||||
user_files: list[UserFile] | None = None,
|
||||
loaded_user_files: list[InMemoryChatFile] | None = None,
|
||||
) -> tuple[QADocsResponse, list[DbSearchDoc], list[int] | None]:
|
||||
response_sumary = cast(SearchResponseSummary, packet.response)
|
||||
|
||||
is_extended = isinstance(packet, ExtendedToolResponse)
|
||||
dropped_inds = None
|
||||
|
||||
if not selected_search_docs:
|
||||
top_docs = chunks_or_sections_to_search_docs(response_sumary.top_sections)
|
||||
|
||||
@ -193,9 +204,31 @@ def _handle_search_tool_response_summary(
|
||||
create_db_search_doc(server_search_doc=doc, db_session=db_session)
|
||||
for doc in deduped_docs
|
||||
]
|
||||
|
||||
else:
|
||||
reference_db_search_docs = selected_search_docs
|
||||
|
||||
doc_ids = {doc.id for doc in reference_db_search_docs}
|
||||
if user_files is not None:
|
||||
for user_file in user_files:
|
||||
if user_file.id not in doc_ids:
|
||||
associated_chat_file = None
|
||||
if loaded_user_files is not None:
|
||||
associated_chat_file = next(
|
||||
(
|
||||
file
|
||||
for file in loaded_user_files
|
||||
if file.file_id == str(user_file.file_id)
|
||||
),
|
||||
None,
|
||||
)
|
||||
# Use create_search_doc_from_user_file to properly add the document to the database
|
||||
if associated_chat_file is not None:
|
||||
db_doc = create_search_doc_from_user_file(
|
||||
user_file, associated_chat_file, db_session
|
||||
)
|
||||
reference_db_search_docs.append(db_doc)
|
||||
|
||||
response_docs = [
|
||||
translate_db_search_doc_to_server_search_doc(db_search_doc)
|
||||
for db_search_doc in reference_db_search_docs
|
||||
@ -253,7 +286,10 @@ def _handle_internet_search_tool_response_summary(
|
||||
|
||||
|
||||
def _get_force_search_settings(
|
||||
new_msg_req: CreateChatMessageRequest, tools: list[Tool]
|
||||
new_msg_req: CreateChatMessageRequest,
|
||||
tools: list[Tool],
|
||||
user_file_ids: list[int],
|
||||
user_folder_ids: list[int],
|
||||
) -> ForceUseTool:
|
||||
internet_search_available = any(
|
||||
isinstance(tool, InternetSearchTool) for tool in tools
|
||||
@ -261,8 +297,11 @@ def _get_force_search_settings(
|
||||
search_tool_available = any(isinstance(tool, SearchTool) for tool in tools)
|
||||
|
||||
if not internet_search_available and not search_tool_available:
|
||||
# Does not matter much which tool is set here as force is false and neither tool is available
|
||||
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
|
||||
if new_msg_req.force_user_file_search:
|
||||
return ForceUseTool(force_use=True, tool_name=SearchTool._NAME)
|
||||
else:
|
||||
# Does not matter much which tool is set here as force is false and neither tool is available
|
||||
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
|
||||
|
||||
tool_name = SearchTool._NAME if search_tool_available else InternetSearchTool._NAME
|
||||
# Currently, the internet search tool does not support query override
|
||||
@ -272,12 +311,25 @@ def _get_force_search_settings(
|
||||
else None
|
||||
)
|
||||
|
||||
# Create override_kwargs for the search tool if user_file_ids are provided
|
||||
override_kwargs = None
|
||||
if (user_file_ids or user_folder_ids) and tool_name == SearchTool._NAME:
|
||||
override_kwargs = SearchToolOverrideKwargs(
|
||||
force_no_rerank=False,
|
||||
alternate_db_session=None,
|
||||
retrieved_sections_callback=None,
|
||||
skip_query_analysis=False,
|
||||
user_file_ids=user_file_ids,
|
||||
user_folder_ids=user_folder_ids,
|
||||
)
|
||||
|
||||
if new_msg_req.file_descriptors:
|
||||
# If user has uploaded files they're using, don't run any of the search tools
|
||||
return ForceUseTool(force_use=False, tool_name=tool_name)
|
||||
|
||||
should_force_search = any(
|
||||
[
|
||||
new_msg_req.force_user_file_search,
|
||||
new_msg_req.retrieval_options
|
||||
and new_msg_req.retrieval_options.run_search
|
||||
== OptionalSearchSetting.ALWAYS,
|
||||
@ -290,9 +342,17 @@ def _get_force_search_settings(
|
||||
if should_force_search:
|
||||
# If we are using selected docs, just put something here so the Tool doesn't need to build its own args via an LLM call
|
||||
args = {"query": new_msg_req.message} if new_msg_req.search_doc_ids else args
|
||||
return ForceUseTool(force_use=True, tool_name=tool_name, args=args)
|
||||
|
||||
return ForceUseTool(force_use=False, tool_name=tool_name, args=args)
|
||||
return ForceUseTool(
|
||||
force_use=True,
|
||||
tool_name=tool_name,
|
||||
args=args,
|
||||
override_kwargs=override_kwargs,
|
||||
)
|
||||
|
||||
return ForceUseTool(
|
||||
force_use=False, tool_name=tool_name, args=args, override_kwargs=override_kwargs
|
||||
)
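For orientation, a minimal caller-side sketch of how the settings computed above might be consumed; the request object, the tools list, and the literal IDs are assumptions for illustration, not part of this diff.

# Hypothetical usage sketch (variables and IDs are illustrative).
force_use_tool = _get_force_search_settings(
    new_msg_req=new_msg_req,      # CreateChatMessageRequest for the current turn
    tools=tools,                  # list[Tool] built for the persona
    user_file_ids=[12, 34],       # files the user pinned to this chat
    user_folder_ids=[],
)
if force_use_tool.force_use and force_use_tool.override_kwargs is not None:
    # The search tool is restricted to the pinned user files/folders.
    assert force_use_tool.override_kwargs.user_file_ids == [12, 34]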
|
||||
|
||||
|
||||
ChatPacket = (
|
||||
@ -311,6 +371,7 @@ ChatPacket = (
|
||||
| AgenticMessageResponseIDInfo
|
||||
| StreamStopInfo
|
||||
| AgentSearchPacket
|
||||
| UserKnowledgeFilePacket
|
||||
)
|
||||
ChatPacketStream = Iterator[ChatPacket]
|
||||
|
||||
@ -356,6 +417,10 @@ def stream_chat_message_objects(
|
||||
llm: LLM
|
||||
|
||||
try:
|
||||
# Move these variables inside the try block
|
||||
file_id_to_user_file = {}
|
||||
ordered_user_files = None
|
||||
|
||||
user_id = user.id if user is not None else None
|
||||
|
||||
chat_session = get_chat_session_by_id(
|
||||
@ -535,6 +600,70 @@ def stream_chat_message_objects(
|
||||
)
|
||||
req_file_ids = [f["id"] for f in new_msg_req.file_descriptors]
|
||||
latest_query_files = [file for file in files if file.file_id in req_file_ids]
|
||||
user_file_ids = new_msg_req.user_file_ids or []
|
||||
user_folder_ids = new_msg_req.user_folder_ids or []
|
||||
|
||||
if persona.user_files:
|
||||
for file in persona.user_files:
|
||||
user_file_ids.append(file.id)
|
||||
if persona.user_folders:
|
||||
for folder in persona.user_folders:
|
||||
user_folder_ids.append(folder.id)
|
||||
|
||||
# Initialize flag for user file search
|
||||
use_search_for_user_files = False
|
||||
|
||||
user_files: list[InMemoryChatFile] | None = None
|
||||
search_for_ordering_only = False
|
||||
user_file_files: list[UserFile] | None = None
|
||||
if user_file_ids or user_folder_ids:
|
||||
# Load user files
|
||||
user_files = load_all_user_files(
|
||||
user_file_ids or [],
|
||||
user_folder_ids or [],
|
||||
db_session,
|
||||
)
|
||||
user_file_files = load_all_user_file_files(
|
||||
user_file_ids or [],
|
||||
user_folder_ids or [],
|
||||
db_session,
|
||||
)
|
||||
# Store mapping of file_id to file for later reordering
|
||||
if user_files:
|
||||
file_id_to_user_file = {file.file_id: file for file in user_files}
|
||||
|
||||
# Calculate token count for the files
|
||||
from onyx.db.user_documents import calculate_user_files_token_count
|
||||
from onyx.chat.prompt_builder.citations_prompt import (
|
||||
compute_max_document_tokens_for_persona,
|
||||
)
|
||||
|
||||
total_tokens = calculate_user_files_token_count(
|
||||
user_file_ids or [],
|
||||
user_folder_ids or [],
|
||||
db_session,
|
||||
)
|
||||
|
||||
# Calculate available tokens for documents based on prompt, user input, etc.
|
||||
available_tokens = compute_max_document_tokens_for_persona(
|
||||
db_session=db_session,
|
||||
persona=persona,
|
||||
actual_user_input=message_text, # Use the actual user message
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
f"Total file tokens: {total_tokens}, Available tokens: {available_tokens}"
|
||||
)
|
||||
|
||||
# ALWAYS use search for user files, but track if we need it for context or just ordering
|
||||
use_search_for_user_files = True
|
||||
# If files are small enough for context, we'll just use search for ordering
|
||||
search_for_ordering_only = total_tokens <= available_tokens
|
||||
|
||||
if search_for_ordering_only:
|
||||
# Add original user files to context since they fit
|
||||
if user_files:
|
||||
latest_query_files.extend(user_files)
|
||||
|
||||
if user_message:
|
||||
attach_files_to_chat_message(
|
||||
@ -677,8 +806,10 @@ def stream_chat_message_objects(
|
||||
prompt_config=prompt_config,
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
user_knowledge_present=bool(user_files or user_folder_ids),
|
||||
llm=llm,
|
||||
fast_llm=fast_llm,
|
||||
use_file_search=new_msg_req.force_user_file_search,
|
||||
search_tool_config=SearchToolConfig(
|
||||
answer_style_config=answer_style_config,
|
||||
document_pruning_config=document_pruning_config,
|
||||
@ -708,17 +839,138 @@ def stream_chat_message_objects(
|
||||
for tool_list in tool_dict.values():
|
||||
tools.extend(tool_list)
|
||||
|
||||
force_use_tool = _get_force_search_settings(
|
||||
new_msg_req, tools, user_file_ids, user_folder_ids
|
||||
)
|
||||
|
||||
# Set force_use if user files exceed token limit
|
||||
if use_search_for_user_files:
|
||||
try:
|
||||
# Check if search tool is available in the tools list
|
||||
search_tool_available = any(
|
||||
isinstance(tool, SearchTool) for tool in tools
|
||||
)
|
||||
|
||||
# If no search tool is available, add one
|
||||
if not search_tool_available:
|
||||
logger.info("No search tool available, creating one for user files")
|
||||
# Create a basic search tool config
|
||||
search_tool_config = SearchToolConfig(
|
||||
answer_style_config=answer_style_config,
|
||||
document_pruning_config=document_pruning_config,
|
||||
retrieval_options=retrieval_options or RetrievalDetails(),
|
||||
)
|
||||
|
||||
# Create and add the search tool
|
||||
search_tool = SearchTool(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
persona=persona,
|
||||
retrieval_options=search_tool_config.retrieval_options,
|
||||
prompt_config=prompt_config,
|
||||
llm=llm,
|
||||
fast_llm=fast_llm,
|
||||
pruning_config=search_tool_config.document_pruning_config,
|
||||
answer_style_config=search_tool_config.answer_style_config,
|
||||
evaluation_type=(
|
||||
LLMEvaluationType.BASIC
|
||||
if persona.llm_relevance_filter
|
||||
else LLMEvaluationType.SKIP
|
||||
),
|
||||
bypass_acl=bypass_acl,
|
||||
)
|
||||
|
||||
# Add the search tool to the tools list
|
||||
tools.append(search_tool)
|
||||
|
||||
logger.info(
|
||||
"Added search tool for user files that exceed token limit"
|
||||
)
|
||||
|
||||
# Now set force_use_tool.force_use to True
|
||||
force_use_tool.force_use = True
|
||||
force_use_tool.tool_name = SearchTool._NAME
|
||||
|
||||
# Set query argument if not already set
|
||||
if not force_use_tool.args:
|
||||
force_use_tool.args = {"query": final_msg.message}
|
||||
|
||||
# Pass the user file IDs to the search tool
|
||||
if user_file_ids or user_folder_ids:
|
||||
# Create a BaseFilters object with user_file_ids
|
||||
if not retrieval_options:
|
||||
retrieval_options = RetrievalDetails()
|
||||
if not retrieval_options.filters:
|
||||
retrieval_options.filters = BaseFilters()
|
||||
|
||||
# Set user file and folder IDs in the filters
|
||||
retrieval_options.filters.user_file_ids = user_file_ids
|
||||
retrieval_options.filters.user_folder_ids = user_folder_ids
|
||||
|
||||
# Create override kwargs for the search tool
|
||||
override_kwargs = SearchToolOverrideKwargs(
|
||||
force_no_rerank=search_for_ordering_only, # Skip reranking for ordering-only
|
||||
alternate_db_session=None,
|
||||
retrieved_sections_callback=None,
|
||||
skip_query_analysis=search_for_ordering_only, # Skip query analysis for ordering-only
|
||||
user_file_ids=user_file_ids,
|
||||
user_folder_ids=user_folder_ids,
|
||||
ordering_only=search_for_ordering_only, # Set ordering_only flag for fast path
|
||||
)
|
||||
|
||||
# Set the override kwargs in the force_use_tool
|
||||
force_use_tool.override_kwargs = override_kwargs
|
||||
|
||||
if search_for_ordering_only:
|
||||
logger.info(
|
||||
"Fast path: Configured search tool with optimized settings for ordering-only"
|
||||
)
|
||||
logger.info(
|
||||
"Fast path: Skipping reranking and query analysis for ordering-only mode"
|
||||
)
|
||||
logger.info(
|
||||
f"Using {len(user_file_ids or [])} files and {len(user_folder_ids or [])} folders"
|
||||
)
|
||||
else:
logger.info(
f"Configured search tool to use {len(user_file_ids or [])} files "
f"and {len(user_folder_ids or [])} folders"
)
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
f"Error configuring search tool for user files: {str(e)}"
|
||||
)
|
||||
use_search_for_user_files = False
|
||||
|
||||
# TODO: unify message history with single message history
|
||||
message_history = [
|
||||
PreviousMessage.from_chat_message(msg, files) for msg in history_msgs
|
||||
]
|
||||
if not use_search_for_user_files and user_files:
|
||||
yield UserKnowledgeFilePacket(
|
||||
user_files=[
|
||||
FileDescriptor(
|
||||
id=str(file.file_id), type=ChatFileType.USER_KNOWLEDGE
|
||||
)
|
||||
for file in user_files
|
||||
]
|
||||
)
|
||||
|
||||
if search_for_ordering_only:
|
||||
logger.info(
|
||||
"Performance: Forcing LLMEvaluationType.SKIP to prevent chunk evaluation for ordering-only search"
|
||||
)
|
||||
|
||||
search_request = SearchRequest(
|
||||
query=final_msg.message,
|
||||
evaluation_type=(
|
||||
LLMEvaluationType.BASIC
|
||||
if persona.llm_relevance_filter
|
||||
else LLMEvaluationType.SKIP
|
||||
LLMEvaluationType.SKIP
|
||||
if search_for_ordering_only
|
||||
else (
|
||||
LLMEvaluationType.BASIC
|
||||
if persona.llm_relevance_filter
|
||||
else LLMEvaluationType.SKIP
|
||||
)
|
||||
),
|
||||
human_selected_filters=(
|
||||
retrieval_options.filters if retrieval_options else None
|
||||
@ -737,7 +989,6 @@ def stream_chat_message_objects(
|
||||
),
|
||||
)
|
||||
|
||||
force_use_tool = _get_force_search_settings(new_msg_req, tools)
|
||||
prompt_builder = AnswerPromptBuilder(
|
||||
user_message=default_build_user_message(
|
||||
user_query=final_msg.message,
|
||||
@ -806,8 +1057,22 @@ def stream_chat_message_objects(
|
||||
info = info_by_subq[
|
||||
SubQuestionKey(level=level, question_num=level_question_num)
|
||||
]
|
||||
|
||||
# Skip LLM relevance processing entirely for ordering-only mode
|
||||
if search_for_ordering_only and packet.id == SECTION_RELEVANCE_LIST_ID:
|
||||
logger.info(
|
||||
"Fast path: Completely bypassing section relevance processing for ordering-only mode"
|
||||
)
|
||||
# Skip this packet entirely since it would trigger LLM processing
|
||||
continue
|
||||
|
||||
# TODO: don't need to dedupe here when we do it in agent flow
|
||||
if packet.id == SEARCH_RESPONSE_SUMMARY_ID:
|
||||
if search_for_ordering_only:
|
||||
logger.info(
|
||||
"Fast path: Skipping document deduplication for ordering-only mode"
|
||||
)
|
||||
|
||||
(
|
||||
info.qa_docs_response,
|
||||
info.reference_db_search_docs,
|
||||
@ -817,16 +1082,91 @@ def stream_chat_message_objects(
|
||||
db_session=db_session,
|
||||
selected_search_docs=selected_db_search_docs,
|
||||
# Deduping happens at the last step to avoid harming quality by dropping content early on
|
||||
# Skip deduping completely for ordering-only mode to save time
|
||||
dedupe_docs=(
|
||||
retrieval_options.dedupe_docs
|
||||
if retrieval_options
|
||||
else False
|
||||
False
|
||||
if search_for_ordering_only
|
||||
else (
|
||||
retrieval_options.dedupe_docs
|
||||
if retrieval_options
|
||||
else False
|
||||
)
|
||||
),
|
||||
user_files=user_file_files if search_for_ordering_only else [],
|
||||
loaded_user_files=user_files
|
||||
if search_for_ordering_only
|
||||
else [],
|
||||
)
|
||||
|
||||
# If we're using search just for ordering user files
|
||||
if (
|
||||
search_for_ordering_only
|
||||
and user_files
|
||||
and info.qa_docs_response
|
||||
):
|
||||
logger.info(
|
||||
f"ORDERING: Processing search results for ordering {len(user_files)} user files"
|
||||
)
|
||||
import time
|
||||
|
||||
ordering_start = time.time()
|
||||
|
||||
# Extract document order from search results
|
||||
doc_order = []
|
||||
for doc in info.qa_docs_response.top_documents:
|
||||
doc_id = doc.document_id
|
||||
if str(doc_id).startswith("USER_FILE_CONNECTOR__"):
|
||||
file_id = doc_id.replace("USER_FILE_CONNECTOR__", "")
|
||||
if file_id in file_id_to_user_file:
|
||||
doc_order.append(file_id)
|
||||
|
||||
logger.info(
|
||||
f"ORDERING: Found {len(doc_order)} files from search results"
|
||||
)
|
||||
|
||||
# Add any files that weren't in search results at the end
|
||||
missing_files = [
|
||||
f_id
|
||||
for f_id in file_id_to_user_file.keys()
|
||||
if f_id not in doc_order
|
||||
]
|
||||
|
||||
missing_files.extend(doc_order)
|
||||
doc_order = missing_files
|
||||
|
||||
logger.info(
|
||||
f"ORDERING: Added {len(missing_files)} missing files to the end"
|
||||
)
|
||||
|
||||
# Reorder user files based on search results
|
||||
ordered_user_files = [
|
||||
file_id_to_user_file[f_id]
|
||||
for f_id in doc_order
|
||||
if f_id in file_id_to_user_file
|
||||
]
|
||||
|
||||
ordering_elapsed = time.time() - ordering_start
logger.info(f"ORDERING: Reordered user files in {ordering_elapsed:.2f}s")
|
||||
|
||||
yield UserKnowledgeFilePacket(
|
||||
user_files=[
|
||||
FileDescriptor(
|
||||
id=str(file.file_id),
|
||||
type=ChatFileType.USER_KNOWLEDGE,
|
||||
)
|
||||
for file in ordered_user_files
|
||||
]
|
||||
)
|
||||
|
||||
yield info.qa_docs_response
|
||||
elif packet.id == SECTION_RELEVANCE_LIST_ID:
|
||||
relevance_sections = packet.response
|
||||
|
||||
if search_for_ordering_only:
|
||||
logger.info(
|
||||
"Performance: Skipping relevance filtering for ordering-only mode"
|
||||
)
|
||||
continue
|
||||
|
||||
if info.reference_db_search_docs is None:
|
||||
logger.warning(
|
||||
"No reference docs found for relevance filtering"
|
||||
@ -936,7 +1276,7 @@ def stream_chat_message_objects(
|
||||
]
|
||||
info.tool_result = packet
|
||||
yield cast(ChatPacket, packet)
|
||||
logger.debug("Reached end of stream")
|
||||
|
||||
except ValueError as e:
|
||||
logger.exception("Failed to process chat message.")
|
||||
|
||||
@ -1018,10 +1358,16 @@ def stream_chat_message_objects(
|
||||
error=ERROR_TYPE_CANCELLED if answer.is_cancelled() else None,
|
||||
tool_call=(
|
||||
ToolCall(
|
||||
tool_id=tool_name_to_tool_id[info.tool_result.tool_name],
|
||||
tool_name=info.tool_result.tool_name,
|
||||
tool_arguments=info.tool_result.tool_args,
|
||||
tool_result=info.tool_result.tool_result,
|
||||
tool_id=tool_name_to_tool_id.get(info.tool_result.tool_name, 0)
|
||||
if info.tool_result
|
||||
else None,
|
||||
tool_name=info.tool_result.tool_name if info.tool_result else None,
|
||||
tool_arguments=info.tool_result.tool_args
|
||||
if info.tool_result
|
||||
else None,
|
||||
tool_result=info.tool_result.tool_result
|
||||
if info.tool_result
|
||||
else None,
|
||||
)
|
||||
if info.tool_result
|
||||
else None
|
||||
|
@ -19,6 +19,7 @@ def translate_onyx_msg_to_langchain(
|
||||
# attached. Just ignore them for now.
|
||||
if not isinstance(msg, ChatMessage):
|
||||
files = msg.files
|
||||
|
||||
content = build_content_with_imgs(
|
||||
msg.message, files, message_type=msg.message_type, exclude_images=exclude_images
|
||||
)
|
||||
|
@ -180,6 +180,10 @@ def get_tool_call_for_non_tool_calling_llm_impl(
|
||||
if tool_args is None:
|
||||
raise RuntimeError(f"Tool '{tool.name}' did not return args")
|
||||
|
||||
# If we have override_kwargs, add them to the tool_args
|
||||
if force_use_tool.override_kwargs is not None:
|
||||
tool_args["override_kwargs"] = force_use_tool.override_kwargs
|
||||
|
||||
return (tool, tool_args)
|
||||
else:
|
||||
tool_options = check_which_tools_should_run_for_non_tool_calling_llm(
|
||||
|
@ -170,7 +170,7 @@ POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
|
||||
POSTGRES_PASSWORD = urllib.parse.quote_plus(
|
||||
os.environ.get("POSTGRES_PASSWORD") or "password"
|
||||
)
|
||||
POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "localhost"
|
||||
POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "127.0.0.1"
|
||||
POSTGRES_PORT = os.environ.get("POSTGRES_PORT") or "5432"
|
||||
POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
|
||||
AWS_REGION_NAME = os.environ.get("AWS_REGION_NAME") or "us-east-2"
|
||||
|
@ -3,7 +3,7 @@ import os
|
||||
INPUT_PROMPT_YAML = "./onyx/seeding/input_prompts.yaml"
|
||||
PROMPTS_YAML = "./onyx/seeding/prompts.yaml"
|
||||
PERSONAS_YAML = "./onyx/seeding/personas.yaml"
|
||||
|
||||
USER_FOLDERS_YAML = "./onyx/seeding/user_folders.yaml"
|
||||
NUM_RETURNED_HITS = 50
|
||||
# Used for LLM filtering and reranking
|
||||
# We want this to be approximately the number of results we want to show on the first page
|
||||
|
@ -102,6 +102,8 @@ CELERY_GENERIC_BEAT_LOCK_TIMEOUT = 120
|
||||
|
||||
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 120
|
||||
|
||||
CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT = 120
|
||||
|
||||
CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
|
||||
|
||||
|
||||
@ -269,6 +271,7 @@ class FileOrigin(str, Enum):
|
||||
CONNECTOR = "connector"
|
||||
GENERATED_REPORT = "generated_report"
|
||||
INDEXING_CHECKPOINT = "indexing_checkpoint"
|
||||
PLAINTEXT_CACHE = "plaintext_cache"
|
||||
OTHER = "other"
|
||||
|
||||
|
||||
@ -309,6 +312,7 @@ class OnyxCeleryQueues:
|
||||
|
||||
# Indexing queue
|
||||
CONNECTOR_INDEXING = "connector_indexing"
|
||||
USER_FILES_INDEXING = "user_files_indexing"
|
||||
|
||||
# Monitoring queue
|
||||
MONITORING = "monitoring"
|
||||
@ -327,6 +331,7 @@ class OnyxRedisLocks:
|
||||
CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK = (
|
||||
"da_lock:check_connector_external_group_sync_beat"
|
||||
)
|
||||
CHECK_USER_FILE_FOLDER_SYNC_BEAT_LOCK = "da_lock:check_user_file_folder_sync_beat"
|
||||
MONITOR_BACKGROUND_PROCESSES_LOCK = "da_lock:monitor_background_processes"
|
||||
CHECK_AVAILABLE_TENANTS_LOCK = "da_lock:check_available_tenants"
|
||||
PRE_PROVISION_TENANT_LOCK = "da_lock:pre_provision_tenant"
|
||||
@ -397,6 +402,7 @@ class OnyxCeleryTask:
|
||||
|
||||
# Tenant pre-provisioning
|
||||
PRE_PROVISION_TENANT = f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_pre_provision_tenant"
|
||||
UPDATE_USER_FILE_FOLDER_METADATA = "update_user_file_folder_metadata"
|
||||
|
||||
CHECK_FOR_CONNECTOR_DELETION = "check_for_connector_deletion_task"
|
||||
CHECK_FOR_VESPA_SYNC_TASK = "check_for_vespa_sync_task"
|
||||
@ -405,6 +411,7 @@ class OnyxCeleryTask:
|
||||
CHECK_FOR_DOC_PERMISSIONS_SYNC = "check_for_doc_permissions_sync"
|
||||
CHECK_FOR_EXTERNAL_GROUP_SYNC = "check_for_external_group_sync"
|
||||
CHECK_FOR_LLM_MODEL_UPDATE = "check_for_llm_model_update"
|
||||
CHECK_FOR_USER_FILE_FOLDER_SYNC = "check_for_user_file_folder_sync"
|
||||
|
||||
# Connector checkpoint cleanup
|
||||
CHECK_FOR_CHECKPOINT_CLEANUP = "check_for_checkpoint_cleanup"
|
||||
|
@ -276,7 +276,26 @@ class GithubConnector(CheckpointConnector[GithubConnectorCheckpoint]):
|
||||
return checkpoint
|
||||
|
||||
assert checkpoint.cached_repo is not None, "No repo saved in checkpoint"
|
||||
repo = checkpoint.cached_repo.to_Repository(self.github_client.requester)
|
||||
|
||||
# Try to access the requester - different PyGithub versions may use different attribute names
|
||||
try:
|
||||
# Try direct access to a known attribute name first
|
||||
if hasattr(self.github_client, "_requester"):
|
||||
requester = self.github_client._requester
|
||||
elif hasattr(self.github_client, "_Github__requester"):
|
||||
requester = self.github_client._Github__requester
|
||||
else:
|
||||
# If we can't find the requester attribute, we need to fall back to recreating the repo
|
||||
raise AttributeError("Could not find requester attribute")
|
||||
|
||||
repo = checkpoint.cached_repo.to_Repository(requester)
|
||||
except Exception as e:
|
||||
# If all else fails, re-fetch the repo directly
|
||||
logger.warning(
|
||||
f"Failed to deserialize repository: {e}. Attempting to re-fetch."
|
||||
)
|
||||
repo_id = checkpoint.cached_repo.id
|
||||
repo = self.github_client.get_repo(repo_id)
|
||||
|
||||
if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS:
|
||||
logger.info(f"Fetching PRs for repo: {repo.name}")
|
||||
|
@ -105,6 +105,8 @@ class BaseFilters(BaseModel):
|
||||
document_set: list[str] | None = None
|
||||
time_cutoff: datetime | None = None
|
||||
tags: list[Tag] | None = None
|
||||
user_file_ids: list[int] | None = None
|
||||
user_folder_ids: list[int] | None = None
|
||||
|
||||
|
||||
class IndexFilters(BaseFilters):
|
||||
|
@ -158,6 +158,47 @@ class SearchPipeline:
|
||||
|
||||
return cast(list[InferenceChunk], self._retrieved_chunks)
|
||||
|
||||
def get_ordering_only_chunks(
|
||||
self,
|
||||
query: str,
|
||||
user_file_ids: list[int] | None = None,
|
||||
user_folder_ids: list[int] | None = None,
|
||||
) -> list[InferenceChunk]:
|
||||
"""Optimized method that only retrieves chunks for ordering purposes.
|
||||
Skips all extra processing and uses minimal configuration to speed up retrieval.
|
||||
"""
|
||||
logger.info("Fast path: Using optimized chunk retrieval for ordering-only mode")
|
||||
|
||||
# Create minimal filters with just user file/folder IDs
|
||||
filters = IndexFilters(
|
||||
user_file_ids=user_file_ids or [],
|
||||
user_folder_ids=user_folder_ids or [],
|
||||
access_control_list=None,
|
||||
)
|
||||
|
||||
# Use a simplified query that skips all unnecessary processing
|
||||
minimal_query = SearchQuery(
|
||||
query=query,
|
||||
search_type=SearchType.SEMANTIC,
|
||||
filters=filters,
|
||||
# Set minimal options needed for retrieval
|
||||
evaluation_type=LLMEvaluationType.SKIP,
|
||||
recency_bias_multiplier=1.0,
|
||||
chunks_above=0, # No need for surrounding context
|
||||
chunks_below=0, # No need for surrounding context
|
||||
processed_keywords=[], # Empty list instead of None
|
||||
rerank_settings=None,
|
||||
hybrid_alpha=0.0,
|
||||
max_llm_filter_sections=0,
|
||||
)
|
||||
|
||||
# Retrieve chunks using the minimal configuration
|
||||
return retrieve_chunks(
|
||||
query=minimal_query,
|
||||
document_index=self.document_index,
|
||||
db_session=self.db_session,
|
||||
)
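A rough usage sketch for the helper above; the query string and IDs are invented, and `pipeline` is assumed to be an already configured SearchPipeline.

# Illustrative only: derive a file ordering from the retrieved chunks.
chunks = pipeline.get_ordering_only_chunks(
    query="summarize the onboarding docs",
    user_file_ids=[12, 34],
    user_folder_ids=None,
)
seen: set[str] = set()
ordered_doc_ids: list[str] = []
for chunk in chunks:
    # Keep only the first occurrence of each backing document, preserving rank order.
    if chunk.document_id not in seen:
        seen.add(chunk.document_id)
        ordered_doc_ids.append(chunk.document_id)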
|
||||
|
||||
@log_function_time(print_only=True)
|
||||
def _get_sections(self) -> list[InferenceSection]:
|
||||
"""Returns an expanded section from each of the chunks.
|
||||
@ -391,6 +432,10 @@ class SearchPipeline:
|
||||
self.search_query.evaluation_type == LLMEvaluationType.SKIP
|
||||
or DISABLE_LLM_DOC_RELEVANCE
|
||||
):
|
||||
if self.search_query.evaluation_type == LLMEvaluationType.SKIP:
|
||||
logger.info(
|
||||
"Fast path: Skipping section relevance evaluation for ordering-only mode"
|
||||
)
|
||||
return None
|
||||
|
||||
if self.search_query.evaluation_type == LLMEvaluationType.UNSPECIFIED:
|
||||
|
@ -11,6 +11,7 @@ from langchain_core.messages import SystemMessage
|
||||
from onyx.chat.models import SectionRelevancePiece
|
||||
from onyx.configs.app_configs import BLURB_SIZE
|
||||
from onyx.configs.app_configs import IMAGE_ANALYSIS_SYSTEM_PROMPT
|
||||
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
|
||||
from onyx.configs.constants import RETURN_SEPARATOR
|
||||
from onyx.configs.llm_configs import get_search_time_image_analysis_enabled
|
||||
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MAX
|
||||
@ -366,6 +367,21 @@ def filter_sections(
|
||||
|
||||
Returns a list of the unique chunk IDs that were marked as relevant
|
||||
"""
|
||||
# Log evaluation type to help with debugging
logger.info(f"filter_sections called with evaluation_type={query.evaluation_type}")

# Fast path for SKIP evaluation type (ordering-only mode): callers are expected to
# bypass LLM filtering entirely, so warn if this is ever reached and return no sections.
if query.evaluation_type == LLMEvaluationType.SKIP:
logger.warning(
"filter_sections called with SKIP evaluation_type; skipping LLM relevance filtering"
)
return []
|
||||
|
||||
sections_to_filter = sections_to_filter[: query.max_llm_filter_sections]
|
||||
|
||||
contents = [
|
||||
@ -398,6 +414,16 @@ def search_postprocessing(
|
||||
llm: LLM,
|
||||
rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
|
||||
) -> Iterator[list[InferenceSection] | list[SectionRelevancePiece]]:
|
||||
# Fast path for ordering-only: detect it by checking if evaluation_type is SKIP
if search_query.evaluation_type == LLMEvaluationType.SKIP:
logger.info(
"Fast path: Detected ordering-only mode, bypassing all post-processing"
)
# Immediately yield the sections without any processing and an empty relevance list
yield retrieved_sections
yield cast(list[SectionRelevancePiece], [])
return
|
||||
|
||||
post_processing_tasks: list[FunctionCall] = []
|
||||
|
||||
if not retrieved_sections:
|
||||
@ -434,10 +460,14 @@ def search_postprocessing(
|
||||
sections_yielded = True
|
||||
|
||||
llm_filter_task_id = None
|
||||
if search_query.evaluation_type in [
|
||||
LLMEvaluationType.BASIC,
|
||||
LLMEvaluationType.UNSPECIFIED,
|
||||
]:
|
||||
# Only add LLM filtering if not in SKIP mode and if LLM doc relevance is not disabled
|
||||
if (
|
||||
search_query.evaluation_type not in [LLMEvaluationType.SKIP]
|
||||
and not DISABLE_LLM_DOC_RELEVANCE
|
||||
and search_query.evaluation_type
|
||||
in [LLMEvaluationType.BASIC, LLMEvaluationType.UNSPECIFIED]
|
||||
):
|
||||
logger.info("Adding LLM filtering task for document relevance evaluation")
|
||||
post_processing_tasks.append(
|
||||
FunctionCall(
|
||||
filter_sections,
|
||||
@ -449,6 +479,10 @@ def search_postprocessing(
|
||||
)
|
||||
)
|
||||
llm_filter_task_id = post_processing_tasks[-1].result_id
|
||||
elif search_query.evaluation_type == LLMEvaluationType.SKIP:
|
||||
logger.info("Fast path: Skipping LLM filtering task for ordering-only mode")
|
||||
elif DISABLE_LLM_DOC_RELEVANCE:
|
||||
logger.info("Skipping LLM filtering task because LLM doc relevance is disabled")
|
||||
|
||||
post_processing_results = (
|
||||
run_functions_in_parallel(post_processing_tasks)
|
||||
|
@ -165,7 +165,18 @@ def retrieval_preprocessing(
|
||||
user_acl_filters = (
|
||||
None if bypass_acl else build_access_filters_for_user(user, db_session)
|
||||
)
|
||||
user_file_ids = preset_filters.user_file_ids or []
|
||||
user_folder_ids = preset_filters.user_folder_ids or []
|
||||
if persona and persona.user_files:
|
||||
user_file_ids = user_file_ids + [
|
||||
file.id
|
||||
for file in persona.user_files
|
||||
if file.id not in (preset_filters.user_file_ids or [])
|
||||
]
|
||||
|
||||
final_filters = IndexFilters(
|
||||
user_file_ids=user_file_ids,
|
||||
user_folder_ids=user_folder_ids,
|
||||
source_type=preset_filters.source_type or predicted_source_filters,
|
||||
document_set=preset_filters.document_set,
|
||||
time_cutoff=time_filter or predicted_time_cutoff,
|
||||
|
@ -26,6 +26,7 @@ from onyx.agents.agent_search.shared_graph_utils.models import (
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.chat.models import DocumentRelevance
|
||||
from onyx.configs.chat_configs import HARD_DELETE_CHATS
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import RetrievalDocs
|
||||
@ -44,9 +45,11 @@ from onyx.db.models import SearchDoc
|
||||
from onyx.db.models import SearchDoc as DBSearchDoc
|
||||
from onyx.db.models import ToolCall
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.persona import get_best_persona_id_for_user
|
||||
from onyx.db.pg_file_store import delete_lobj_by_name
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.file_store.models import InMemoryChatFile
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.llm.override_models import PromptOverride
|
||||
from onyx.server.query_and_chat.models import ChatMessageDetail
|
||||
@ -854,6 +857,87 @@ def get_db_search_doc_by_id(doc_id: int, db_session: Session) -> DBSearchDoc | N
|
||||
return search_doc
|
||||
|
||||
|
||||
def create_search_doc_from_user_file(
|
||||
db_user_file: UserFile, associated_chat_file: InMemoryChatFile, db_session: Session
|
||||
) -> SearchDoc:
|
||||
"""Create a SearchDoc in the database from a UserFile and return it.
|
||||
This ensures proper ID generation by SQLAlchemy and prevents duplicate key errors.
|
||||
"""
|
||||
blurb = ""
|
||||
if associated_chat_file and associated_chat_file.content:
|
||||
try:
|
||||
# Try to decode as UTF-8, but handle errors gracefully
|
||||
content_sample = associated_chat_file.content[:100]
|
||||
# Remove null bytes which can cause SQL errors
|
||||
content_sample = content_sample.replace(b"\x00", b"")
|
||||
blurb = content_sample.decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
# If decoding fails completely, provide a generic description
|
||||
blurb = f"[Binary file: {db_user_file.name}]"
|
||||
|
||||
db_search_doc = SearchDoc(
|
||||
document_id=db_user_file.document_id,
|
||||
chunk_ind=0, # Default to 0 for user files
|
||||
semantic_id=db_user_file.name,
|
||||
link=db_user_file.link_url,
|
||||
blurb=blurb,
|
||||
source_type=DocumentSource.FILE, # Assuming internal source for user files
|
||||
boost=0, # Default boost
|
||||
hidden=False, # Default visibility
|
||||
doc_metadata={}, # Empty metadata
|
||||
score=0.0, # Default score of 0.0 instead of None
|
||||
is_relevant=None, # No relevance initially
|
||||
relevance_explanation=None, # No explanation initially
|
||||
match_highlights=[], # No highlights initially
|
||||
updated_at=db_user_file.created_at, # Use created_at as updated_at
|
||||
primary_owners=[], # Empty list instead of None
|
||||
secondary_owners=[], # Empty list instead of None
|
||||
is_internet=False, # Not from internet
|
||||
)
|
||||
|
||||
db_session.add(db_search_doc)
|
||||
db_session.flush() # Get the ID but don't commit yet
|
||||
|
||||
return db_search_doc
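Sketch of the intended call site, mirroring the chat-flow change earlier in this diff; committing is left to the caller since the helper only flushes.

# Illustrative only: attach a user file's SearchDoc to the current chat turn.
db_doc = create_search_doc_from_user_file(user_file, associated_chat_file, db_session)
reference_db_search_docs.append(db_doc)
# ... later, once the whole message is assembled:
db_session.commit()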
|
||||
|
||||
|
||||
def translate_db_user_file_to_search_doc(
|
||||
db_user_file: UserFile, associated_chat_file: InMemoryChatFile
|
||||
) -> SearchDoc:
|
||||
blurb = ""
|
||||
if associated_chat_file and associated_chat_file.content:
|
||||
try:
|
||||
# Try to decode as UTF-8, but handle errors gracefully
|
||||
content_sample = associated_chat_file.content[:100]
|
||||
# Remove null bytes which can cause SQL errors
|
||||
content_sample = content_sample.replace(b"\x00", b"")
|
||||
blurb = content_sample.decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
# If decoding fails completely, provide a generic description
|
||||
blurb = f"[Binary file: {db_user_file.name}]"
|
||||
|
||||
return SearchDoc(
|
||||
# Don't set ID - let SQLAlchemy auto-generate it
|
||||
document_id=db_user_file.document_id,
|
||||
chunk_ind=0, # Default to 0 for user files
|
||||
semantic_id=db_user_file.name,
|
||||
link=db_user_file.link_url,
|
||||
blurb=blurb,
|
||||
source_type=DocumentSource.FILE, # Assuming internal source for user files
|
||||
boost=0, # Default boost
|
||||
hidden=False, # Default visibility
|
||||
doc_metadata={}, # Empty metadata
|
||||
score=0.0, # Default score of 0.0 instead of None
|
||||
is_relevant=None, # No relevance initially
|
||||
relevance_explanation=None, # No explanation initially
|
||||
match_highlights=[], # No highlights initially
|
||||
updated_at=db_user_file.created_at, # Use created_at as updated_at
|
||||
primary_owners=[], # Empty list instead of None
|
||||
secondary_owners=[], # Empty list instead of None
|
||||
is_internet=False, # Not from internet
|
||||
)
|
||||
|
||||
|
||||
def translate_db_search_doc_to_server_search_doc(
|
||||
db_search_doc: SearchDoc,
|
||||
remove_doc_content: bool = False,
|
||||
|
@ -27,6 +27,7 @@ from onyx.db.models import IndexModelStatus
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import User__UserGroup
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserGroup__ConnectorCredentialPair
|
||||
from onyx.db.models import UserRole
|
||||
from onyx.server.models import StatusResponse
|
||||
@ -106,11 +107,13 @@ def get_connector_credential_pairs_for_user(
|
||||
eager_load_connector: bool = False,
|
||||
eager_load_credential: bool = False,
|
||||
eager_load_user: bool = False,
|
||||
include_user_files: bool = False,
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
if eager_load_user:
|
||||
assert (
|
||||
eager_load_credential
|
||||
), "eager_load_credential must be True if eager_load_user is True"
|
||||
|
||||
stmt = select(ConnectorCredentialPair).distinct()
|
||||
|
||||
if eager_load_connector:
|
||||
@ -126,6 +129,9 @@ def get_connector_credential_pairs_for_user(
|
||||
if ids:
|
||||
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
|
||||
|
||||
if not include_user_files:
|
||||
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
|
||||
|
||||
return list(db_session.scalars(stmt).unique().all())
|
||||
|
||||
|
||||
@ -153,14 +159,16 @@ def get_connector_credential_pairs_for_user_parallel(
|
||||
|
||||
|
||||
def get_connector_credential_pairs(
|
||||
db_session: Session,
|
||||
ids: list[int] | None = None,
|
||||
db_session: Session, ids: list[int] | None = None, include_user_files: bool = False
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
stmt = select(ConnectorCredentialPair).distinct()
|
||||
|
||||
if ids:
|
||||
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
|
||||
|
||||
if not include_user_files:
|
||||
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
|
||||
|
||||
return list(db_session.scalars(stmt).all())
|
||||
|
||||
|
||||
@ -207,12 +215,15 @@ def get_connector_credential_pair_for_user(
|
||||
connector_id: int,
|
||||
credential_id: int,
|
||||
user: User | None,
|
||||
include_user_files: bool = False,
|
||||
get_editable: bool = True,
|
||||
) -> ConnectorCredentialPair | None:
|
||||
stmt = select(ConnectorCredentialPair)
|
||||
stmt = _add_user_filters(stmt, user, get_editable)
|
||||
stmt = stmt.where(ConnectorCredentialPair.connector_id == connector_id)
|
||||
stmt = stmt.where(ConnectorCredentialPair.credential_id == credential_id)
|
||||
if not include_user_files:
|
||||
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
|
||||
result = db_session.execute(stmt)
|
||||
return result.scalar_one_or_none()
|
||||
|
||||
@ -321,6 +332,9 @@ def _update_connector_credential_pair(
|
||||
cc_pair.total_docs_indexed += net_docs
|
||||
if status is not None:
|
||||
cc_pair.status = status
|
||||
if cc_pair.is_user_file:
|
||||
cc_pair.status = ConnectorCredentialPairStatus.PAUSED
|
||||
|
||||
db_session.commit()
|
||||
|
||||
|
||||
@ -446,6 +460,7 @@ def add_credential_to_connector(
|
||||
initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
|
||||
last_successful_index_time: datetime | None = None,
|
||||
seeding_flow: bool = False,
|
||||
is_user_file: bool = False,
|
||||
) -> StatusResponse:
|
||||
connector = fetch_connector_by_id(connector_id, db_session)
|
||||
|
||||
@ -511,6 +526,7 @@ def add_credential_to_connector(
|
||||
access_type=access_type,
|
||||
auto_sync_options=auto_sync_options,
|
||||
last_successful_index_time=last_successful_index_time,
|
||||
is_user_file=is_user_file,
|
||||
)
|
||||
db_session.add(association)
|
||||
db_session.flush() # make sure the association has an id
|
||||
@ -587,8 +603,12 @@ def remove_credential_from_connector(
|
||||
|
||||
def fetch_connector_credential_pairs(
|
||||
db_session: Session,
|
||||
include_user_files: bool = False,
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
return db_session.query(ConnectorCredentialPair).all()
|
||||
stmt = select(ConnectorCredentialPair)
|
||||
if not include_user_files:
|
||||
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
|
||||
return list(db_session.scalars(stmt).unique().all())
|
||||
|
||||
|
||||
def resync_cc_pair(
|
||||
@ -634,3 +654,23 @@ def resync_cc_pair(
|
||||
)
|
||||
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def get_connector_credential_pairs_with_user_files(
|
||||
db_session: Session,
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
"""
|
||||
Get all connector credential pairs that have associated user files.
|
||||
|
||||
Args:
|
||||
db_session: Database session
|
||||
|
||||
Returns:
|
||||
List of ConnectorCredentialPair objects that have user files
|
||||
"""
|
||||
return (
|
||||
db_session.query(ConnectorCredentialPair)
|
||||
.join(UserFile, UserFile.cc_pair_id == ConnectorCredentialPair.id)
|
||||
.distinct()
|
||||
.all()
|
||||
)
|
||||
|
@ -605,7 +605,6 @@ def fetch_document_sets_for_document(
|
||||
result = fetch_document_sets_for_documents([document_id], db_session)
|
||||
if not result:
|
||||
return []
|
||||
|
||||
return result[0][1]
|
||||
|
||||
|
||||
|
@ -212,6 +212,10 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
|
||||
back_populates="creator",
|
||||
primaryjoin="User.id == foreign(ConnectorCredentialPair.creator_id)",
|
||||
)
|
||||
folders: Mapped[list["UserFolder"]] = relationship(
|
||||
"UserFolder", back_populates="user"
|
||||
)
|
||||
files: Mapped[list["UserFile"]] = relationship("UserFile", back_populates="user")
|
||||
|
||||
@validates("email")
|
||||
def validate_email(self, key: str, value: str) -> str:
|
||||
@ -419,6 +423,7 @@ class ConnectorCredentialPair(Base):
|
||||
"""
|
||||
|
||||
__tablename__ = "connector_credential_pair"
|
||||
is_user_file: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||
# NOTE: this `id` column has to use `Sequence` instead of `autoincrement=True`
|
||||
# due to some SQLAlchemy quirks + this not being a primary key column
|
||||
id: Mapped[int] = mapped_column(
|
||||
@ -505,6 +510,10 @@ class ConnectorCredentialPair(Base):
|
||||
primaryjoin="foreign(ConnectorCredentialPair.creator_id) == remote(User.id)",
|
||||
)
|
||||
|
||||
user_file: Mapped["UserFile"] = relationship(
|
||||
"UserFile", back_populates="cc_pair", uselist=False
|
||||
)
|
||||
|
||||
background_errors: Mapped[list["BackgroundError"]] = relationship(
|
||||
"BackgroundError", back_populates="cc_pair", cascade="all, delete-orphan"
|
||||
)
|
||||
@ -1808,6 +1817,17 @@ class Persona(Base):
|
||||
secondary="persona__user_group",
|
||||
viewonly=True,
|
||||
)
|
||||
# Relationship to UserFile
|
||||
user_files: Mapped[list["UserFile"]] = relationship(
|
||||
"UserFile",
|
||||
secondary="persona__user_file",
|
||||
back_populates="assistants",
|
||||
)
|
||||
user_folders: Mapped[list["UserFolder"]] = relationship(
|
||||
"UserFolder",
|
||||
secondary="persona__user_folder",
|
||||
back_populates="assistants",
|
||||
)
|
||||
labels: Mapped[list["PersonaLabel"]] = relationship(
|
||||
"PersonaLabel",
|
||||
secondary=Persona__PersonaLabel.__table__,
|
||||
@ -1824,6 +1844,24 @@ class Persona(Base):
|
||||
)
|
||||
|
||||
|
||||
class Persona__UserFolder(Base):
|
||||
__tablename__ = "persona__user_folder"
|
||||
|
||||
persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"), primary_key=True)
|
||||
user_folder_id: Mapped[int] = mapped_column(
|
||||
ForeignKey("user_folder.id"), primary_key=True
|
||||
)
|
||||
|
||||
|
||||
class Persona__UserFile(Base):
|
||||
__tablename__ = "persona__user_file"
|
||||
|
||||
persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"), primary_key=True)
|
||||
user_file_id: Mapped[int] = mapped_column(
|
||||
ForeignKey("user_file.id"), primary_key=True
|
||||
)
|
||||
|
||||
|
||||
class PersonaLabel(Base):
|
||||
__tablename__ = "persona_label"
|
||||
|
||||
@ -2346,6 +2384,64 @@ class InputPrompt__User(Base):
|
||||
disabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
|
||||
|
||||
|
||||
class UserFolder(Base):
|
||||
__tablename__ = "user_folder"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=False)
|
||||
name: Mapped[str] = mapped_column(nullable=False)
|
||||
description: Mapped[str] = mapped_column(nullable=False)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now()
|
||||
)
|
||||
user: Mapped["User"] = relationship(back_populates="folders")
|
||||
files: Mapped[list["UserFile"]] = relationship(back_populates="folder")
|
||||
assistants: Mapped[list["Persona"]] = relationship(
|
||||
"Persona",
|
||||
secondary=Persona__UserFolder.__table__,
|
||||
back_populates="user_folders",
|
||||
)
|
||||
|
||||
|
||||
class UserDocument(str, Enum):
|
||||
CHAT = "chat"
|
||||
RECENT = "recent"
|
||||
FILE = "file"
|
||||
|
||||
|
||||
class UserFile(Base):
|
||||
__tablename__ = "user_file"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=False)
|
||||
assistants: Mapped[list["Persona"]] = relationship(
|
||||
"Persona",
|
||||
secondary=Persona__UserFile.__table__,
|
||||
back_populates="user_files",
|
||||
)
|
||||
folder_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("user_folder.id"), nullable=True
|
||||
)
|
||||
|
||||
file_id: Mapped[str] = mapped_column(nullable=False)
|
||||
document_id: Mapped[str] = mapped_column(nullable=False)
|
||||
name: Mapped[str] = mapped_column(nullable=False)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
default=datetime.datetime.utcnow
|
||||
)
|
||||
user: Mapped["User"] = relationship(back_populates="files")
|
||||
folder: Mapped["UserFolder"] = relationship(back_populates="files")
|
||||
token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
|
||||
|
||||
cc_pair_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("connector_credential_pair.id"), nullable=True, unique=True
|
||||
)
|
||||
cc_pair: Mapped["ConnectorCredentialPair"] = relationship(
|
||||
"ConnectorCredentialPair", back_populates="user_file"
|
||||
)
|
||||
link_url: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
|
||||
|
||||
"""
|
||||
Multi-tenancy related tables
|
||||
"""
|
||||
|
@ -33,6 +33,8 @@ from onyx.db.models import StarterMessage
|
||||
from onyx.db.models import Tool
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import User__UserGroup
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserFolder
|
||||
from onyx.db.models import UserGroup
|
||||
from onyx.db.notification import create_notification
|
||||
from onyx.server.features.persona.models import PersonaSharedNotificationData
|
||||
@ -209,7 +211,6 @@ def create_update_persona(
|
||||
if not all_prompt_ids:
|
||||
raise ValueError("No prompt IDs provided")
|
||||
|
||||
is_default_persona: bool | None = create_persona_request.is_default_persona
|
||||
# Default persona validation
|
||||
if create_persona_request.is_default_persona:
|
||||
if not create_persona_request.is_public:
|
||||
@ -221,7 +222,7 @@ def create_update_persona(
|
||||
user.role == UserRole.CURATOR
|
||||
or user.role == UserRole.GLOBAL_CURATOR
|
||||
):
|
||||
is_default_persona = None
|
||||
pass
|
||||
elif user.role != UserRole.ADMIN:
|
||||
raise ValueError("Only admins can make a default persona")
|
||||
|
||||
@ -249,7 +250,9 @@ def create_update_persona(
|
||||
num_chunks=create_persona_request.num_chunks,
|
||||
llm_relevance_filter=create_persona_request.llm_relevance_filter,
|
||||
llm_filter_extraction=create_persona_request.llm_filter_extraction,
|
||||
is_default_persona=is_default_persona,
|
||||
is_default_persona=create_persona_request.is_default_persona,
|
||||
user_file_ids=create_persona_request.user_file_ids,
|
||||
user_folder_ids=create_persona_request.user_folder_ids,
|
||||
)
|
||||
|
||||
versioned_make_persona_private = fetch_versioned_implementation(
|
||||
@ -344,6 +347,8 @@ def get_personas_for_user(
|
||||
selectinload(Persona.groups),
|
||||
selectinload(Persona.users),
|
||||
selectinload(Persona.labels),
|
||||
selectinload(Persona.user_files),
|
||||
selectinload(Persona.user_folders),
|
||||
)
|
||||
|
||||
results = db_session.execute(stmt).scalars().all()
|
||||
@ -438,6 +443,8 @@ def upsert_persona(
|
||||
builtin_persona: bool = False,
|
||||
is_default_persona: bool | None = None,
|
||||
label_ids: list[int] | None = None,
|
||||
user_file_ids: list[int] | None = None,
|
||||
user_folder_ids: list[int] | None = None,
|
||||
chunks_above: int = CONTEXT_CHUNKS_ABOVE,
|
||||
chunks_below: int = CONTEXT_CHUNKS_BELOW,
|
||||
) -> Persona:
|
||||
@ -463,6 +470,7 @@ def upsert_persona(
|
||||
user=user,
|
||||
get_editable=True,
|
||||
)
|
||||
|
||||
# Fetch and attach tools by IDs
|
||||
tools = None
|
||||
if tool_ids is not None:
|
||||
@ -481,6 +489,26 @@ def upsert_persona(
|
||||
if not document_sets and document_set_ids:
|
||||
raise ValueError("document_sets not found")
|
||||
|
||||
# Fetch and attach user_files by IDs
|
||||
user_files = None
|
||||
if user_file_ids is not None:
|
||||
user_files = (
|
||||
db_session.query(UserFile).filter(UserFile.id.in_(user_file_ids)).all()
|
||||
)
|
||||
if not user_files and user_file_ids:
|
||||
raise ValueError("user_files not found")
|
||||
|
||||
# Fetch and attach user_folders by IDs
|
||||
user_folders = None
|
||||
if user_folder_ids is not None:
|
||||
user_folders = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id.in_(user_folder_ids))
|
||||
.all()
|
||||
)
|
||||
if not user_folders and user_folder_ids:
|
||||
raise ValueError("user_folders not found")
|
||||
|
||||
# Fetch and attach prompts by IDs
|
||||
prompts = None
|
||||
if prompt_ids is not None:
|
||||
@ -549,6 +577,14 @@ def upsert_persona(
|
||||
if tools is not None:
|
||||
existing_persona.tools = tools or []
|
||||
|
||||
if user_file_ids is not None:
|
||||
existing_persona.user_files.clear()
|
||||
existing_persona.user_files = user_files or []
|
||||
|
||||
if user_folder_ids is not None:
|
||||
existing_persona.user_folders.clear()
|
||||
existing_persona.user_folders = user_folders or []
|
||||
|
||||
# We should only update display priority if it is not already set
|
||||
if existing_persona.display_priority is None:
|
||||
existing_persona.display_priority = display_priority
|
||||
@ -590,6 +626,8 @@ def upsert_persona(
|
||||
is_default_persona=is_default_persona
|
||||
if is_default_persona is not None
|
||||
else False,
|
||||
user_folders=user_folders or [],
|
||||
user_files=user_files or [],
|
||||
labels=labels or [],
|
||||
)
|
||||
db_session.add(new_persona)
|
||||
|
466
backend/onyx/db/user_documents.py
Normal file
@ -0,0 +1,466 @@
|
||||
import datetime
|
||||
import time
|
||||
from typing import List
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import UploadFile
|
||||
from sqlalchemy import and_
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import joinedload
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.auth.users import get_current_tenant_id
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.models import InputType
|
||||
from onyx.db.connector import create_connector
|
||||
from onyx.db.connector_credential_pair import add_credential_to_connector
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.models import Document
|
||||
from onyx.db.models import DocumentByConnectorCredentialPair
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import Persona__UserFile
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserFolder
|
||||
from onyx.server.documents.connector import trigger_indexing_for_cc_pair
|
||||
from onyx.server.documents.connector import upload_files
|
||||
from onyx.server.documents.models import ConnectorBase
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from onyx.server.models import StatusResponse
|
||||
|
||||
USER_FILE_CONSTANT = "USER_FILE_CONNECTOR"
|
||||
|
||||
|
||||
def create_user_files(
|
||||
files: List[UploadFile],
|
||||
folder_id: int | None,
|
||||
user: User | None,
|
||||
db_session: Session,
|
||||
link_url: str | None = None,
|
||||
) -> list[UserFile]:
|
||||
upload_response = upload_files(files, db_session)
|
||||
user_files = []
|
||||
|
||||
for file_path, file in zip(upload_response.file_paths, files):
|
||||
new_file = UserFile(
|
||||
user_id=user.id if user else None,
|
||||
folder_id=folder_id,
|
||||
file_id=file_path,
|
||||
document_id="USER_FILE_CONNECTOR__" + file_path,
|
||||
name=file.filename,
|
||||
token_count=None,
|
||||
link_url=link_url,
|
||||
)
|
||||
db_session.add(new_file)
|
||||
user_files.append(new_file)
|
||||
db_session.commit()
|
||||
return user_files
|
||||
|
||||
|
||||
def create_user_file_with_indexing(
|
||||
files: List[UploadFile],
|
||||
folder_id: int | None,
|
||||
user: User,
|
||||
db_session: Session,
|
||||
trigger_index: bool = True,
|
||||
) -> list[UserFile]:
|
||||
"""Create user files and trigger immediate indexing"""
|
||||
# Create the user files first
|
||||
user_files = create_user_files(files, folder_id, user, db_session)
|
||||
|
||||
# Create connector and credential for each file
|
||||
for user_file in user_files:
|
||||
cc_pair = create_file_connector_credential(user_file, user, db_session)
|
||||
user_file.cc_pair_id = cc_pair.data
|
||||
|
||||
db_session.commit()
|
||||
|
||||
# Trigger immediate high-priority indexing for all created files
|
||||
if trigger_index:
|
||||
tenant_id = get_current_tenant_id()
|
||||
for user_file in user_files:
|
||||
# Use the existing trigger_indexing_for_cc_pair function but with highest priority
|
||||
if user_file.cc_pair_id:
|
||||
trigger_indexing_for_cc_pair(
|
||||
[],
|
||||
user_file.cc_pair.connector_id,
|
||||
False,
|
||||
tenant_id,
|
||||
db_session,
|
||||
is_user_file=True,
|
||||
)
|
||||
|
||||
return user_files
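An illustrative end-to-end call; the UploadFile and user come from the API layer and the values shown are assumptions.

new_files = create_user_file_with_indexing(
    files=[upload],          # fastapi.UploadFile from the request
    folder_id=None,          # or an existing UserFolder id
    user=current_user,
    db_session=db_session,
    trigger_index=True,      # kick off high-priority indexing right away
)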
|
||||
|
||||
|
||||
def create_file_connector_credential(
|
||||
user_file: UserFile, user: User, db_session: Session
|
||||
) -> StatusResponse:
|
||||
"""Create connector and credential for a user file"""
|
||||
connector_base = ConnectorBase(
|
||||
name=f"UserFile-{user_file.file_id}-{int(time.time())}",
|
||||
source=DocumentSource.FILE,
|
||||
input_type=InputType.LOAD_STATE,
|
||||
connector_specific_config={
|
||||
"file_locations": [user_file.file_id],
|
||||
},
|
||||
refresh_freq=None,
|
||||
prune_freq=None,
|
||||
indexing_start=None,
|
||||
)
|
||||
|
||||
connector = create_connector(db_session=db_session, connector_data=connector_base)
|
||||
|
||||
credential_info = CredentialBase(
|
||||
credential_json={},
|
||||
admin_public=True,
|
||||
source=DocumentSource.FILE,
|
||||
curator_public=True,
|
||||
groups=[],
|
||||
name=f"UserFileCredential-{user_file.file_id}-{int(time.time())}",
|
||||
is_user_file=True,
|
||||
)
|
||||
|
||||
credential = create_credential(credential_info, user, db_session)
|
||||
|
||||
return add_credential_to_connector(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
connector_id=connector.id,
|
||||
credential_id=credential.id,
|
||||
cc_pair_name=f"UserFileCCPair-{user_file.file_id}-{int(time.time())}",
|
||||
access_type=AccessType.PRIVATE,
|
||||
auto_sync_options=None,
|
||||
groups=[],
|
||||
is_user_file=True,
|
||||
)
|
||||
|
||||
|
||||
def get_user_file_indexing_status(
|
||||
file_ids: list[int], db_session: Session
|
||||
) -> dict[int, bool]:
|
||||
"""Get indexing status for multiple user files"""
|
||||
status_dict = {}
|
||||
|
||||
# Query UserFile with cc_pair join
|
||||
files_with_pairs = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id.in_(file_ids))
|
||||
.options(joinedload(UserFile.cc_pair))
|
||||
.all()
|
||||
)
|
||||
|
||||
for file in files_with_pairs:
|
||||
if file.cc_pair and file.cc_pair.last_successful_index_time:
|
||||
status_dict[file.id] = True
|
||||
else:
|
||||
status_dict[file.id] = False
|
||||
|
||||
return status_dict
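For example, the API layer could poll readiness like this (illustrative):

status = get_user_file_indexing_status([f.id for f in new_files], db_session)
not_yet_indexed = [file_id for file_id, indexed in status.items() if not indexed]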
|
||||
|
||||
|
||||
def calculate_user_files_token_count(
|
||||
file_ids: list[int], folder_ids: list[int], db_session: Session
|
||||
) -> int:
|
||||
"""Calculate total token count for specified files and folders"""
|
||||
total_tokens = 0
|
||||
|
||||
# Get tokens from individual files
|
||||
if file_ids:
|
||||
file_tokens = (
|
||||
db_session.query(func.sum(UserFile.token_count))
|
||||
.filter(UserFile.id.in_(file_ids))
|
||||
.scalar()
|
||||
or 0
|
||||
)
|
||||
total_tokens += file_tokens
|
||||
|
||||
# Get tokens from folders
|
||||
if folder_ids:
|
||||
folder_files_tokens = (
|
||||
db_session.query(func.sum(UserFile.token_count))
|
||||
.filter(UserFile.folder_id.in_(folder_ids))
|
||||
.scalar()
|
||||
or 0
|
||||
)
|
||||
total_tokens += folder_files_tokens
|
||||
|
||||
return total_tokens
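This mirrors the budget check performed in stream_chat_message_objects; a condensed sketch, assuming the persona, message text, and session are in scope and the IDs are illustrative.

total_tokens = calculate_user_files_token_count([12, 34], [], db_session)
available_tokens = compute_max_document_tokens_for_persona(
    db_session=db_session,
    persona=persona,
    actual_user_input=message_text,
)
# Small enough to inline into the prompt: search is then only used to order the files.
search_for_ordering_only = total_tokens <= available_tokens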
|
||||
|
||||
|
||||
def load_all_user_files(
|
||||
file_ids: list[int], folder_ids: list[int], db_session: Session
|
||||
) -> list[UserFile]:
|
||||
"""Load all user files from specified file IDs and folder IDs"""
|
||||
result = []
|
||||
|
||||
# Get individual files
|
||||
if file_ids:
|
||||
files = db_session.query(UserFile).filter(UserFile.id.in_(file_ids)).all()
|
||||
result.extend(files)
|
||||
|
||||
# Get files from folders
|
||||
if folder_ids:
|
||||
folder_files = (
|
||||
db_session.query(UserFile).filter(UserFile.folder_id.in_(folder_ids)).all()
|
||||
)
|
||||
result.extend(folder_files)
|
||||
|
||||
return result
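A short, illustrative call (IDs invented) showing how the loader feeds the file-ordering map used by the chat flow.

user_file_rows = load_all_user_files(file_ids=[12, 34], folder_ids=[7], db_session=db_session)
file_id_to_user_file = {row.file_id: row for row in user_file_rows}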
|
||||
|
||||
|
||||
def get_user_files_from_folder(folder_id: int, db_session: Session) -> list[UserFile]:
|
||||
return db_session.query(UserFile).filter(UserFile.folder_id == folder_id).all()
|
||||
|
||||
|
||||
def share_file_with_assistant(
|
||||
file_id: int, assistant_id: int, db_session: Session
|
||||
) -> None:
|
||||
file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
|
||||
assistant = db_session.query(Persona).filter(Persona.id == assistant_id).first()
|
||||
|
||||
if file and assistant:
|
||||
file.assistants.append(assistant)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def unshare_file_with_assistant(
|
||||
file_id: int, assistant_id: int, db_session: Session
|
||||
) -> None:
|
||||
db_session.query(Persona__UserFile).filter(
|
||||
and_(
|
||||
Persona__UserFile.user_file_id == file_id,
|
||||
Persona__UserFile.persona_id == assistant_id,
|
||||
)
|
||||
).delete()
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def share_folder_with_assistant(
|
||||
folder_id: int, assistant_id: int, db_session: Session
|
||||
) -> None:
|
||||
folder = db_session.query(UserFolder).filter(UserFolder.id == folder_id).first()
|
||||
assistant = db_session.query(Persona).filter(Persona.id == assistant_id).first()
|
||||
|
||||
if folder and assistant:
|
||||
for file in folder.files:
|
||||
share_file_with_assistant(file.id, assistant_id, db_session)
|
||||
|
||||
|
||||
def unshare_folder_with_assistant(
|
||||
folder_id: int, assistant_id: int, db_session: Session
|
||||
) -> None:
|
||||
folder = db_session.query(UserFolder).filter(UserFolder.id == folder_id).first()
|
||||
|
||||
if folder:
|
||||
for file in folder.files:
|
||||
unshare_file_with_assistant(file.id, assistant_id, db_session)
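Sharing is symmetric at the file and folder level; a minimal sketch with illustrative IDs.

share_folder_with_assistant(folder_id=7, assistant_id=3, db_session=db_session)
# ...and to revoke access for every file in that folder:
unshare_folder_with_assistant(folder_id=7, assistant_id=3, db_session=db_session)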
|
||||
|
||||
|
||||
def fetch_user_files_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, int | None]:
    """
    Fetches user file IDs for the given document IDs.

    Args:
        document_ids: List of document IDs to fetch user files for
        db_session: Database session

    Returns:
        Dictionary mapping document IDs to user file IDs (or None if no user file exists)
    """
    # First, get the document to cc_pair mapping
    doc_cc_pairs = (
        db_session.query(Document.id, ConnectorCredentialPair.id)
        .join(
            DocumentByConnectorCredentialPair,
            Document.id == DocumentByConnectorCredentialPair.id,
        )
        .join(
            ConnectorCredentialPair,
            and_(
                DocumentByConnectorCredentialPair.connector_id
                == ConnectorCredentialPair.connector_id,
                DocumentByConnectorCredentialPair.credential_id
                == ConnectorCredentialPair.credential_id,
            ),
        )
        .filter(Document.id.in_(document_ids))
        .all()
    )

    # Get cc_pair to user_file mapping
    cc_pair_to_user_file = (
        db_session.query(ConnectorCredentialPair.id, UserFile.id)
        .join(UserFile, UserFile.cc_pair_id == ConnectorCredentialPair.id)
        .filter(
            ConnectorCredentialPair.id.in_(
                [cc_pair_id for _, cc_pair_id in doc_cc_pairs]
            )
        )
        .all()
    )

    # Create mapping from cc_pair_id to user_file_id
    cc_pair_to_user_file_dict = {
        cc_pair_id: user_file_id for cc_pair_id, user_file_id in cc_pair_to_user_file
    }

    # Create the final result mapping document_id to user_file_id
    result: dict[str, int | None] = {doc_id: None for doc_id in document_ids}
    for doc_id, cc_pair_id in doc_cc_pairs:
        if cc_pair_id in cc_pair_to_user_file_dict:
            result[doc_id] = cc_pair_to_user_file_dict[cc_pair_id]

    return result


def fetch_user_folders_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, int | None]:
    """
    Fetches user folder IDs for the given document IDs.

    For each document, returns the folder ID that the document's associated user file belongs to.

    Args:
        document_ids: List of document IDs to fetch user folders for
        db_session: Database session

    Returns:
        Dictionary mapping document IDs to user folder IDs (or None if no user folder exists)
    """
    # First, get the document to cc_pair mapping
    doc_cc_pairs = (
        db_session.query(Document.id, ConnectorCredentialPair.id)
        .join(
            DocumentByConnectorCredentialPair,
            Document.id == DocumentByConnectorCredentialPair.id,
        )
        .join(
            ConnectorCredentialPair,
            and_(
                DocumentByConnectorCredentialPair.connector_id
                == ConnectorCredentialPair.connector_id,
                DocumentByConnectorCredentialPair.credential_id
                == ConnectorCredentialPair.credential_id,
            ),
        )
        .filter(Document.id.in_(document_ids))
        .all()
    )

    # Get cc_pair to user_file and folder mapping
    cc_pair_to_folder = (
        db_session.query(ConnectorCredentialPair.id, UserFile.folder_id)
        .join(UserFile, UserFile.cc_pair_id == ConnectorCredentialPair.id)
        .filter(
            ConnectorCredentialPair.id.in_(
                [cc_pair_id for _, cc_pair_id in doc_cc_pairs]
            )
        )
        .all()
    )

    # Create mapping from cc_pair_id to folder_id
    cc_pair_to_folder_dict = {
        cc_pair_id: folder_id for cc_pair_id, folder_id in cc_pair_to_folder
    }

    # Create the final result mapping document_id to folder_id
    result: dict[str, int | None] = {doc_id: None for doc_id in document_ids}
    for doc_id, cc_pair_id in doc_cc_pairs:
        if cc_pair_id in cc_pair_to_folder_dict:
            result[doc_id] = cc_pair_to_folder_dict[cc_pair_id]

    return result

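A hedged usage sketch (not part of the diff; ids and session are assumptions) showing how the two lookups are typically combined so each chunk can carry its user_file / user_folder metadata:

# Hypothetical illustration only.
doc_ids = ["web_doc_1", "user_file_doc_7"]

doc_to_user_file = fetch_user_files_for_documents(
    document_ids=doc_ids, db_session=db_session
)
doc_to_user_folder = fetch_user_folders_for_documents(
    document_ids=doc_ids, db_session=db_session
)

for doc_id in doc_ids:
    # Both maps default to None for documents that did not come from a user file,
    # so regular connector documents simply carry no user_file / user_folder value.
    print(doc_id, doc_to_user_file[doc_id], doc_to_user_folder[doc_id])
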
def get_user_file_from_id(db_session: Session, user_file_id: int) -> UserFile | None:
    return db_session.query(UserFile).filter(UserFile.id == user_file_id).first()


# def fetch_user_files_for_documents(
# #     document_ids: list[str],
# #     db_session: Session,
# # ) -> dict[str, int | None]:
# #     # Query UserFile objects for the given document_ids
# #     user_files = (
# #         db_session.query(UserFile).filter(UserFile.document_id.in_(document_ids)).all()
# #     )

# #     # Create a dictionary mapping document_ids to UserFile objects
# #     result: dict[str, int | None] = {doc_id: None for doc_id in document_ids}
# #     for user_file in user_files:
# #         result[user_file.document_id] = user_file.id

# #     return result


def upsert_user_folder(
    db_session: Session,
    id: int | None = None,
    user_id: UUID | None = None,
    name: str | None = None,
    description: str | None = None,
    created_at: datetime.datetime | None = None,
    user: User | None = None,
    files: list[UserFile] | None = None,
    assistants: list[Persona] | None = None,
) -> UserFolder:
    if id is not None:
        user_folder = db_session.query(UserFolder).filter_by(id=id).first()
    else:
        user_folder = (
            db_session.query(UserFolder).filter_by(name=name, user_id=user_id).first()
        )

    if user_folder:
        if user_id is not None:
            user_folder.user_id = user_id
        if name is not None:
            user_folder.name = name
        if description is not None:
            user_folder.description = description
        if created_at is not None:
            user_folder.created_at = created_at
        if user is not None:
            user_folder.user = user
        if files is not None:
            user_folder.files = files
        if assistants is not None:
            user_folder.assistants = assistants
    else:
        user_folder = UserFolder(
            id=id,
            user_id=user_id,
            name=name,
            description=description,
            created_at=created_at or datetime.datetime.utcnow(),
            user=user,
            files=files or [],
            assistants=assistants or [],
        )
        db_session.add(user_folder)

    db_session.flush()
    return user_folder


def get_user_folder_by_name(db_session: Session, name: str) -> UserFolder | None:
    return db_session.query(UserFolder).filter(UserFolder.name == name).first()

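A minimal usage sketch (illustrative only; the folder name, description, and user id are assumptions) of the upsert-then-lookup flow:

# Hypothetical example -- values are made up.
folder = upsert_user_folder(
    db_session=db_session,
    name="Research Notes",
    description="Ad-hoc uploads for the research assistant",
    user_id=some_user_id,  # placeholder UUID of the owning user
)
# The helper only flushes, so the caller decides when to commit.
db_session.commit()

same_folder = get_user_folder_by_name(db_session, "Research Notes")
assert same_folder is not None and same_folder.id == folder.id
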

def update_user_file_token_count__no_commit(
    user_file_id_to_token_count: dict[int, int | None],
    db_session: Session,
) -> None:
    for user_file_id, token_count in user_file_id_to_token_count.items():
        db_session.query(UserFile).filter(UserFile.id == user_file_id).update(
            {UserFile.token_count: token_count}
        )

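A small sketch (ids and counts are assumptions) of how the __no_commit suffix is meant to be used, batching updates and committing once in the surrounding transaction:

# Hypothetical call site.
update_user_file_token_count__no_commit(
    user_file_id_to_token_count={12: 843, 13: None},  # None = count unknown
    db_session=db_session,
)
# No commit happens inside the helper; the caller commits once at the end.
db_session.commit()
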
@ -104,6 +104,16 @@ class VespaDocumentFields:
    aggregated_chunk_boost_factor: float | None = None


@dataclass
class VespaDocumentUserFields:
    """
    Fields that are specific to the user who is indexing the document.
    """

    user_file_id: str | None = None
    user_folder_id: str | None = None

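A brief illustrative construction (values are assumptions) of the new user-fields container:

# Hypothetical values; the dataclass stores the ids as strings per the definition above.
user_fields = VespaDocumentUserFields(
    user_file_id="42",
    user_folder_id="7",
)
# Passing user_fields=None during a chunk update leaves the user_file / user_folder
# attributes untouched, as the update logic later in this diff only assigns them
# when user_fields is not None.
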

@dataclass
class UpdateRequest:
    """

@ -258,7 +268,8 @@ class Updatable(abc.ABC):
        *,
        tenant_id: str,
        chunk_count: int | None,
        fields: VespaDocumentFields,
        fields: VespaDocumentFields | None,
        user_fields: VespaDocumentUserFields | None,
    ) -> int:
        """
        Updates all chunks for a document with the specified fields.

@ -120,12 +120,22 @@ schema DANSWER_CHUNK_NAME {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
    }
    field document_sets type weightedset<string> {
        indexing: summary | attribute
        rank: filter
        attribute: fast-search
    }
    field user_file type int {
        indexing: summary | attribute
        rank: filter
        attribute: fast-search
    }
    field user_folder type int {
        indexing: summary | attribute
        rank: filter
        attribute: fast-search
    }
}

# If using different tokenization settings, the fieldset has to be removed, and the field must
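For orientation, a hedged sketch of the kind of filter clause these new integer fields enable; the exact strings are produced by _build_int_or_filters later in this diff, and the ids below are made up:

# Hypothetical filter fragments mirroring _build_int_or_filters ("key = value" joined by "or"):
user_file_ids = [42, 43]
user_folder_ids = [7]

file_clause = "(" + " or ".join(f"user_file = {i}" for i in user_file_ids) + ") and "
folder_clause = "(" + " or ".join(f"user_folder = {i}" for i in user_folder_ids) + ") and "
# -> '(user_file = 42 or user_file = 43) and ' and '(user_folder = 7) and '
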
@ -36,6 +36,7 @@ from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
|
||||
from onyx.document_index.interfaces import UpdateRequest
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.document_index.interfaces import VespaDocumentUserFields
|
||||
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
|
||||
from onyx.document_index.vespa.chunk_retrieval import (
|
||||
parallel_visit_api_retrieval,
|
||||
@ -70,6 +71,8 @@ from onyx.document_index.vespa_constants import NUM_THREADS
|
||||
from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
|
||||
from onyx.document_index.vespa_constants import VESPA_TIMEOUT
|
||||
@ -592,7 +595,8 @@ class VespaIndex(DocumentIndex):
|
||||
self,
|
||||
doc_chunk_id: UUID,
|
||||
index_name: str,
|
||||
fields: VespaDocumentFields,
|
||||
fields: VespaDocumentFields | None,
|
||||
user_fields: VespaDocumentUserFields | None,
|
||||
doc_id: str,
|
||||
http_client: httpx.Client,
|
||||
) -> None:
|
||||
@ -603,21 +607,31 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
update_dict: dict[str, dict] = {"fields": {}}
|
||||
|
||||
if fields.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": fields.boost}
|
||||
if fields is not None:
|
||||
if fields.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": fields.boost}
|
||||
|
||||
if fields.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {document_set: 1 for document_set in fields.document_sets}
|
||||
}
|
||||
if fields.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {document_set: 1 for document_set in fields.document_sets}
|
||||
}
|
||||
|
||||
if fields.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
|
||||
}
|
||||
if fields.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
|
||||
}
|
||||
|
||||
if fields.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
|
||||
if fields.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
|
||||
|
||||
if user_fields is not None:
|
||||
if user_fields.user_file_id is not None:
|
||||
update_dict["fields"][USER_FILE] = {"assign": user_fields.user_file_id}
|
||||
|
||||
if user_fields.user_folder_id is not None:
|
||||
update_dict["fields"][USER_FOLDER] = {
|
||||
"assign": user_fields.user_folder_id
|
||||
}
|
||||
|
||||
if not update_dict["fields"]:
|
||||
logger.error("Update request received but nothing to update.")
|
||||
@ -649,7 +663,8 @@ class VespaIndex(DocumentIndex):
|
||||
*,
|
||||
chunk_count: int | None,
|
||||
tenant_id: str,
|
||||
fields: VespaDocumentFields,
|
||||
fields: VespaDocumentFields | None,
|
||||
user_fields: VespaDocumentUserFields | None,
|
||||
) -> int:
|
||||
"""Note: if the document id does not exist, the update will be a no-op and the
|
||||
function will complete with no errors or exceptions.
|
||||
@ -682,7 +697,12 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
for doc_chunk_id in doc_chunk_ids:
|
||||
self._update_single_chunk(
|
||||
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
||||
doc_chunk_id,
|
||||
index_name,
|
||||
fields,
|
||||
user_fields,
|
||||
doc_id,
|
||||
httpx_client,
|
||||
)
|
||||
|
||||
return doc_chunk_count
|
||||
@ -723,6 +743,7 @@ class VespaIndex(DocumentIndex):
|
||||
tenant_id=tenant_id,
|
||||
large_chunks_enabled=large_chunks_enabled,
|
||||
)
|
||||
|
||||
for doc_chunk_ids_batch in batch_generator(
|
||||
chunks_to_delete, BATCH_SIZE
|
||||
):
|
||||
|
@ -51,6 +51,8 @@ from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import TITLE
|
||||
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -205,6 +207,8 @@ def _index_vespa_chunk(
|
||||
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
|
||||
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
|
||||
IMAGE_FILE_NAME: chunk.image_file_name,
|
||||
USER_FILE: chunk.user_file if chunk.user_file is not None else None,
|
||||
USER_FOLDER: chunk.user_folder if chunk.user_folder is not None else None,
|
||||
BOOST: chunk.boost,
|
||||
AGGREGATED_CHUNK_BOOST_FACTOR: chunk.aggregated_chunk_boost_factor,
|
||||
}
|
||||
|
@ -5,7 +5,6 @@ from datetime import timezone
|
||||
from onyx.configs.constants import INDEX_SEPARATOR
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
|
||||
from onyx.document_index.vespa_constants import CHUNK_ID
|
||||
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID
|
||||
@ -14,6 +13,8 @@ from onyx.document_index.vespa_constants import HIDDEN
|
||||
from onyx.document_index.vespa_constants import METADATA_LIST
|
||||
from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
@ -27,14 +28,26 @@ def build_vespa_filters(
|
||||
remove_trailing_and: bool = False, # Set to True when using as a complete Vespa query
|
||||
) -> str:
|
||||
def _build_or_filters(key: str, vals: list[str] | None) -> str:
|
||||
if vals is None:
|
||||
"""For string-based 'contains' filters, e.g. WSET fields or array<string> fields."""
|
||||
if not key or not vals:
|
||||
return ""
|
||||
eq_elems = [f'{key} contains "{val}"' for val in vals if val]
|
||||
if not eq_elems:
|
||||
return ""
|
||||
or_clause = " or ".join(eq_elems)
|
||||
return f"({or_clause}) and "
|
||||
|
||||
def _build_int_or_filters(key: str, vals: list[int] | None) -> str:
|
||||
"""
|
||||
For an integer field filter.
|
||||
If vals is not None, we want *only* docs whose key matches one of vals.
|
||||
"""
|
||||
# If `vals` is None => skip the filter entirely
|
||||
if vals is None or not vals:
|
||||
return ""
|
||||
|
||||
valid_vals = [val for val in vals if val]
|
||||
if not key or not valid_vals:
|
||||
return ""
|
||||
|
||||
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
|
||||
# Otherwise build the OR filter
|
||||
eq_elems = [f"{key} = {val}" for val in vals]
|
||||
or_clause = " or ".join(eq_elems)
|
||||
result = f"({or_clause}) and "
|
||||
|
||||
@ -42,53 +55,57 @@ def build_vespa_filters(
|
||||
|
||||
def _build_time_filter(
|
||||
cutoff: datetime | None,
|
||||
# Slightly over 3 Months, approximately 1 fiscal quarter
|
||||
untimed_doc_cutoff: timedelta = timedelta(days=92),
|
||||
) -> str:
|
||||
if not cutoff:
|
||||
return ""
|
||||
|
||||
# For Documents that don't have an updated at, filter them out for queries asking for
|
||||
# very recent documents (3 months) default. Documents that don't have an updated at
|
||||
# time are assigned 3 months for time decay value
|
||||
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
|
||||
cutoff_secs = int(cutoff.timestamp())
|
||||
|
||||
if include_untimed:
|
||||
# Documents without updated_at are assigned -1 as their date
|
||||
return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
|
||||
|
||||
return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "
|
||||
|
||||
# Start building the filter string
|
||||
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
|
||||
|
||||
# If running in multi-tenant mode, we may want to filter by tenant_id
|
||||
# If running in multi-tenant mode
|
||||
if filters.tenant_id and MULTI_TENANT:
|
||||
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
|
||||
|
||||
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
|
||||
if filters.access_control_list is not None:
|
||||
filter_str += _build_or_filters(
|
||||
ACCESS_CONTROL_LIST, filters.access_control_list
|
||||
)
|
||||
# ACL filters
|
||||
# if filters.access_control_list is not None:
|
||||
# filter_str += _build_or_filters(ACCESS_CONTROL_LIST, filters.access_control_list)
|
||||
|
||||
# Source type filters
|
||||
source_strs = (
|
||||
[s.value for s in filters.source_type] if filters.source_type else None
|
||||
)
|
||||
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
|
||||
|
||||
# Tag filters
|
||||
tag_attributes = None
|
||||
tags = filters.tags
|
||||
if tags:
|
||||
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
|
||||
if filters.tags:
|
||||
# build e.g. "tag_key|tag_value"
|
||||
tag_attributes = [
|
||||
f"{tag.tag_key}{INDEX_SEPARATOR}{tag.tag_value}" for tag in filters.tags
|
||||
]
|
||||
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
|
||||
|
||||
# Document sets
|
||||
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
|
||||
|
||||
# New: user_file_ids as integer filters
|
||||
filter_str += _build_int_or_filters(USER_FILE, filters.user_file_ids)
|
||||
|
||||
filter_str += _build_int_or_filters(USER_FOLDER, filters.user_folder_ids)
|
||||
|
||||
# Time filter
|
||||
filter_str += _build_time_filter(filters.time_cutoff)
|
||||
|
||||
# Trim trailing " and "
|
||||
if remove_trailing_and and filter_str.endswith(" and "):
|
||||
filter_str = filter_str[:-5] # We remove the trailing " and "
|
||||
filter_str = filter_str[:-5]
|
||||
|
||||
return filter_str
|
||||
|
||||
|
@ -67,6 +67,8 @@ EMBEDDINGS = "embeddings"
|
||||
TITLE_EMBEDDING = "title_embedding"
|
||||
ACCESS_CONTROL_LIST = "access_control_list"
|
||||
DOCUMENT_SETS = "document_sets"
|
||||
USER_FILE = "user_file"
|
||||
USER_FOLDER = "user_folder"
|
||||
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
|
||||
METADATA = "metadata"
|
||||
METADATA_LIST = "metadata_list"
|
||||
|
@ -37,6 +37,7 @@ def delete_unstructured_api_key() -> None:
|
||||
def _sdk_partition_request(
|
||||
file: IO[Any], file_name: str, **kwargs: Any
|
||||
) -> operations.PartitionRequest:
|
||||
file.seek(0, 0)
|
||||
try:
|
||||
request = operations.PartitionRequest(
|
||||
partition_parameters=shared.PartitionParameters(
|
||||
|
@ -31,6 +31,7 @@ class FileStore(ABC):
|
||||
file_origin: FileOrigin,
|
||||
file_type: str,
|
||||
file_metadata: dict | None = None,
|
||||
commit: bool = True,
|
||||
) -> None:
|
||||
"""
|
||||
Save a file to the blob store
|
||||
@ -42,6 +43,8 @@ class FileStore(ABC):
|
||||
- display_name: Display name of the file
|
||||
- file_origin: Origin of the file
|
||||
- file_type: Type of the file
|
||||
- file_metadata: Additional metadata for the file
|
||||
- commit: Whether to commit the transaction after saving the file
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@ -90,6 +93,7 @@ class PostgresBackedFileStore(FileStore):
|
||||
file_origin: FileOrigin,
|
||||
file_type: str,
|
||||
file_metadata: dict | None = None,
|
||||
commit: bool = True,
|
||||
) -> None:
|
||||
try:
|
||||
# The large objects in postgres are saved as special objects can be listed with
|
||||
@ -104,7 +108,8 @@ class PostgresBackedFileStore(FileStore):
|
||||
db_session=self.db_session,
|
||||
file_metadata=file_metadata,
|
||||
)
|
||||
self.db_session.commit()
|
||||
if commit:
|
||||
self.db_session.commit()
|
||||
except Exception:
|
||||
self.db_session.rollback()
|
||||
raise
|
||||
|
@ -14,6 +14,7 @@ class ChatFileType(str, Enum):
|
||||
# Plain text only contain the text
|
||||
PLAIN_TEXT = "plain_text"
|
||||
CSV = "csv"
|
||||
USER_KNOWLEDGE = "user_knowledge"
|
||||
|
||||
|
||||
class FileDescriptor(TypedDict):
|
||||
|
@ -10,12 +10,62 @@ from sqlalchemy.orm import Session
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserFolder
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.file_store.models import ChatFileType
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.file_store.models import InMemoryChatFile
|
||||
from onyx.utils.b64 import get_image_type
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def user_file_id_to_plaintext_file_name(user_file_id: int) -> str:
    """Generate a consistent file name for storing plaintext content of a user file."""
    return f"plaintext_{user_file_id}"


def store_user_file_plaintext(
    user_file_id: int, plaintext_content: str, db_session: Session
) -> bool:
    """
    Store plaintext content for a user file in the file store.

    Args:
        user_file_id: The ID of the user file
        plaintext_content: The plaintext content to store
        db_session: The database session

    Returns:
        bool: True if storage was successful, False otherwise
    """
    # Skip empty content
    if not plaintext_content:
        return False

    # Get plaintext file name
    plaintext_file_name = user_file_id_to_plaintext_file_name(user_file_id)

    # Store the plaintext in the file store
    file_store = get_default_file_store(db_session)
    file_content = BytesIO(plaintext_content.encode("utf-8"))
    try:
        file_store.save_file(
            file_name=plaintext_file_name,
            content=file_content,
            display_name=f"Plaintext for user file {user_file_id}",
            file_origin=FileOrigin.PLAINTEXT_CACHE,
            file_type="text/plain",
            commit=False,
        )
        return True
    except Exception as e:
        logger.warning(f"Failed to store plaintext for user file {user_file_id}: {e}")
        return False

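A hedged round-trip sketch (the call site and the id are assumptions) of caching extracted text and reading it back through the same naming helper:

# Hypothetical: cache extracted text for user file 42, then read it back.
ok = store_user_file_plaintext(
    user_file_id=42,
    plaintext_content="extracted document text ...",
    db_session=db_session,
)
if ok:
    db_session.commit()  # save_file above is called with commit=False

file_store = get_default_file_store(db_session)
cached = file_store.read_file(user_file_id_to_plaintext_file_name(42), mode="b").read()
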
def load_chat_file(
|
||||
file_descriptor: FileDescriptor, db_session: Session
|
||||
@ -53,6 +103,83 @@ def load_all_chat_files(
|
||||
return files
|
||||
|
||||
|
||||
def load_user_folder(folder_id: int, db_session: Session) -> list[InMemoryChatFile]:
    user_files = (
        db_session.query(UserFile).filter(UserFile.folder_id == folder_id).all()
    )
    return [load_user_file(file.id, db_session) for file in user_files]


def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
    user_file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
    if not user_file:
        raise ValueError(f"User file with id {file_id} not found")

    # Try to load plaintext version first
    file_store = get_default_file_store(db_session)
    plaintext_file_name = user_file_id_to_plaintext_file_name(file_id)

    try:
        file_io = file_store.read_file(plaintext_file_name, mode="b")
        return InMemoryChatFile(
            file_id=str(user_file.file_id),
            content=file_io.read(),
            file_type=ChatFileType.USER_KNOWLEDGE,
            filename=user_file.name,
        )
    except Exception as e:
        logger.warning(
            f"Failed to load plaintext file {plaintext_file_name}, defaulting to original file: {e}"
        )
        # Fall back to original file if plaintext not available
        file_io = file_store.read_file(user_file.file_id, mode="b")
        return InMemoryChatFile(
            file_id=str(user_file.file_id),
            content=file_io.read(),
            file_type=ChatFileType.USER_KNOWLEDGE,
            filename=user_file.name,
        )

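A short illustrative call (the id is an assumption) showing the plaintext-first fallback from the caller's point of view:

# Hypothetical: load one user file for chat; falls back to the original bytes
# automatically if no cached plaintext exists yet.
chat_file = load_user_file(42, db_session)
assert chat_file.file_type == ChatFileType.USER_KNOWLEDGE
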
def load_all_user_files(
    user_file_ids: list[int],
    user_folder_ids: list[int],
    db_session: Session,
) -> list[InMemoryChatFile]:
    return cast(
        list[InMemoryChatFile],
        run_functions_tuples_in_parallel(
            [(load_user_file, (file_id, db_session)) for file_id in user_file_ids]
        )
        + [
            file
            for folder_id in user_folder_ids
            for file in load_user_folder(folder_id, db_session)
        ],
    )


def load_all_user_file_files(
    user_file_ids: list[int],
    user_folder_ids: list[int],
    db_session: Session,
) -> list[UserFile]:
    user_files: list[UserFile] = []
    for user_file_id in user_file_ids:
        user_file = (
            db_session.query(UserFile).filter(UserFile.id == user_file_id).first()
        )
        if user_file is not None:
            user_files.append(user_file)
    for user_folder_id in user_folder_ids:
        user_files.extend(
            db_session.query(UserFile)
            .filter(UserFile.folder_id == user_folder_id)
            .all()
        )
    return user_files

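An illustrative call (ids are invented) combining explicit file selections with whole folders, using both loaders side by side:

# Hypothetical selection: two individual files plus everything in folder 7.
in_memory_files = load_all_user_files(
    user_file_ids=[42, 43],
    user_folder_ids=[7],
    db_session=db_session,
)
db_models = load_all_user_file_files(
    user_file_ids=[42, 43],
    user_folder_ids=[7],
    db_session=db_session,
)
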
def save_file_from_url(url: str) -> str:
|
||||
"""NOTE: using multiple sessions here, since this is often called
|
||||
using multithreading. In practice, sharing a session has resulted in
|
||||
@ -71,6 +198,7 @@ def save_file_from_url(url: str) -> str:
|
||||
display_name="GeneratedImage",
|
||||
file_origin=FileOrigin.CHAT_IMAGE_GEN,
|
||||
file_type="image/png;base64",
|
||||
commit=True,
|
||||
)
|
||||
return unique_id
|
||||
|
||||
@ -85,6 +213,7 @@ def save_file_from_base64(base64_string: str) -> str:
|
||||
display_name="GeneratedImage",
|
||||
file_origin=FileOrigin.CHAT_IMAGE_GEN,
|
||||
file_type=get_image_type(base64_string),
|
||||
commit=True,
|
||||
)
|
||||
return unique_id
|
||||
|
||||
@ -128,3 +257,39 @@ def save_files(urls: list[str], base64_files: list[str]) -> list[str]:
|
||||
]
|
||||
|
||||
return run_functions_tuples_in_parallel(funcs)
|
||||
|
||||
|
||||
def load_all_persona_files_for_chat(
|
||||
persona_id: int, db_session: Session
|
||||
) -> tuple[list[InMemoryChatFile], list[int]]:
|
||||
from onyx.db.models import Persona
|
||||
from sqlalchemy.orm import joinedload
|
||||
|
||||
persona = (
|
||||
db_session.query(Persona)
|
||||
.filter(Persona.id == persona_id)
|
||||
.options(
|
||||
joinedload(Persona.user_files),
|
||||
joinedload(Persona.user_folders).joinedload(UserFolder.files),
|
||||
)
|
||||
.one()
|
||||
)
|
||||
|
||||
persona_file_calls = [
|
||||
(load_user_file, (user_file.id, db_session)) for user_file in persona.user_files
|
||||
]
|
||||
persona_loaded_files = run_functions_tuples_in_parallel(persona_file_calls)
|
||||
|
||||
persona_folder_files = []
|
||||
persona_folder_file_ids = []
|
||||
for user_folder in persona.user_folders:
|
||||
folder_files = load_user_folder(user_folder.id, db_session)
|
||||
persona_folder_files.extend(folder_files)
|
||||
persona_folder_file_ids.extend([file.id for file in user_folder.files])
|
||||
|
||||
persona_files = list(persona_loaded_files) + persona_folder_files
|
||||
persona_file_ids = [
|
||||
file.id for file in persona.user_files
|
||||
] + persona_folder_file_ids
|
||||
|
||||
return persona_files, persona_file_ids
|
||||
|
@ -49,6 +49,9 @@ from onyx.db.pg_file_store import read_lobj
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.tag import create_or_add_document_tag
|
||||
from onyx.db.tag import create_or_add_document_tag_list
|
||||
from onyx.db.user_documents import fetch_user_files_for_documents
|
||||
from onyx.db.user_documents import fetch_user_folders_for_documents
|
||||
from onyx.db.user_documents import update_user_file_token_count__no_commit
|
||||
from onyx.document_index.document_index_utils import (
|
||||
get_multipass_config,
|
||||
)
|
||||
@ -56,6 +59,7 @@ from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentMetadata
|
||||
from onyx.document_index.interfaces import IndexBatchParams
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.file_store.utils import store_user_file_plaintext
|
||||
from onyx.indexing.chunker import Chunker
|
||||
from onyx.indexing.embedder import embed_chunks_with_failure_handling
|
||||
from onyx.indexing.embedder import IndexingEmbedder
|
||||
@ -67,6 +71,7 @@ from onyx.indexing.models import UpdatableChunkData
|
||||
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
|
||||
from onyx.llm.chat_llm import LLMRateLimitError
|
||||
from onyx.llm.factory import get_default_llm_with_vision
|
||||
from onyx.llm.factory import get_default_llms
|
||||
from onyx.llm.factory import get_llm_for_contextual_rag
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.utils import get_max_input_tokens
|
||||
@ -769,6 +774,7 @@ def index_doc_batch(
|
||||
# NOTE: no special handling for failures here, since the chunker is not
|
||||
# a common source of failure for the indexing pipeline
|
||||
chunks: list[DocAwareChunk] = chunker.chunk(ctx.indexable_docs)
|
||||
llm_tokenizer: BaseTokenizer | None = None
|
||||
|
||||
# contextual RAG
|
||||
if enable_contextual_rag:
|
||||
@ -826,6 +832,15 @@ def index_doc_batch(
|
||||
)
|
||||
}
|
||||
|
||||
doc_id_to_user_file_id: dict[str, int | None] = fetch_user_files_for_documents(
|
||||
document_ids=updatable_ids, db_session=db_session
|
||||
)
|
||||
doc_id_to_user_folder_id: dict[
|
||||
str, int | None
|
||||
] = fetch_user_folders_for_documents(
|
||||
document_ids=updatable_ids, db_session=db_session
|
||||
)
|
||||
|
||||
doc_id_to_previous_chunk_cnt: dict[str, int | None] = {
|
||||
document_id: chunk_count
|
||||
for document_id, chunk_count in fetch_chunk_counts_for_documents(
|
||||
@ -845,6 +860,48 @@ def index_doc_batch(
|
||||
for document_id in updatable_ids
|
||||
}
|
||||
|
||||
try:
|
||||
llm, _ = get_default_llms()
|
||||
|
||||
llm_tokenizer = get_tokenizer(
|
||||
model_name=llm.config.model_name,
|
||||
provider_type=llm.config.model_provider,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting tokenizer: {e}")
|
||||
llm_tokenizer = None
|
||||
|
||||
# Calculate token counts for each document by combining all its chunks' content
|
||||
user_file_id_to_token_count: dict[int, int | None] = {}
|
||||
user_file_id_to_raw_text: dict[int, str] = {}
|
||||
for document_id in updatable_ids:
|
||||
# Only calculate token counts for documents that have a user file ID
|
||||
if (
|
||||
document_id in doc_id_to_user_file_id
|
||||
and doc_id_to_user_file_id[document_id] is not None
|
||||
):
|
||||
user_file_id = doc_id_to_user_file_id[document_id]
|
||||
if not user_file_id:
|
||||
continue
|
||||
document_chunks = [
|
||||
chunk
|
||||
for chunk in chunks_with_embeddings
|
||||
if chunk.source_document.id == document_id
|
||||
]
|
||||
if document_chunks:
|
||||
combined_content = " ".join(
|
||||
[chunk.content for chunk in document_chunks]
|
||||
)
|
||||
token_count = (
|
||||
len(llm_tokenizer.encode(combined_content))
|
||||
if llm_tokenizer
|
||||
else 0
|
||||
)
|
||||
user_file_id_to_token_count[user_file_id] = token_count
|
||||
user_file_id_to_raw_text[user_file_id] = combined_content
|
||||
else:
|
||||
user_file_id_to_token_count[user_file_id] = None
|
||||
|
||||
# we're concerned about race conditions where multiple simultaneous indexings might result
|
||||
# in one set of metadata overwriting another one in vespa.
|
||||
# we still write data here for the immediate and most likely correct sync, but
|
||||
@ -857,6 +914,10 @@ def index_doc_batch(
|
||||
document_sets=set(
|
||||
doc_id_to_document_set.get(chunk.source_document.id, [])
|
||||
),
|
||||
user_file=doc_id_to_user_file_id.get(chunk.source_document.id, None),
|
||||
user_folder=doc_id_to_user_folder_id.get(
|
||||
chunk.source_document.id, None
|
||||
),
|
||||
boost=(
|
||||
ctx.id_to_db_doc_map[chunk.source_document.id].boost
|
||||
if chunk.source_document.id in ctx.id_to_db_doc_map
|
||||
@ -938,6 +999,11 @@ def index_doc_batch(
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
update_user_file_token_count__no_commit(
|
||||
user_file_id_to_token_count=user_file_id_to_token_count,
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
# these documents can now be counted as part of the CC Pairs
|
||||
# document count, so we need to mark them as indexed
|
||||
# NOTE: even documents we skipped since they were already up
|
||||
@ -949,12 +1015,22 @@ def index_doc_batch(
|
||||
document_ids=[doc.id for doc in filtered_documents],
|
||||
db_session=db_session,
|
||||
)
|
||||
# Store the plaintext in the file store for faster retrieval
|
||||
for user_file_id, raw_text in user_file_id_to_raw_text.items():
|
||||
# Use the dedicated function to store plaintext
|
||||
store_user_file_plaintext(
|
||||
user_file_id=user_file_id,
|
||||
plaintext_content=raw_text,
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
# save the chunk boost components to postgres
|
||||
update_chunk_boost_components__no_commit(
|
||||
chunk_data=updatable_chunk_data, db_session=db_session
|
||||
)
|
||||
|
||||
# Pause user file ccpairs
|
||||
|
||||
db_session.commit()
|
||||
|
||||
result = IndexingPipelineResult(
|
||||
|
@ -100,6 +100,8 @@ class DocMetadataAwareIndexChunk(IndexChunk):
|
||||
tenant_id: str
|
||||
access: "DocumentAccess"
|
||||
document_sets: set[str]
|
||||
user_file: int | None
|
||||
user_folder: int | None
|
||||
boost: int
|
||||
aggregated_chunk_boost_factor: float
|
||||
|
||||
@ -109,6 +111,8 @@ class DocMetadataAwareIndexChunk(IndexChunk):
|
||||
index_chunk: IndexChunk,
|
||||
access: "DocumentAccess",
|
||||
document_sets: set[str],
|
||||
user_file: int | None,
|
||||
user_folder: int | None,
|
||||
boost: int,
|
||||
aggregated_chunk_boost_factor: float,
|
||||
tenant_id: str,
|
||||
@ -118,6 +122,8 @@ class DocMetadataAwareIndexChunk(IndexChunk):
|
||||
**index_chunk_data,
|
||||
access=access,
|
||||
document_sets=document_sets,
|
||||
user_file=user_file,
|
||||
user_folder=user_folder,
|
||||
boost=boost,
|
||||
aggregated_chunk_boost_factor=aggregated_chunk_boost_factor,
|
||||
tenant_id=tenant_id,
|
||||
|
@ -1,4 +1,5 @@
|
||||
import copy
|
||||
import io
|
||||
import json
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
@ -37,6 +38,7 @@ from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
|
||||
from onyx.configs.model_configs import GEN_AI_MAX_TOKENS
|
||||
from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
|
||||
from onyx.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_store.models import ChatFileType
|
||||
from onyx.file_store.models import InMemoryChatFile
|
||||
from onyx.llm.interfaces import LLM
|
||||
@ -129,7 +131,12 @@ def _build_content(
|
||||
text_files = [
|
||||
file
|
||||
for file in files
|
||||
if file.file_type in (ChatFileType.PLAIN_TEXT, ChatFileType.CSV)
|
||||
if file.file_type
|
||||
in (
|
||||
ChatFileType.PLAIN_TEXT,
|
||||
ChatFileType.CSV,
|
||||
ChatFileType.USER_KNOWLEDGE,
|
||||
)
|
||||
]
|
||||
|
||||
if not text_files:
|
||||
@ -137,7 +144,18 @@ def _build_content(
|
||||
|
||||
final_message_with_files = "FILES:\n\n"
|
||||
for file in text_files:
|
||||
file_content = file.content.decode("utf-8")
|
||||
try:
|
||||
file_content = file.content.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
# Try to decode as binary
|
||||
try:
|
||||
file_content, _, _ = read_pdf_file(io.BytesIO(file.content))
|
||||
except Exception:
|
||||
file_content = f"[Binary file content - {file.file_type} format]"
|
||||
logger.exception(
|
||||
f"Could not decode binary file content for file type: {file.file_type}"
|
||||
)
|
||||
# logger.warning(f"Could not decode binary file content for file type: {file.file_type}")
|
||||
file_name_section = f"DOCUMENT: {file.filename}\n" if file.filename else ""
|
||||
final_message_with_files += (
|
||||
f"{file_name_section}{CODE_BLOCK_PAT.format(file_content.strip())}\n\n\n"
|
||||
@ -165,7 +183,6 @@ def build_content_with_imgs(
|
||||
|
||||
img_urls = img_urls or []
|
||||
b64_imgs = b64_imgs or []
|
||||
|
||||
message_main_content = _build_content(message, files)
|
||||
|
||||
if exclude_images or (not img_files and not img_urls):
|
||||
@ -413,14 +430,12 @@ def _find_model_obj(model_map: dict, provider: str, model_name: str) -> dict | N
|
||||
for model_name in filtered_model_names:
|
||||
model_obj = model_map.get(f"{provider}/{model_name}")
|
||||
if model_obj:
|
||||
logger.debug(f"Using model object for {provider}/{model_name}")
|
||||
return model_obj
|
||||
|
||||
# Then try all model names without provider prefix
|
||||
for model_name in filtered_model_names:
|
||||
model_obj = model_map.get(model_name)
|
||||
if model_obj:
|
||||
logger.debug(f"Using model object for {model_name}")
|
||||
return model_obj
|
||||
|
||||
return None
|
||||
@ -516,14 +531,10 @@ def get_llm_max_tokens(
|
||||
|
||||
if "max_input_tokens" in model_obj:
|
||||
max_tokens = model_obj["max_input_tokens"]
|
||||
logger.debug(
|
||||
f"Max tokens for {model_name}: {max_tokens} (from max_input_tokens)"
|
||||
)
|
||||
return max_tokens
|
||||
|
||||
if "max_tokens" in model_obj:
|
||||
max_tokens = model_obj["max_tokens"]
|
||||
logger.debug(f"Max tokens for {model_name}: {max_tokens} (from max_tokens)")
|
||||
return max_tokens
|
||||
|
||||
logger.error(f"No max tokens found for LLM: {model_name}")
|
||||
@ -545,21 +556,16 @@ def get_llm_max_output_tokens(
|
||||
model_obj = model_map.get(f"{model_provider}/{model_name}")
|
||||
if not model_obj:
|
||||
model_obj = model_map[model_name]
|
||||
logger.debug(f"Using model object for {model_name}")
|
||||
else:
|
||||
logger.debug(f"Using model object for {model_provider}/{model_name}")
|
||||
pass
|
||||
|
||||
if "max_output_tokens" in model_obj:
|
||||
max_output_tokens = model_obj["max_output_tokens"]
|
||||
logger.info(f"Max output tokens for {model_name}: {max_output_tokens}")
|
||||
return max_output_tokens
|
||||
|
||||
# Fallback to a fraction of max_tokens if max_output_tokens is not specified
|
||||
if "max_tokens" in model_obj:
|
||||
max_output_tokens = int(model_obj["max_tokens"] * 0.1)
|
||||
logger.info(
|
||||
f"Fallback max output tokens for {model_name}: {max_output_tokens} (10% of max_tokens)"
|
||||
)
|
||||
return max_output_tokens
|
||||
|
||||
logger.error(f"No max output tokens found for LLM: {model_name}")
|
||||
|
@ -97,6 +97,7 @@ from onyx.server.settings.api import basic_router as settings_router
|
||||
from onyx.server.token_rate_limits.api import (
|
||||
router as token_rate_limit_settings_router,
|
||||
)
|
||||
from onyx.server.user_documents.api import router as user_documents_router
|
||||
from onyx.server.utils import BasicAuthenticationError
|
||||
from onyx.setup import setup_multitenant_onyx
|
||||
from onyx.setup import setup_onyx
|
||||
@ -297,6 +298,7 @@ def get_application() -> FastAPI:
|
||||
include_router_with_global_prefix_prepended(application, input_prompt_router)
|
||||
include_router_with_global_prefix_prepended(application, admin_input_prompt_router)
|
||||
include_router_with_global_prefix_prepended(application, cc_pair_router)
|
||||
include_router_with_global_prefix_prepended(application, user_documents_router)
|
||||
include_router_with_global_prefix_prepended(application, folder_router)
|
||||
include_router_with_global_prefix_prepended(application, document_set_router)
|
||||
include_router_with_global_prefix_prepended(application, search_settings_router)
|
||||
|
@ -594,7 +594,7 @@ def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -
|
||||
bot_tag_id = get_onyx_bot_slack_bot_id(client.web_client)
|
||||
if event_type == "message":
|
||||
is_dm = event.get("channel_type") == "im"
|
||||
is_tagged = bot_tag_id and bot_tag_id in msg
|
||||
is_tagged = bot_tag_id and f"<@{bot_tag_id}>" in msg
|
||||
is_onyx_bot_msg = bot_tag_id and bot_tag_id in event.get("user", "")
|
||||
|
||||
# OnyxBot should never respond to itself
|
||||
@ -727,7 +727,11 @@ def build_request_details(
|
||||
event = cast(dict[str, Any], req.payload["event"])
|
||||
msg = cast(str, event["text"])
|
||||
channel = cast(str, event["channel"])
|
||||
tagged = event.get("type") == "app_mention"
|
||||
# Check for both app_mention events and messages containing bot tag
|
||||
bot_tag_id = get_onyx_bot_slack_bot_id(client.web_client)
|
||||
tagged = (event.get("type") == "app_mention") or (
|
||||
event.get("type") == "message" and bot_tag_id and f"<@{bot_tag_id}>" in msg
|
||||
)
|
||||
message_ts = event.get("ts")
|
||||
thread_ts = event.get("thread_ts")
|
||||
sender_id = event.get("user") or None
|
||||
|
@ -145,7 +145,7 @@ def update_emote_react(
|
||||
|
||||
def remove_onyx_bot_tag(message_str: str, client: WebClient) -> str:
|
||||
bot_tag_id = get_onyx_bot_slack_bot_id(web_client=client)
|
||||
return re.sub(rf"<@{bot_tag_id}>\s", "", message_str)
|
||||
return re.sub(rf"<@{bot_tag_id}>\s*", "", message_str)
|
||||
|
||||
|
||||
def _check_for_url_in_block(block: Block) -> bool:
|
||||
|
@ -98,6 +98,8 @@ def _create_indexable_chunks(
|
||||
tenant_id=tenant_id if MULTI_TENANT else POSTGRES_DEFAULT_SCHEMA,
|
||||
access=default_public_access,
|
||||
document_sets=set(),
|
||||
user_file=None,
|
||||
user_folder=None,
|
||||
boost=DEFAULT_BOOST,
|
||||
large_chunk_id=None,
|
||||
image_file_name=None,
|
||||
|
@ -5,6 +5,7 @@ from onyx.configs.chat_configs import INPUT_PROMPT_YAML
|
||||
from onyx.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
|
||||
from onyx.configs.chat_configs import PERSONAS_YAML
|
||||
from onyx.configs.chat_configs import PROMPTS_YAML
|
||||
from onyx.configs.chat_configs import USER_FOLDERS_YAML
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.db.document_set import get_or_create_document_set_by_name
|
||||
from onyx.db.input_prompt import insert_input_prompt_if_not_exists
|
||||
@ -15,6 +16,29 @@ from onyx.db.models import Tool as ToolDBModel
|
||||
from onyx.db.persona import upsert_persona
|
||||
from onyx.db.prompts import get_prompt_by_name
|
||||
from onyx.db.prompts import upsert_prompt
|
||||
from onyx.db.user_documents import upsert_user_folder
|
||||
|
||||
|
||||
def load_user_folders_from_yaml(
|
||||
db_session: Session,
|
||||
user_folders_yaml: str = USER_FOLDERS_YAML,
|
||||
) -> None:
|
||||
with open(user_folders_yaml, "r") as file:
|
||||
data = yaml.safe_load(file)
|
||||
|
||||
all_user_folders = data.get("user_folders", [])
|
||||
for user_folder in all_user_folders:
|
||||
upsert_user_folder(
|
||||
db_session=db_session,
|
||||
id=user_folder.get("id"),
|
||||
name=user_folder.get("name"),
|
||||
description=user_folder.get("description"),
|
||||
created_at=user_folder.get("created_at"),
|
||||
user=user_folder.get("user"),
|
||||
files=user_folder.get("files"),
|
||||
assistants=user_folder.get("assistants"),
|
||||
)
|
||||
db_session.flush()
|
||||
|
||||
|
||||
def load_prompts_from_yaml(
|
||||
@ -179,3 +203,4 @@ def load_chat_yamls(
|
||||
load_prompts_from_yaml(db_session, prompt_yaml)
|
||||
load_personas_from_yaml(db_session, personas_yaml)
|
||||
load_input_prompts_from_yaml(db_session, input_prompts_yaml)
|
||||
load_user_folders_from_yaml(db_session)
|
||||
|
6
backend/onyx/seeding/user_folders.yaml
Normal file
@ -0,0 +1,6 @@
user_folders:
  - id: -1
    name: "Recent Documents"
    description: "Documents uploaded by the user"
    files: []
    assistants: []
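A hedged illustration (session is an assumption) of what seeding this entry amounts to, mirroring the load_user_folders_from_yaml loader earlier in this diff:

# Hypothetical: the loader above reduces to roughly this for the seeded entry.
upsert_user_folder(
    db_session=db_session,
    id=-1,  # id taken verbatim from the YAML seed
    name="Recent Documents",
    description="Documents uploaded by the user",
    files=[],
    assistants=[],
)
db_session.flush()
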
@ -389,12 +389,7 @@ def check_drive_tokens(
|
||||
return AuthStatus(authenticated=True)
|
||||
|
||||
|
||||
@router.post("/admin/connector/file/upload")
|
||||
def upload_files(
|
||||
files: list[UploadFile],
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> FileUploadResponse:
|
||||
def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResponse:
|
||||
for file in files:
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="File name cannot be empty")
|
||||
@ -455,6 +450,15 @@ def upload_files(
|
||||
return FileUploadResponse(file_paths=deduped_file_paths)
|
||||
|
||||
|
||||
@router.post("/admin/connector/file/upload")
|
||||
def upload_files_api(
|
||||
files: list[UploadFile],
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> FileUploadResponse:
|
||||
return upload_files(files, db_session)
|
||||
|
||||
|
||||
@router.get("/admin/connector")
|
||||
def get_connectors_by_credential(
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
@ -758,6 +762,16 @@ def get_connector_indexing_status(
|
||||
(connector.id, credential.id)
|
||||
)
|
||||
|
||||
# Safely get the owner email, handling detached instances
|
||||
owner_email = ""
|
||||
try:
|
||||
if credential.user:
|
||||
owner_email = credential.user.email
|
||||
except Exception:
|
||||
# If there's any error accessing the user (like DetachedInstanceError),
|
||||
# we'll just use an empty string for the owner email
|
||||
pass
|
||||
|
||||
indexing_statuses.append(
|
||||
ConnectorIndexingStatus(
|
||||
cc_pair_id=cc_pair.id,
|
||||
@ -769,7 +783,7 @@ def get_connector_indexing_status(
|
||||
),
|
||||
credential=CredentialSnapshot.from_credential_db_model(credential),
|
||||
access_type=cc_pair.access_type,
|
||||
owner=credential.user.email if credential.user else "",
|
||||
owner=owner_email,
|
||||
groups=group_cc_pair_relationships_dict.get(cc_pair.id, []),
|
||||
last_finished_status=(
|
||||
latest_finished_attempt.status if latest_finished_attempt else None
|
||||
@ -1042,55 +1056,16 @@ def connector_run_once(
|
||||
status_code=400,
|
||||
detail="Connector has no valid credentials, cannot create index attempts.",
|
||||
)
|
||||
|
||||
# Prevents index attempts for cc pairs that already have an index attempt currently running
|
||||
skipped_credentials = [
|
||||
credential_id
|
||||
for credential_id in credential_ids
|
||||
if get_index_attempts_for_cc_pair(
|
||||
cc_pair_identifier=ConnectorCredentialPairIdentifier(
|
||||
connector_id=run_info.connector_id,
|
||||
credential_id=credential_id,
|
||||
),
|
||||
only_current=True,
|
||||
db_session=db_session,
|
||||
disinclude_finished=True,
|
||||
try:
|
||||
num_triggers = trigger_indexing_for_cc_pair(
|
||||
credential_ids,
|
||||
connector_id,
|
||||
run_info.from_beginning,
|
||||
tenant_id,
|
||||
db_session,
|
||||
)
|
||||
]
|
||||
|
||||
connector_credential_pairs = [
|
||||
get_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
for credential_id in credential_ids
|
||||
if credential_id not in skipped_credentials
|
||||
]
|
||||
|
||||
num_triggers = 0
|
||||
for cc_pair in connector_credential_pairs:
|
||||
if cc_pair is not None:
|
||||
indexing_mode = IndexingMode.UPDATE
|
||||
if run_info.from_beginning:
|
||||
indexing_mode = IndexingMode.REINDEX
|
||||
|
||||
mark_ccpair_with_indexing_trigger(cc_pair.id, indexing_mode, db_session)
|
||||
num_triggers += 1
|
||||
|
||||
logger.info(
|
||||
f"connector_run_once - marking cc_pair with indexing trigger: "
|
||||
f"connector={run_info.connector_id} "
|
||||
f"cc_pair={cc_pair.id} "
|
||||
f"indexing_trigger={indexing_mode}"
|
||||
)
|
||||
|
||||
# run the beat task to pick up the triggers immediately
|
||||
primary_app.send_task(
|
||||
OnyxCeleryTask.CHECK_FOR_INDEXING,
|
||||
priority=OnyxCeleryPriority.HIGH,
|
||||
kwargs={"tenant_id": tenant_id},
|
||||
)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
logger.info("connector_run_once - running check_for_indexing")
|
||||
|
||||
@ -1264,3 +1239,85 @@ def get_basic_connector_indexing_status(
|
||||
for cc_pair in cc_pairs
|
||||
if cc_pair.connector.source != DocumentSource.INGESTION_API
|
||||
]
|
||||
|
||||
|
||||
def trigger_indexing_for_cc_pair(
|
||||
specified_credential_ids: list[int],
|
||||
connector_id: int,
|
||||
from_beginning: bool,
|
||||
tenant_id: str,
|
||||
db_session: Session,
|
||||
is_user_file: bool = False,
|
||||
) -> int:
|
||||
try:
|
||||
possible_credential_ids = get_connector_credential_ids(connector_id, db_session)
|
||||
except ValueError as e:
|
||||
raise ValueError(f"Connector by id {connector_id} does not exist: {str(e)}")
|
||||
|
||||
if not specified_credential_ids:
|
||||
credential_ids = possible_credential_ids
|
||||
else:
|
||||
if set(specified_credential_ids).issubset(set(possible_credential_ids)):
|
||||
credential_ids = specified_credential_ids
|
||||
else:
|
||||
raise ValueError(
|
||||
"Not all specified credentials are associated with connector"
|
||||
)
|
||||
|
||||
if not credential_ids:
|
||||
raise ValueError(
|
||||
"Connector has no valid credentials, cannot create index attempts."
|
||||
)
|
||||
|
||||
# Prevents index attempts for cc pairs that already have an index attempt currently running
|
||||
skipped_credentials = [
|
||||
credential_id
|
||||
for credential_id in credential_ids
|
||||
if get_index_attempts_for_cc_pair(
|
||||
cc_pair_identifier=ConnectorCredentialPairIdentifier(
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
),
|
||||
only_current=True,
|
||||
db_session=db_session,
|
||||
disinclude_finished=True,
|
||||
)
|
||||
]
|
||||
|
||||
connector_credential_pairs = [
|
||||
get_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
for credential_id in credential_ids
|
||||
if credential_id not in skipped_credentials
|
||||
]
|
||||
|
||||
num_triggers = 0
|
||||
for cc_pair in connector_credential_pairs:
|
||||
if cc_pair is not None:
|
||||
indexing_mode = IndexingMode.UPDATE
|
||||
if from_beginning:
|
||||
indexing_mode = IndexingMode.REINDEX
|
||||
|
||||
mark_ccpair_with_indexing_trigger(cc_pair.id, indexing_mode, db_session)
|
||||
num_triggers += 1
|
||||
|
||||
logger.info(
|
||||
f"connector_run_once - marking cc_pair with indexing trigger: "
|
||||
f"connector={connector_id} "
|
||||
f"cc_pair={cc_pair.id} "
|
||||
f"indexing_trigger={indexing_mode}"
|
||||
)
|
||||
|
||||
# run the beat task to pick up the triggers immediately
|
||||
priority = OnyxCeleryPriority.HIGHEST if is_user_file else OnyxCeleryPriority.HIGH
|
||||
logger.info(f"Sending indexing check task with priority {priority}")
|
||||
primary_app.send_task(
|
||||
OnyxCeleryTask.CHECK_FOR_INDEXING,
|
||||
priority=priority,
|
||||
kwargs={"tenant_id": tenant_id},
|
||||
)
|
||||
|
||||
return num_triggers
|
||||
|
@ -122,6 +122,7 @@ class CredentialBase(BaseModel):
|
||||
name: str | None = None
|
||||
curator_public: bool = False
|
||||
groups: list[int] = Field(default_factory=list)
|
||||
is_user_file: bool = False
|
||||
|
||||
|
||||
class CredentialSnapshot(CredentialBase):
|
||||
@ -392,7 +393,7 @@ class FileUploadResponse(BaseModel):
|
||||
|
||||
|
||||
class ObjectCreationIdResponse(BaseModel):
|
||||
id: int | str
|
||||
id: int
|
||||
credential: CredentialSnapshot | None = None
|
||||
|
||||
|
||||
|
@ -18,9 +18,9 @@ from onyx.db.models import User
|
||||
from onyx.server.features.folder.models import DeleteFolderOptions
|
||||
from onyx.server.features.folder.models import FolderChatSessionRequest
|
||||
from onyx.server.features.folder.models import FolderCreationRequest
|
||||
from onyx.server.features.folder.models import FolderResponse
|
||||
from onyx.server.features.folder.models import FolderUpdateRequest
|
||||
from onyx.server.features.folder.models import GetUserFoldersResponse
|
||||
from onyx.server.features.folder.models import UserFolderSnapshot
|
||||
from onyx.server.models import DisplayPriorityRequest
|
||||
from onyx.server.query_and_chat.models import ChatSessionDetails
|
||||
|
||||
@ -39,7 +39,7 @@ def get_folders(
|
||||
folders.sort()
|
||||
return GetUserFoldersResponse(
|
||||
folders=[
|
||||
FolderResponse(
|
||||
UserFolderSnapshot(
|
||||
folder_id=folder.id,
|
||||
folder_name=folder.name,
|
||||
display_priority=folder.display_priority,
|
||||
|
@ -5,7 +5,7 @@ from pydantic import BaseModel
|
||||
from onyx.server.query_and_chat.models import ChatSessionDetails
|
||||
|
||||
|
||||
class FolderResponse(BaseModel):
|
||||
class UserFolderSnapshot(BaseModel):
|
||||
folder_id: int
|
||||
folder_name: str | None
|
||||
display_priority: int
|
||||
@ -13,7 +13,7 @@ class FolderResponse(BaseModel):
|
||||
|
||||
|
||||
class GetUserFoldersResponse(BaseModel):
|
||||
folders: list[FolderResponse]
|
||||
folders: list[UserFolderSnapshot]
|
||||
|
||||
|
||||
class FolderCreationRequest(BaseModel):
|
||||
|
@ -59,7 +59,6 @@ from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
admin_router = APIRouter(prefix="/admin/persona")
|
||||
basic_router = APIRouter(prefix="/persona")
|
||||
|
||||
@ -210,6 +209,7 @@ def create_persona(
|
||||
and len(persona_upsert_request.prompt_ids) > 0
|
||||
else None
|
||||
)
|
||||
|
||||
prompt = upsert_prompt(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
|
@ -85,6 +85,8 @@ class PersonaUpsertRequest(BaseModel):
|
||||
label_ids: list[int] | None = None
|
||||
is_default_persona: bool = False
|
||||
display_priority: int | None = None
|
||||
user_file_ids: list[int] | None = None
|
||||
user_folder_ids: list[int] | None = None
|
||||
|
||||
|
||||
class PersonaSnapshot(BaseModel):
|
||||
@ -113,6 +115,8 @@ class PersonaSnapshot(BaseModel):
|
||||
is_default_persona: bool
|
||||
search_start_date: datetime | None = None
|
||||
labels: list["PersonaLabelSnapshot"] = []
|
||||
user_file_ids: list[int] | None = None
|
||||
user_folder_ids: list[int] | None = None
|
||||
|
||||
@classmethod
|
||||
def from_model(
|
||||
@ -161,6 +165,8 @@ class PersonaSnapshot(BaseModel):
|
||||
uploaded_image_id=persona.uploaded_image_id,
|
||||
search_start_date=persona.search_start_date,
|
||||
labels=[PersonaLabelSnapshot.from_model(label) for label in persona.labels],
|
||||
user_file_ids=[file.id for file in persona.user_files],
|
||||
user_folder_ids=[folder.id for folder in persona.user_folders],
|
||||
)
|
||||
|
||||
|
||||
|
@ -1,4 +1,6 @@
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
@ -138,15 +140,29 @@ def list_llm_providers(
|
||||
_: User | None = Depends(current_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[LLMProviderView]:
|
||||
start_time = datetime.now(timezone.utc)
|
||||
logger.debug("Starting to fetch LLM providers")
|
||||
|
||||
llm_provider_list: list[LLMProviderView] = []
|
||||
for llm_provider_model in fetch_existing_llm_providers(db_session):
|
||||
from_model_start = datetime.now(timezone.utc)
|
||||
full_llm_provider = LLMProviderView.from_model(llm_provider_model)
|
||||
from_model_end = datetime.now(timezone.utc)
|
||||
from_model_duration = (from_model_end - from_model_start).total_seconds()
|
||||
logger.debug(
|
||||
f"LLMProviderView.from_model took {from_model_duration:.2f} seconds"
|
||||
)
|
||||
|
||||
if full_llm_provider.api_key:
|
||||
full_llm_provider.api_key = (
|
||||
full_llm_provider.api_key[:4] + "****" + full_llm_provider.api_key[-4:]
|
||||
)
|
||||
llm_provider_list.append(full_llm_provider)
|
||||
|
||||
end_time = datetime.now(timezone.utc)
|
||||
duration = (end_time - start_time).total_seconds()
|
||||
logger.debug(f"Completed fetching LLM providers in {duration:.2f} seconds")
|
||||
|
||||
return llm_provider_list
|
||||
|
||||
|
||||
@ -282,12 +298,25 @@ def list_llm_provider_basics(
|
||||
user: User | None = Depends(current_chat_accessible_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[LLMProviderDescriptor]:
|
||||
return [
|
||||
LLMProviderDescriptor.from_model(llm_provider_model)
|
||||
for llm_provider_model in fetch_existing_llm_providers_for_user(
|
||||
db_session, user
|
||||
start_time = datetime.now(timezone.utc)
|
||||
logger.debug("Starting to fetch basic LLM providers for user")
|
||||
|
||||
llm_provider_list: list[LLMProviderDescriptor] = []
|
||||
for llm_provider_model in fetch_existing_llm_providers_for_user(db_session, user):
|
||||
from_model_start = datetime.now(timezone.utc)
|
||||
full_llm_provider = LLMProviderDescriptor.from_model(llm_provider_model)
|
||||
from_model_end = datetime.now(timezone.utc)
|
||||
from_model_duration = (from_model_end - from_model_start).total_seconds()
|
||||
logger.debug(
|
||||
f"LLMProviderView.from_model took {from_model_duration:.2f} seconds"
|
||||
)
|
||||
]
|
||||
llm_provider_list.append(full_llm_provider)
|
||||
|
||||
end_time = datetime.now(timezone.utc)
|
||||
duration = (end_time - start_time).total_seconds()
|
||||
logger.debug(f"Completed fetching basic LLM providers in {duration:.2f} seconds")
|
||||
|
||||
return llm_provider_list
@admin_router.get("/provider-contextual-cost")
|
||||
|
@ -4,6 +4,7 @@ from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
from onyx.llm.llm_provider_options import fetch_models_for_provider
|
||||
from onyx.llm.utils import get_max_input_tokens
if TYPE_CHECKING:
|
||||
@ -38,24 +39,50 @@ class LLMProviderDescriptor(BaseModel):
|
||||
is_default_vision_provider: bool | None
|
||||
default_vision_model: str | None
|
||||
display_model_names: list[str] | None
|
||||
model_token_limits: dict[str, int] | None = None
|
||||
|
||||
@classmethod
|
||||
def from_model(
|
||||
cls, llm_provider_model: "LLMProviderModel"
|
||||
) -> "LLMProviderDescriptor":
|
||||
return cls(
|
||||
import time
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
model_names = (
|
||||
llm_provider_model.model_names
|
||||
or fetch_models_for_provider(llm_provider_model.provider)
|
||||
or [llm_provider_model.default_model_name]
|
||||
)
|
||||
|
||||
model_token_rate = (
|
||||
{
|
||||
model_name: get_max_input_tokens(
|
||||
model_name, llm_provider_model.provider
|
||||
)
|
||||
for model_name in model_names
|
||||
}
|
||||
if model_names is not None
|
||||
else None
|
||||
)
|
||||
|
||||
result = cls(
|
||||
name=llm_provider_model.name,
|
||||
provider=llm_provider_model.provider,
|
||||
default_model_name=llm_provider_model.default_model_name,
|
||||
fast_default_model_name=llm_provider_model.fast_default_model_name,
|
||||
is_default_provider=llm_provider_model.is_default_provider,
|
||||
model_names=model_names,
|
||||
model_token_limits=model_token_rate,
|
||||
is_default_vision_provider=llm_provider_model.is_default_vision_provider,
|
||||
default_vision_model=llm_provider_model.default_vision_model,
|
||||
model_names=llm_provider_model.model_names
|
||||
or fetch_models_for_provider(llm_provider_model.provider),
|
||||
display_model_names=llm_provider_model.display_model_names,
|
||||
)
|
||||
|
||||
time.time() - start_time
|
||||
|
||||
return result
class LLMProvider(BaseModel):
|
||||
name: str
|
||||
@ -87,6 +114,7 @@ class LLMProviderView(LLMProvider):
|
||||
is_default_provider: bool | None = None
|
||||
is_default_vision_provider: bool | None = None
|
||||
model_names: list[str]
|
||||
model_token_limits: dict[str, int] | None = None
|
||||
|
||||
@classmethod
|
||||
def from_model(cls, llm_provider_model: "LLMProviderModel") -> "LLMProviderView":
|
||||
@ -109,6 +137,14 @@ class LLMProviderView(LLMProvider):
|
||||
or fetch_models_for_provider(llm_provider_model.provider)
|
||||
or [llm_provider_model.default_model_name]
|
||||
),
|
||||
model_token_limits={
|
||||
model_name: get_max_input_tokens(
|
||||
model_name, llm_provider_model.provider
|
||||
)
|
||||
for model_name in llm_provider_model.model_names
|
||||
}
|
||||
if llm_provider_model.model_names is not None
|
||||
else None,
|
||||
is_public=llm_provider_model.is_public,
|
||||
groups=[group.id for group in llm_provider_model.groups],
|
||||
deployment_name=llm_provider_model.deployment_name,
|
||||
|
@ -3,6 +3,7 @@ import datetime
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
@ -29,10 +30,12 @@ from onyx.chat.prompt_builder.citations_prompt import (
|
||||
compute_max_document_tokens_for_persona,
|
||||
)
|
||||
from onyx.configs.app_configs import WEB_DOMAIN
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS
|
||||
from onyx.connectors.models import InputType
|
||||
from onyx.db.chat import add_chats_to_session_from_slack_thread
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
@ -48,12 +51,17 @@ from onyx.db.chat import set_as_latest_chat_message
|
||||
from onyx.db.chat import translate_db_message_to_chat_message_detail
|
||||
from onyx.db.chat import update_chat_session
|
||||
from onyx.db.chat_search import search_chat_sessions
|
||||
from onyx.db.connector import create_connector
|
||||
from onyx.db.connector_credential_pair import add_credential_to_connector
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.feedback import create_chat_message_feedback
|
||||
from onyx.db.feedback import create_doc_retrieval_feedback
|
||||
from onyx.db.models import User
|
||||
from onyx.db.persona import get_persona_by_id
|
||||
from onyx.db.user_documents import create_user_files
|
||||
from onyx.file_processing.extract_file_text import docx_to_txt_filename
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
@ -66,6 +74,8 @@ from onyx.natural_language_processing.utils import get_tokenizer
|
||||
from onyx.secondary_llm_flows.chat_session_naming import (
|
||||
get_renamed_conversation_name,
|
||||
)
|
||||
from onyx.server.documents.models import ConnectorBase
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from onyx.server.query_and_chat.models import ChatFeedbackRequest
|
||||
from onyx.server.query_and_chat.models import ChatMessageIdentifier
|
||||
from onyx.server.query_and_chat.models import ChatRenameRequest
|
||||
@ -91,6 +101,7 @@ from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.telemetry import create_milestone_and_report
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
RECENT_DOCS_FOLDER_ID = -1
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@ -648,7 +659,7 @@ def seed_chat_from_slack(
|
||||
def upload_files_for_chat(
|
||||
files: list[UploadFile],
|
||||
db_session: Session = Depends(get_session),
|
||||
_: User | None = Depends(current_user),
|
||||
user: User | None = Depends(current_user),
|
||||
) -> dict[str, list[FileDescriptor]]:
|
||||
image_content_types = {"image/jpeg", "image/png", "image/webp"}
|
||||
csv_content_types = {"text/csv"}
|
||||
@ -686,17 +697,11 @@ def upload_files_for_chat(
|
||||
if file.content_type in image_content_types:
|
||||
error_detail = "Unsupported image file type. Supported image types include .jpg, .jpeg, .png, .webp."
|
||||
elif file.content_type in text_content_types:
|
||||
error_detail = "Unsupported text file type. Supported text types include .txt, .csv, .md, .mdx, .conf, "
|
||||
".log, .tsv."
|
||||
error_detail = "Unsupported text file type."
|
||||
elif file.content_type in csv_content_types:
|
||||
error_detail = (
|
||||
"Unsupported CSV file type. Supported CSV types include .csv."
|
||||
)
|
||||
error_detail = "Unsupported CSV file type."
|
||||
else:
|
||||
error_detail = (
|
||||
"Unsupported document file type. Supported document types include .pdf, .docx, .pptx, .xlsx, "
|
||||
".json, .xml, .yml, .yaml, .eml, .epub."
|
||||
)
|
||||
error_detail = "Unsupported document file type."
|
||||
raise HTTPException(status_code=400, detail=error_detail)
|
||||
|
||||
if (
|
||||
@ -744,11 +749,12 @@ def upload_files_for_chat(
|
||||
file_type=new_content_type or file_type.value,
|
||||
)
|
||||
|
||||
# if the file is a doc, extract text and store that so we don't need
|
||||
# to re-extract it every time we send a message
|
||||
# 4) If the file is a doc, extract text and store that separately
|
||||
if file_type == ChatFileType.DOC:
|
||||
# Re-wrap bytes in a fresh BytesIO so we start at position 0
|
||||
extracted_text_io = io.BytesIO(file_content)
|
||||
extracted_text = extract_file_text(
|
||||
file=file_content_io, # use the bytes we already read
|
||||
file=extracted_text_io, # use the bytes we already read
|
||||
file_name=file.filename or "",
|
||||
)
|
||||
text_file_id = str(uuid.uuid4())
|
||||
@ -760,13 +766,57 @@ def upload_files_for_chat(
|
||||
file_origin=FileOrigin.CHAT_UPLOAD,
|
||||
file_type="text/plain",
|
||||
)
|
||||
# for DOC type, just return this for the FileDescriptor
|
||||
# as we would always use this as the ID to attach to the
|
||||
# message
|
||||
# Return the text file as the "main" file descriptor for doc types
|
||||
file_info.append((text_file_id, file.filename, ChatFileType.PLAIN_TEXT))
|
||||
else:
|
||||
file_info.append((file_id, file.filename, file_type))
|
||||
|
||||
# 5) Create a user file for each uploaded file
|
||||
user_files = create_user_files([file], RECENT_DOCS_FOLDER_ID, user, db_session)
|
||||
for user_file in user_files:
|
||||
# 6) Create connector
|
||||
connector_base = ConnectorBase(
|
||||
name=f"UserFile-{int(time.time())}",
|
||||
source=DocumentSource.FILE,
|
||||
input_type=InputType.LOAD_STATE,
|
||||
connector_specific_config={
|
||||
"file_locations": [user_file.file_id],
|
||||
},
|
||||
refresh_freq=None,
|
||||
prune_freq=None,
|
||||
indexing_start=None,
|
||||
)
|
||||
connector = create_connector(
|
||||
db_session=db_session,
|
||||
connector_data=connector_base,
|
||||
)
|
||||
|
||||
# 7) Create credential
|
||||
credential_info = CredentialBase(
|
||||
credential_json={},
|
||||
admin_public=True,
|
||||
source=DocumentSource.FILE,
|
||||
curator_public=True,
|
||||
groups=[],
|
||||
name=f"UserFileCredential-{int(time.time())}",
|
||||
is_user_file=True,
|
||||
)
|
||||
credential = create_credential(credential_info, user, db_session)
|
||||
|
||||
# 8) Create connector credential pair
|
||||
cc_pair = add_credential_to_connector(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
connector_id=connector.id,
|
||||
credential_id=credential.id,
|
||||
cc_pair_name=f"UserFileCCPair-{int(time.time())}",
|
||||
access_type=AccessType.PRIVATE,
|
||||
auto_sync_options=None,
|
||||
groups=[],
|
||||
)
|
||||
user_file.cc_pair_id = cc_pair.data
|
||||
db_session.commit()
|
||||
|
||||
return {
|
||||
"files": [
|
||||
{"id": file_id, "type": file_type, "name": file_name}
|
||||
|
@ -92,6 +92,8 @@ class CreateChatMessageRequest(ChunkContext):
|
||||
message: str
|
||||
# Files that we should attach to this message
|
||||
file_descriptors: list[FileDescriptor]
|
||||
user_file_ids: list[int] = []
|
||||
user_folder_ids: list[int] = []
|
||||
|
||||
# If no prompt provided, uses the largest prompt of the chat session
|
||||
# but really this should be explicitly specified, only in the simplified APIs is this inferred
|
||||
@ -118,7 +120,7 @@ class CreateChatMessageRequest(ChunkContext):
|
||||
# this does persist in the chat thread details
|
||||
temperature_override: float | None = None
|
||||
|
||||
# allow user to specify an alternate assistnat
|
||||
# allow user to specify an alternate assistant
|
||||
alternate_assistant_id: int | None = None
|
||||
|
||||
# This takes the priority over the prompt_override
|
||||
@ -135,6 +137,8 @@ class CreateChatMessageRequest(ChunkContext):
|
||||
# https://platform.openai.com/docs/guides/structured-outputs/introduction
|
||||
structured_response_format: dict | None = None
|
||||
|
||||
force_user_file_search: bool = False
|
||||
|
||||
# If true, ignores most of the search options and uses pro search instead.
|
||||
# TODO: decide how many of the above options we want to pass through to pro search
|
||||
use_agentic_search: bool = False
|
||||
|
567
backend/onyx/server/user_documents/api.py
Normal file
@ -0,0 +1,567 @@
|
||||
import io
|
||||
import time
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
import sqlalchemy.exc
|
||||
from bs4 import BeautifulSoup
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import File
|
||||
from fastapi import Form
|
||||
from fastapi import HTTPException
|
||||
from fastapi import Query
|
||||
from fastapi import UploadFile
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.models import InputType
|
||||
from onyx.db.connector import create_connector
|
||||
from onyx.db.connector_credential_pair import add_credential_to_connector
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserFolder
|
||||
from onyx.db.user_documents import calculate_user_files_token_count
|
||||
from onyx.db.user_documents import create_user_file_with_indexing
|
||||
from onyx.db.user_documents import create_user_files
|
||||
from onyx.db.user_documents import get_user_file_indexing_status
|
||||
from onyx.db.user_documents import share_file_with_assistant
|
||||
from onyx.db.user_documents import share_folder_with_assistant
|
||||
from onyx.db.user_documents import unshare_file_with_assistant
|
||||
from onyx.db.user_documents import unshare_folder_with_assistant
|
||||
from onyx.file_processing.html_utils import web_html_cleanup
|
||||
from onyx.server.documents.connector import trigger_indexing_for_cc_pair
|
||||
from onyx.server.documents.models import ConnectorBase
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from onyx.server.user_documents.models import MessageResponse
|
||||
from onyx.server.user_documents.models import UserFileSnapshot
|
||||
from onyx.server.user_documents.models import UserFolderSnapshot
|
||||
from onyx.setup import setup_logger
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
router = APIRouter()
class FolderCreationRequest(BaseModel):
|
||||
name: str
|
||||
description: str
|
||||
|
||||
|
||||
@router.post("/user/folder")
|
||||
def create_folder(
|
||||
request: FolderCreationRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> UserFolderSnapshot:
|
||||
try:
|
||||
new_folder = UserFolder(
|
||||
user_id=user.id if user else None,
|
||||
name=request.name,
|
||||
description=request.description,
|
||||
)
|
||||
db_session.add(new_folder)
|
||||
db_session.commit()
|
||||
return UserFolderSnapshot.from_model(new_folder)
|
||||
except sqlalchemy.exc.DataError as e:
|
||||
if "StringDataRightTruncation" in str(e):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Folder name or description is too long. Please use a shorter name or description.",
|
||||
)
|
||||
raise
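
A minimal client-side sketch of exercising this endpoint; the base URL, route prefix, and auth cookie are assumptions for illustration, not part of this diff:

import requests

resp = requests.post(
    "http://localhost:8080/user/folder",  # hypothetical host and prefix
    json={"name": "Research", "description": "Papers and notes"},
    cookies={"fastapiusersauth": "<session-cookie>"},  # hypothetical auth cookie
)
resp.raise_for_status()
folder = resp.json()  # UserFolderSnapshot fields: id, name, description, files, ...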
@router.get(
|
||||
"/user/folder",
|
||||
)
|
||||
def get_folders(
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[UserFolderSnapshot]:
|
||||
user_id = user.id if user else None
|
||||
folders = db_session.query(UserFolder).filter(UserFolder.user_id == user_id).all()
|
||||
return [UserFolderSnapshot.from_model(folder) for folder in folders]
|
||||
|
||||
|
||||
@router.get("/user/folder/{folder_id}")
|
||||
def get_folder(
|
||||
folder_id: int,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> UserFolderSnapshot:
|
||||
user_id = user.id if user else None
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
|
||||
return UserFolderSnapshot.from_model(folder)
|
||||
|
||||
|
||||
RECENT_DOCS_FOLDER_ID = -1
|
||||
|
||||
|
||||
@router.post("/user/file/upload")
|
||||
def upload_user_files(
|
||||
files: List[UploadFile] = File(...),
|
||||
folder_id: int | None = Form(None),
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[UserFileSnapshot]:
|
||||
if folder_id == 0:
|
||||
folder_id = None
|
||||
|
||||
try:
|
||||
# Use our consolidated function that handles indexing properly
|
||||
user_files = create_user_file_with_indexing(
|
||||
files, folder_id or -1, user, db_session
|
||||
)
|
||||
|
||||
return [UserFileSnapshot.from_model(user_file) for user_file in user_files]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error uploading files: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to upload files: {str(e)}")
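
A hedged sketch of uploading a file through the new endpoint with multipart form data; host, auth, and the file itself are assumed for illustration:

import requests

with open("notes.pdf", "rb") as f:  # hypothetical local file
    resp = requests.post(
        "http://localhost:8080/user/file/upload",  # hypothetical host and prefix
        files=[("files", ("notes.pdf", f, "application/pdf"))],
        data={"folder_id": 0},  # 0 (or omitting it) falls back to the recent-docs folder
        cookies={"fastapiusersauth": "<session-cookie>"},
    )
resp.raise_for_status()
snapshots = resp.json()  # list of UserFileSnapshot dicts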
class FolderUpdateRequest(BaseModel):
|
||||
name: str | None = None
|
||||
description: str | None = None
|
||||
|
||||
|
||||
@router.put("/user/folder/{folder_id}")
|
||||
def update_folder(
|
||||
folder_id: int,
|
||||
request: FolderUpdateRequest,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> UserFolderSnapshot:
|
||||
user_id = user.id if user else None
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
if request.name:
|
||||
folder.name = request.name
|
||||
if request.description:
|
||||
folder.description = request.description
|
||||
db_session.commit()
|
||||
|
||||
return UserFolderSnapshot.from_model(folder)
|
||||
|
||||
|
||||
@router.delete("/user/folder/{folder_id}")
|
||||
def delete_folder(
|
||||
folder_id: int,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
db_session.delete(folder)
|
||||
db_session.commit()
|
||||
return MessageResponse(message="Folder deleted successfully")
|
||||
|
||||
|
||||
@router.delete("/user/file/{file_id}")
|
||||
def delete_file(
|
||||
file_id: int,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
db_session.delete(file)
|
||||
db_session.commit()
|
||||
return MessageResponse(message="File deleted successfully")
|
||||
|
||||
|
||||
class FileMoveRequest(BaseModel):
|
||||
new_folder_id: int | None
|
||||
|
||||
|
||||
@router.put("/user/file/{file_id}/move")
|
||||
def move_file(
|
||||
file_id: int,
|
||||
request: FileMoveRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> UserFileSnapshot:
|
||||
user_id = user.id if user else None
|
||||
file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
file.folder_id = request.new_folder_id
|
||||
db_session.commit()
|
||||
return UserFileSnapshot.from_model(file)
|
||||
|
||||
|
||||
@router.get("/user/file-system")
|
||||
def get_file_system(
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[UserFolderSnapshot]:
|
||||
user_id = user.id if user else None
|
||||
folders = db_session.query(UserFolder).filter(UserFolder.user_id == user_id).all()
|
||||
return [UserFolderSnapshot.from_model(folder) for folder in folders]
|
||||
|
||||
|
||||
@router.put("/user/file/{file_id}/rename")
|
||||
def rename_file(
|
||||
file_id: int,
|
||||
name: str,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> UserFileSnapshot:
|
||||
user_id = user.id if user else None
|
||||
file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
file.name = name
|
||||
db_session.commit()
|
||||
return UserFileSnapshot.from_model(file)
|
||||
|
||||
|
||||
class ShareRequest(BaseModel):
|
||||
assistant_id: int
|
||||
|
||||
|
||||
@router.post("/user/file/{file_id}/share")
|
||||
def share_file(
|
||||
file_id: int,
|
||||
request: ShareRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
share_file_with_assistant(file_id, request.assistant_id, db_session)
|
||||
return MessageResponse(message="File shared successfully with the assistant")
|
||||
|
||||
|
||||
@router.post("/user/file/{file_id}/unshare")
|
||||
def unshare_file(
|
||||
file_id: int,
|
||||
request: ShareRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
unshare_file_with_assistant(file_id, request.assistant_id, db_session)
|
||||
return MessageResponse(message="File unshared successfully from the assistant")
|
||||
|
||||
|
||||
@router.post("/user/folder/{folder_id}/share")
|
||||
def share_folder(
|
||||
folder_id: int,
|
||||
request: ShareRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
|
||||
share_folder_with_assistant(folder_id, request.assistant_id, db_session)
|
||||
return MessageResponse(
|
||||
message="Folder and its files shared successfully with the assistant"
|
||||
)
|
||||
|
||||
|
||||
@router.post("/user/folder/{folder_id}/unshare")
|
||||
def unshare_folder(
|
||||
folder_id: int,
|
||||
request: ShareRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
|
||||
unshare_folder_with_assistant(folder_id, request.assistant_id, db_session)
|
||||
return MessageResponse(
|
||||
message="Folder and its files unshared successfully from the assistant"
|
||||
)
|
||||
|
||||
|
||||
class CreateFileFromLinkRequest(BaseModel):
|
||||
url: str
|
||||
folder_id: int | None
|
||||
|
||||
|
||||
@router.post("/user/file/create-from-link")
|
||||
def create_file_from_link(
|
||||
request: CreateFileFromLinkRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> list[UserFileSnapshot]:
|
||||
try:
|
||||
response = requests.get(request.url)
|
||||
response.raise_for_status()
|
||||
content = response.text
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
parsed_html = web_html_cleanup(soup, mintlify_cleanup_enabled=False)
|
||||
|
||||
file_name = f"{parsed_html.title or 'Untitled'}.txt"
|
||||
file_content = parsed_html.cleaned_text.encode()
|
||||
|
||||
file = UploadFile(filename=file_name, file=io.BytesIO(file_content))
|
||||
user_files = create_user_files(
|
||||
[file], request.folder_id or -1, user, db_session, link_url=request.url
|
||||
)
|
||||
|
||||
# Create connector and credential (same as in upload_user_files)
|
||||
for user_file in user_files:
|
||||
connector_base = ConnectorBase(
|
||||
name=f"UserFile-{user_file.file_id}-{int(time.time())}",
|
||||
source=DocumentSource.FILE,
|
||||
input_type=InputType.LOAD_STATE,
|
||||
connector_specific_config={
|
||||
"file_locations": [user_file.file_id],
|
||||
},
|
||||
refresh_freq=None,
|
||||
prune_freq=None,
|
||||
indexing_start=None,
|
||||
)
|
||||
|
||||
connector = create_connector(
|
||||
db_session=db_session,
|
||||
connector_data=connector_base,
|
||||
)
|
||||
|
||||
credential_info = CredentialBase(
|
||||
credential_json={},
|
||||
admin_public=True,
|
||||
source=DocumentSource.FILE,
|
||||
curator_public=True,
|
||||
groups=[],
|
||||
name=f"UserFileCredential-{user_file.file_id}-{int(time.time())}",
|
||||
)
|
||||
credential = create_credential(credential_info, user, db_session)
|
||||
|
||||
cc_pair = add_credential_to_connector(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
connector_id=connector.id,
|
||||
credential_id=credential.id,
|
||||
cc_pair_name=f"UserFileCCPair-{int(time.time())}",
|
||||
access_type=AccessType.PRIVATE,
|
||||
auto_sync_options=None,
|
||||
groups=[],
|
||||
is_user_file=True,
|
||||
)
|
||||
user_file.cc_pair_id = cc_pair.data
|
||||
db_session.commit()
|
||||
|
||||
# Trigger immediate indexing with highest priority
|
||||
tenant_id = get_current_tenant_id()
|
||||
trigger_indexing_for_cc_pair(
|
||||
[], connector.id, False, tenant_id, db_session, is_user_file=True
|
||||
)
|
||||
|
||||
db_session.commit()
|
||||
return [UserFileSnapshot.from_model(user_file) for user_file in user_files]
|
||||
except requests.RequestException as e:
|
||||
raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/user/file/indexing-status")
|
||||
def get_files_indexing_status(
|
||||
file_ids: list[int] = Query(...),
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> dict[int, bool]:
|
||||
"""Get indexing status for multiple files"""
|
||||
return get_user_file_indexing_status(file_ids, db_session)
|
||||
|
||||
|
||||
@router.get("/user/file/token-estimate")
|
||||
def get_files_token_estimate(
|
||||
file_ids: list[int] = Query([]),
|
||||
folder_ids: list[int] = Query([]),
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> dict:
|
||||
"""Get token estimate for files and folders"""
|
||||
total_tokens = calculate_user_files_token_count(file_ids, folder_ids, db_session)
|
||||
return {"total_tokens": total_tokens}
class ReindexFileRequest(BaseModel):
|
||||
file_id: int
|
||||
|
||||
|
||||
@router.post("/user/file/reindex")
|
||||
def reindex_file(
|
||||
request: ReindexFileRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
user_id = user.id if user else None
|
||||
user_file_to_reindex = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == request.file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
|
||||
if not user_file_to_reindex:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
if not user_file_to_reindex.cc_pair_id:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="File does not have an associated connector-credential pair",
|
||||
)
|
||||
|
||||
# Get the connector id from the cc_pair
|
||||
cc_pair = (
|
||||
db_session.query(ConnectorCredentialPair)
|
||||
.filter_by(id=user_file_to_reindex.cc_pair_id)
|
||||
.first()
|
||||
)
|
||||
if not cc_pair:
|
||||
raise HTTPException(
|
||||
status_code=404, detail="Associated connector-credential pair not found"
|
||||
)
|
||||
|
||||
# Trigger immediate reindexing with highest priority
|
||||
tenant_id = get_current_tenant_id()
|
||||
# Update the cc_pair status to ACTIVE to ensure it's processed
|
||||
cc_pair.status = ConnectorCredentialPairStatus.ACTIVE
|
||||
db_session.commit()
|
||||
try:
|
||||
trigger_indexing_for_cc_pair(
|
||||
[], cc_pair.connector_id, True, tenant_id, db_session, is_user_file=True
|
||||
)
|
||||
return MessageResponse(
|
||||
message="File reindexing has been triggered successfully"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error triggering reindexing for file {request.file_id}: {str(e)}"
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to trigger reindexing: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
class BulkCleanupRequest(BaseModel):
|
||||
folder_id: int
|
||||
days_older_than: int | None = None
|
||||
|
||||
|
||||
@router.post("/user/file/bulk-cleanup")
|
||||
def bulk_cleanup_files(
|
||||
request: BulkCleanupRequest,
|
||||
user: User = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> MessageResponse:
|
||||
"""Bulk delete files older than specified days in a folder"""
|
||||
user_id = user.id if user else None
|
||||
|
||||
logger.info(
|
||||
f"Bulk cleanup request: folder_id={request.folder_id}, days_older_than={request.days_older_than}"
|
||||
)
|
||||
|
||||
# Check if folder exists
|
||||
if request.folder_id != RECENT_DOCS_FOLDER_ID:
|
||||
folder = (
|
||||
db_session.query(UserFolder)
|
||||
.filter(UserFolder.id == request.folder_id, UserFolder.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
if not folder:
|
||||
raise HTTPException(status_code=404, detail="Folder not found")
|
||||
|
||||
filter_criteria = [UserFile.user_id == user_id]
|
||||
|
||||
# Filter by folder
|
||||
if request.folder_id != -2: # -2 means all folders
|
||||
filter_criteria.append(UserFile.folder_id == request.folder_id)
|
||||
|
||||
# Filter by date if days_older_than is provided
|
||||
if request.days_older_than is not None:
|
||||
cutoff_date = datetime.utcnow() - timedelta(days=request.days_older_than)
|
||||
logger.info(f"Filtering files older than {cutoff_date} (UTC)")
|
||||
filter_criteria.append(UserFile.created_at < cutoff_date)
|
||||
|
||||
# Get all files matching the criteria
|
||||
files_to_delete = db_session.query(UserFile).filter(*filter_criteria).all()
|
||||
|
||||
logger.info(f"Found {len(files_to_delete)} files to delete")
|
||||
|
||||
# Delete files
|
||||
delete_count = 0
|
||||
for file in files_to_delete:
|
||||
logger.debug(
|
||||
f"Deleting file: id={file.id}, name={file.name}, created_at={file.created_at}"
|
||||
)
|
||||
db_session.delete(file)
|
||||
delete_count += 1
|
||||
|
||||
db_session.commit()
|
||||
|
||||
return MessageResponse(message=f"Successfully deleted {delete_count} files")
|
101
backend/onyx/server/user_documents/models.py
Normal file
@ -0,0 +1,101 @@
|
||||
from datetime import datetime
|
||||
from enum import Enum as PyEnum
|
||||
from typing import List
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.models import UserFolder
|
||||
|
||||
|
||||
class UserFileStatus(str, PyEnum):
|
||||
FAILED = "FAILED"
|
||||
INDEXING = "INDEXING"
|
||||
INDEXED = "INDEXED"
|
||||
REINDEXING = "REINDEXING"
|
||||
|
||||
|
||||
class UserFileSnapshot(BaseModel):
|
||||
id: int
|
||||
name: str
|
||||
document_id: str
|
||||
folder_id: int | None = None
|
||||
user_id: UUID | None
|
||||
file_id: str
|
||||
created_at: datetime
|
||||
assistant_ids: List[int] = [] # List of assistant IDs
|
||||
token_count: int | None
|
||||
indexed: bool
|
||||
link_url: str | None
|
||||
status: UserFileStatus
|
||||
|
||||
@classmethod
|
||||
def from_model(cls, model: UserFile) -> "UserFileSnapshot":
|
||||
return cls(
|
||||
id=model.id,
|
||||
name=model.name[:-4]
|
||||
if model.link_url and model.name.endswith(".txt")
|
||||
else model.name,
|
||||
folder_id=model.folder_id,
|
||||
document_id=model.document_id,
|
||||
user_id=model.user_id,
|
||||
file_id=model.file_id,
|
||||
created_at=model.created_at,
|
||||
assistant_ids=[assistant.id for assistant in model.assistants],
|
||||
token_count=model.token_count,
|
||||
status=(
|
||||
UserFileStatus.FAILED
|
||||
if model.cc_pair
|
||||
and len(model.cc_pair.index_attempts) > 0
|
||||
and model.cc_pair.last_successful_index_time is None
|
||||
and model.cc_pair.status == ConnectorCredentialPairStatus.PAUSED
|
||||
else UserFileStatus.INDEXED
|
||||
if model.cc_pair
|
||||
and model.cc_pair.last_successful_index_time is not None
|
||||
else UserFileStatus.REINDEXING
|
||||
if model.cc_pair
|
||||
and len(model.cc_pair.index_attempts) > 1
|
||||
and model.cc_pair.last_successful_index_time is None
|
||||
and model.cc_pair.status != ConnectorCredentialPairStatus.PAUSED
|
||||
else UserFileStatus.INDEXING
|
||||
),
|
||||
indexed=model.cc_pair.last_successful_index_time is not None
|
||||
if model.cc_pair
|
||||
else False,
|
||||
link_url=model.link_url,
|
||||
)
|
||||
|
||||
|
||||
class UserFolderSnapshot(BaseModel):
|
||||
id: int
|
||||
name: str
|
||||
description: str
|
||||
files: List[UserFileSnapshot]
|
||||
created_at: datetime
|
||||
user_id: UUID | None
|
||||
assistant_ids: List[int] = [] # List of assistant IDs
|
||||
token_count: int | None
|
||||
|
||||
@classmethod
|
||||
def from_model(cls, model: UserFolder) -> "UserFolderSnapshot":
|
||||
return cls(
|
||||
id=model.id,
|
||||
name=model.name,
|
||||
description=model.description,
|
||||
files=[UserFileSnapshot.from_model(file) for file in model.files],
|
||||
created_at=model.created_at,
|
||||
user_id=model.user_id,
|
||||
assistant_ids=[assistant.id for assistant in model.assistants],
|
||||
token_count=sum(file.token_count or 0 for file in model.files) or None,
|
||||
)
|
||||
|
||||
|
||||
class MessageResponse(BaseModel):
|
||||
message: str
|
||||
|
||||
|
||||
class FileSystemResponse(BaseModel):
|
||||
folders: list[UserFolderSnapshot]
|
||||
files: list[UserFileSnapshot]
|
@ -12,6 +12,7 @@ class ForceUseTool(BaseModel):
|
||||
force_use: bool
|
||||
tool_name: str
|
||||
args: dict[str, Any] | None = None
|
||||
override_kwargs: Any = None # This will hold tool-specific override kwargs
|
||||
|
||||
def build_openai_tool_choice_dict(self) -> dict[str, Any]:
|
||||
"""Build dict in the format that OpenAI expects which tells them to use this tool."""
|
||||
|
@ -70,6 +70,11 @@ class SearchToolOverrideKwargs(BaseModel):
|
||||
precomputed_query_embedding: Embedding | None = None
|
||||
precomputed_is_keyword: bool | None = None
|
||||
precomputed_keywords: list[str] | None = None
|
||||
user_file_ids: list[int] | None = None
|
||||
user_folder_ids: list[int] | None = None
|
||||
ordering_only: bool | None = (
|
||||
None # Flag for fast path when search is only needed for ordering
|
||||
)
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
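
A minimal sketch of how a caller might populate the new fields to request the ordering-only fast path over user files; the IDs are placeholders and the import is assumed from the module shown in this diff:

override_kwargs = SearchToolOverrideKwargs(  # assumed imported from the module above
    user_file_ids=[12, 34],   # placeholder user file IDs
    user_folder_ids=[5],      # placeholder folder ID
    ordering_only=True,       # take the fast path: order documents, skip relevance evaluation
)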
@ -138,10 +138,12 @@ def construct_tools(
|
||||
user: User | None,
|
||||
llm: LLM,
|
||||
fast_llm: LLM,
|
||||
use_file_search: bool,
|
||||
search_tool_config: SearchToolConfig | None = None,
|
||||
internet_search_tool_config: InternetSearchToolConfig | None = None,
|
||||
image_generation_tool_config: ImageGenerationToolConfig | None = None,
|
||||
custom_tool_config: CustomToolConfig | None = None,
|
||||
user_knowledge_present: bool = False,
|
||||
) -> dict[int, list[Tool]]:
|
||||
"""Constructs tools based on persona configuration and available APIs"""
|
||||
tool_dict: dict[int, list[Tool]] = {}
|
||||
@ -158,7 +160,7 @@ def construct_tools(
|
||||
)
|
||||
|
||||
# Handle Search Tool
|
||||
if tool_cls.__name__ == SearchTool.__name__:
|
||||
if tool_cls.__name__ == SearchTool.__name__ and not user_knowledge_present:
|
||||
if not search_tool_config:
|
||||
search_tool_config = SearchToolConfig()
|
||||
|
||||
@ -251,6 +253,33 @@ def construct_tools(
|
||||
for tool_list in tool_dict.values():
|
||||
tools.extend(tool_list)
|
||||
|
||||
if use_file_search:
|
||||
search_tool_config = SearchToolConfig()
|
||||
|
||||
search_tool = SearchTool(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
persona=persona,
|
||||
retrieval_options=search_tool_config.retrieval_options,
|
||||
prompt_config=prompt_config,
|
||||
llm=llm,
|
||||
fast_llm=fast_llm,
|
||||
pruning_config=search_tool_config.document_pruning_config,
|
||||
answer_style_config=search_tool_config.answer_style_config,
|
||||
selected_sections=search_tool_config.selected_sections,
|
||||
chunks_above=search_tool_config.chunks_above,
|
||||
chunks_below=search_tool_config.chunks_below,
|
||||
full_doc=search_tool_config.full_doc,
|
||||
evaluation_type=(
|
||||
LLMEvaluationType.BASIC
|
||||
if persona.llm_relevance_filter
|
||||
else LLMEvaluationType.SKIP
|
||||
),
|
||||
rerank_settings=search_tool_config.rerank_settings,
|
||||
bypass_acl=search_tool_config.bypass_acl,
|
||||
)
|
||||
tool_dict[1] = [search_tool]
|
||||
|
||||
# factor in tool definition size when pruning
|
||||
if search_tool_config:
|
||||
search_tool_config.document_pruning_config.tool_num_tokens = (
|
||||
|
@ -64,7 +64,7 @@ logger = setup_logger()
|
||||
CUSTOM_TOOL_RESPONSE_ID = "custom_tool_response"
|
||||
|
||||
|
||||
class CustomToolFileResponse(BaseModel):
|
||||
class CustomToolUserFileSnapshot(BaseModel):
|
||||
file_ids: List[str] # References to saved images or CSVs
|
||||
|
||||
|
||||
@ -131,7 +131,7 @@ class CustomTool(BaseTool):
|
||||
response = cast(CustomToolCallSummary, args[0].response)
|
||||
|
||||
if response.response_type == "image" or response.response_type == "csv":
|
||||
image_response = cast(CustomToolFileResponse, response.tool_result)
|
||||
image_response = cast(CustomToolUserFileSnapshot, response.tool_result)
|
||||
return json.dumps({"file_ids": image_response.file_ids})
|
||||
|
||||
# For JSON or other responses, return as-is
|
||||
@ -267,14 +267,14 @@ class CustomTool(BaseTool):
|
||||
file_ids = self._save_and_get_file_references(
|
||||
response.content, content_type
|
||||
)
|
||||
tool_result = CustomToolFileResponse(file_ids=file_ids)
|
||||
tool_result = CustomToolUserFileSnapshot(file_ids=file_ids)
|
||||
response_type = "csv"
|
||||
|
||||
elif "image/" in content_type:
|
||||
file_ids = self._save_and_get_file_references(
|
||||
response.content, content_type
|
||||
)
|
||||
tool_result = CustomToolFileResponse(file_ids=file_ids)
|
||||
tool_result = CustomToolUserFileSnapshot(file_ids=file_ids)
|
||||
response_type = "image"
|
||||
|
||||
else:
|
||||
@ -358,7 +358,7 @@ class CustomTool(BaseTool):
|
||||
|
||||
def final_result(self, *args: ToolResponse) -> JSON_ro:
|
||||
response = cast(CustomToolCallSummary, args[0].response)
|
||||
if isinstance(response.tool_result, CustomToolFileResponse):
|
||||
if isinstance(response.tool_result, CustomToolUserFileSnapshot):
|
||||
return response.tool_result.model_dump()
|
||||
return response.tool_result
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
@ -23,6 +24,8 @@ from onyx.configs.chat_configs import CONTEXT_CHUNKS_BELOW
|
||||
from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
|
||||
from onyx.context.search.enums import LLMEvaluationType
|
||||
from onyx.context.search.enums import QueryFlow
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import RerankingDetails
|
||||
@ -286,6 +289,9 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
alternate_db_session = None
|
||||
retrieved_sections_callback = None
|
||||
skip_query_analysis = False
|
||||
user_file_ids = None
|
||||
user_folder_ids = None
|
||||
ordering_only = False
|
||||
if override_kwargs:
|
||||
force_no_rerank = use_alt_not_None(override_kwargs.force_no_rerank, False)
|
||||
alternate_db_session = override_kwargs.alternate_db_session
|
||||
@ -293,13 +299,41 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
skip_query_analysis = use_alt_not_None(
|
||||
override_kwargs.skip_query_analysis, False
|
||||
)
|
||||
precomputed_query_embedding = override_kwargs.precomputed_query_embedding
|
||||
precomputed_is_keyword = override_kwargs.precomputed_is_keyword
|
||||
precomputed_keywords = override_kwargs.precomputed_keywords
|
||||
user_file_ids = override_kwargs.user_file_ids
|
||||
user_folder_ids = override_kwargs.user_folder_ids
|
||||
ordering_only = use_alt_not_None(override_kwargs.ordering_only, False)
|
||||
|
||||
# Fast path for ordering-only search
|
||||
if ordering_only:
|
||||
yield from self._run_ordering_only_search(
|
||||
query, user_file_ids, user_folder_ids
|
||||
)
|
||||
return
|
||||
|
||||
if self.selected_sections:
|
||||
yield from self._build_response_for_specified_sections(query)
|
||||
return
|
||||
|
||||
# Create a copy of the retrieval options with user_file_ids if provided
|
||||
retrieval_options = self.retrieval_options
|
||||
if (user_file_ids or user_folder_ids) and retrieval_options:
|
||||
# Create a copy to avoid modifying the original
|
||||
filters = (
|
||||
retrieval_options.filters.model_copy()
|
||||
if retrieval_options.filters
|
||||
else BaseFilters()
|
||||
)
|
||||
filters.user_file_ids = user_file_ids
|
||||
retrieval_options = retrieval_options.model_copy(
|
||||
update={"filters": filters}
|
||||
)
|
||||
elif user_file_ids or user_folder_ids:
|
||||
# Create new retrieval options with user_file_ids
|
||||
filters = BaseFilters(
|
||||
user_file_ids=user_file_ids, user_folder_ids=user_folder_ids
|
||||
)
|
||||
retrieval_options = RetrievalDetails(filters=filters)
|
||||
|
||||
search_pipeline = SearchPipeline(
|
||||
search_request=SearchRequest(
|
||||
query=query,
|
||||
@ -307,13 +341,11 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
if force_no_rerank
|
||||
else self.evaluation_type,
|
||||
human_selected_filters=(
|
||||
self.retrieval_options.filters if self.retrieval_options else None
|
||||
retrieval_options.filters if retrieval_options else None
|
||||
),
|
||||
persona=self.persona,
|
||||
offset=(
|
||||
self.retrieval_options.offset if self.retrieval_options else None
|
||||
),
|
||||
limit=self.retrieval_options.limit if self.retrieval_options else None,
|
||||
offset=(retrieval_options.offset if retrieval_options else None),
|
||||
limit=retrieval_options.limit if retrieval_options else None,
|
||||
rerank_settings=RerankingDetails(
|
||||
rerank_model_name=None,
|
||||
rerank_api_url=None,
|
||||
@ -328,8 +360,8 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
chunks_below=self.chunks_below,
|
||||
full_doc=self.full_doc,
|
||||
enable_auto_detect_filters=(
|
||||
self.retrieval_options.enable_auto_detect_filters
|
||||
if self.retrieval_options
|
||||
retrieval_options.enable_auto_detect_filters
|
||||
if retrieval_options
|
||||
else None
|
||||
),
|
||||
precomputed_query_embedding=precomputed_query_embedding,
|
||||
@ -387,6 +419,104 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
prompt_config=self.prompt_config,
|
||||
)
|
||||
|
||||
def _run_ordering_only_search(
|
||||
self,
|
||||
query: str,
|
||||
user_file_ids: list[int] | None,
|
||||
user_folder_ids: list[int] | None,
|
||||
) -> Generator[ToolResponse, None, None]:
|
||||
"""Optimized search that only retrieves document order with minimal processing."""
|
||||
start_time = time.time()
|
||||
|
||||
logger.info("Fast path: Starting optimized ordering-only search")
|
||||
|
||||
# Create temporary search pipeline for optimized retrieval
|
||||
search_pipeline = SearchPipeline(
|
||||
search_request=SearchRequest(
|
||||
query=query,
|
||||
evaluation_type=LLMEvaluationType.SKIP, # Force skip evaluation
|
||||
persona=self.persona,
|
||||
# Minimal configuration needed
|
||||
chunks_above=0,
|
||||
chunks_below=0,
|
||||
),
|
||||
user=self.user,
|
||||
llm=self.llm,
|
||||
fast_llm=self.fast_llm,
|
||||
skip_query_analysis=True, # Skip unnecessary analysis
|
||||
db_session=self.db_session,
|
||||
bypass_acl=self.bypass_acl,
|
||||
prompt_config=self.prompt_config,
|
||||
)
|
||||
|
||||
# Log what we're doing
|
||||
logger.info(
|
||||
f"Fast path: Using {len(user_file_ids or [])} files and {len(user_folder_ids or [])} folders"
|
||||
)
|
||||
|
||||
# Get chunks using the optimized method in SearchPipeline
|
||||
retrieval_start = time.time()
|
||||
retrieved_chunks = search_pipeline.get_ordering_only_chunks(
|
||||
query=query, user_file_ids=user_file_ids, user_folder_ids=user_folder_ids
|
||||
)
|
||||
retrieval_time = time.time() - retrieval_start
|
||||
|
||||
logger.info(
|
||||
f"Fast path: Retrieved {len(retrieved_chunks)} chunks in {retrieval_time:.2f}s"
|
||||
)
|
||||
|
||||
# Convert chunks to minimal sections (we don't need full content)
|
||||
minimal_sections = []
|
||||
for chunk in retrieved_chunks:
|
||||
# Create a minimal section with just center_chunk
|
||||
minimal_section = InferenceSection(
|
||||
center_chunk=chunk,
|
||||
chunks=[chunk],
|
||||
combined_content=chunk.content, # Use the chunk content as combined content
|
||||
)
|
||||
minimal_sections.append(minimal_section)
|
||||
|
||||
# Log document IDs found for debugging
|
||||
doc_ids = [chunk.document_id for chunk in retrieved_chunks]
|
||||
logger.info(
|
||||
f"Fast path: Document IDs in order: {doc_ids[:5]}{'...' if len(doc_ids) > 5 else ''}"
|
||||
)
|
||||
|
||||
# Yield just the required responses for document ordering
|
||||
yield ToolResponse(
|
||||
id=SEARCH_RESPONSE_SUMMARY_ID,
|
||||
response=SearchResponseSummary(
|
||||
rephrased_query=query,
|
||||
top_sections=minimal_sections,
|
||||
predicted_flow=QueryFlow.QUESTION_ANSWER,
|
||||
predicted_search=SearchType.SEMANTIC,
|
||||
final_filters=IndexFilters(
|
||||
user_file_ids=user_file_ids or [],
|
||||
user_folder_ids=user_folder_ids or [],
|
||||
access_control_list=None,
|
||||
),
|
||||
recency_bias_multiplier=1.0,
|
||||
),
|
||||
)
|
||||
|
||||
# For fast path, don't trigger any LLM evaluation for relevance
|
||||
logger.info(
|
||||
"Fast path: Skipping section relevance evaluation to optimize performance"
|
||||
)
|
||||
yield ToolResponse(
|
||||
id=SECTION_RELEVANCE_LIST_ID,
|
||||
response=None,
|
||||
)
|
||||
|
||||
# We need to yield this for the caller to extract document order
|
||||
minimal_docs = [
|
||||
llm_doc_from_inference_section(section) for section in minimal_sections
|
||||
]
|
||||
yield ToolResponse(id=FINAL_CONTEXT_DOCUMENTS_ID, response=minimal_docs)
|
||||
|
||||
total_time = time.time() - start_time
|
||||
logger.info(f"Fast path: Completed ordering-only search in {total_time:.2f}s")
# Allows yielding the same responses as a SearchTool without being a SearchTool.
|
||||
# SearchTool passed in to allow for access to SearchTool properties.
|
||||
@ -405,6 +535,10 @@ def yield_search_responses(
|
||||
get_section_relevance: Callable[[], list[SectionRelevancePiece] | None],
|
||||
search_tool: SearchTool,
|
||||
) -> Generator[ToolResponse, None, None]:
|
||||
# Get the search query to check if we're in ordering-only mode
|
||||
# We can infer this from the reranked_sections not containing any relevance scoring
|
||||
is_ordering_only = search_tool.evaluation_type == LLMEvaluationType.SKIP
|
||||
|
||||
yield ToolResponse(
|
||||
id=SEARCH_RESPONSE_SUMMARY_ID,
|
||||
response=SearchResponseSummary(
|
||||
@ -417,25 +551,48 @@ def yield_search_responses(
|
||||
),
|
||||
)
|
||||
|
||||
section_relevance = get_section_relevance()
|
||||
yield ToolResponse(
|
||||
id=SECTION_RELEVANCE_LIST_ID,
|
||||
response=section_relevance,
|
||||
)
|
||||
section_relevance: list[SectionRelevancePiece] | None = None
|
||||
|
||||
# Skip section relevance in ordering-only mode
|
||||
if is_ordering_only:
|
||||
logger.info(
|
||||
"Fast path: Skipping section relevance evaluation in yield_search_responses"
|
||||
)
|
||||
yield ToolResponse(
|
||||
id=SECTION_RELEVANCE_LIST_ID,
|
||||
response=None,
|
||||
)
|
||||
else:
|
||||
section_relevance = get_section_relevance()
|
||||
yield ToolResponse(
|
||||
id=SECTION_RELEVANCE_LIST_ID,
|
||||
response=section_relevance,
|
||||
)
|
||||
|
||||
final_context_sections = get_final_context_sections()
|
||||
pruned_sections = prune_sections(
|
||||
sections=final_context_sections,
|
||||
section_relevance_list=section_relevance_list_impl(
|
||||
section_relevance, final_context_sections
|
||||
),
|
||||
prompt_config=search_tool.prompt_config,
|
||||
llm_config=search_tool.llm.config,
|
||||
question=query,
|
||||
contextual_pruning_config=search_tool.contextual_pruning_config,
|
||||
)
|
||||
|
||||
llm_docs = [llm_doc_from_inference_section(section) for section in pruned_sections]
|
||||
# Skip pruning sections in ordering-only mode
|
||||
if is_ordering_only:
|
||||
logger.info("Fast path: Skipping section pruning in ordering-only mode")
|
||||
llm_docs = [
|
||||
llm_doc_from_inference_section(section)
|
||||
for section in final_context_sections
|
||||
]
|
||||
else:
|
||||
# Use the section_relevance we already computed above
|
||||
pruned_sections = prune_sections(
|
||||
sections=final_context_sections,
|
||||
section_relevance_list=section_relevance_list_impl(
|
||||
section_relevance, final_context_sections
|
||||
),
|
||||
prompt_config=search_tool.prompt_config,
|
||||
llm_config=search_tool.llm.config,
|
||||
question=query,
|
||||
contextual_pruning_config=search_tool.contextual_pruning_config,
|
||||
)
|
||||
llm_docs = [
|
||||
llm_doc_from_inference_section(section) for section in pruned_sections
|
||||
]
|
||||
|
||||
yield ToolResponse(id=FINAL_CONTEXT_DOCUMENTS_ID, response=llm_docs)
|
||||
|
||||
|
@ -5,17 +5,19 @@ Usage:
|
||||
python vespa_debug_tool.py --action <action> [options]
|
||||
|
||||
Actions:
|
||||
config : Print Vespa configuration
|
||||
connect : Check Vespa connectivity
|
||||
list_docs : List documents
|
||||
search : Search documents
|
||||
update : Update a document
|
||||
delete : Delete a document
|
||||
get_acls : Get document ACLs
|
||||
config : Print Vespa configuration
|
||||
connect : Check Vespa connectivity
|
||||
list_docs : List documents
|
||||
list_connector : List documents for a specific connector-credential pair
|
||||
search : Search documents
|
||||
update : Update a document
|
||||
delete : Delete a document
|
||||
get_acls : Get document ACLs
|
||||
|
||||
Options:
|
||||
--tenant-id : Tenant ID
|
||||
--connector-id : Connector ID
|
||||
--cc-pair-id : Connector-Credential Pair ID
|
||||
--n : Number of documents (default 10)
|
||||
--query : Search query
|
||||
--doc-id : Document ID
|
||||
@ -23,6 +25,7 @@ Options:
|
||||
|
||||
Example:
|
||||
python vespa_debug_tool.py --action list_docs --tenant-id my_tenant --connector-id 1 --n 5
|
||||
python vespa_debug_tool.py --action list_connector --tenant-id my_tenant --cc-pair-id 1 --n 5
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
@ -59,7 +62,6 @@ from onyx.document_index.vespa_constants import HIDDEN
|
||||
from onyx.document_index.vespa_constants import METADATA_LIST
|
||||
from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import VESPA_APP_CONTAINER_URL
|
||||
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
|
||||
from onyx.utils.logger import setup_logger
|
||||
@ -108,8 +110,8 @@ def build_vespa_filters(
|
||||
if not include_hidden:
|
||||
filter_str += f"AND !({HIDDEN}=true) "
|
||||
|
||||
if filters.tenant_id and MULTI_TENANT:
|
||||
filter_str += f'AND ({TENANT_ID} contains "{filters.tenant_id}") '
|
||||
# if filters.tenant_id and MULTI_TENANT:
|
||||
# filter_str += f'AND ({TENANT_ID} contains "{filters.tenant_id}") '
|
||||
|
||||
if filters.access_control_list is not None:
|
||||
acl_str = _build_or_filters(ACCESS_CONTROL_LIST, filters.access_control_list)
|
||||
@ -269,8 +271,8 @@ def search_for_document(
|
||||
if document_id is not None:
|
||||
conditions.append(f'document_id contains "{document_id}"')
|
||||
|
||||
if tenant_id is not None:
|
||||
conditions.append(f'tenant_id contains "{tenant_id}"')
|
||||
# if tenant_id is not None:
|
||||
# conditions.append(f'tenant_id contains "{tenant_id}"')
|
||||
|
||||
if conditions:
|
||||
yql_query += " where " + " and ".join(conditions)
|
||||
@ -336,8 +338,8 @@ def list_documents(n: int = 10, tenant_id: Optional[str] = None) -> None:
|
||||
# List documents from any source, filtered by tenant if provided.
|
||||
logger.info(f"Listing up to {n} documents for tenant={tenant_id or 'ALL'}")
|
||||
yql = "select * from sources * where true"
|
||||
if tenant_id:
|
||||
yql += f" and tenant_id contains '{tenant_id}'"
|
||||
# if tenant_id:
|
||||
# yql += f" and tenant_id contains '{tenant_id}'"
|
||||
documents = query_vespa(yql, tenant_id=tenant_id, limit=n)
|
||||
print(f"Total documents found: {len(documents)}")
|
||||
logger.info(f"Total documents found: {len(documents)}")
|
||||
@ -444,12 +446,15 @@ def get_document_acls(
|
||||
response = vespa_client.get(document_url)
|
||||
if response.status_code == 200:
|
||||
fields = response.json().get("fields", {})
|
||||
|
||||
document_id = fields.get("document_id") or fields.get(
|
||||
"documentid", "Unknown"
|
||||
)
|
||||
acls = fields.get("access_control_list", {})
|
||||
title = fields.get("title", "")
|
||||
source_type = fields.get("source_type", "")
|
||||
doc_sets = fields.get("document_sets", [])
|
||||
user_file = fields.get("user_file", None)
|
||||
source_links_raw = fields.get("source_links", "{}")
|
||||
try:
|
||||
source_links = json.loads(source_links_raw)
|
||||
@ -462,6 +467,8 @@ def get_document_acls(
|
||||
print(f"Source Links: {source_links}")
|
||||
print(f"Title: {title}")
|
||||
print(f"Source Type: {source_type}")
|
||||
print(f"Document Sets: {doc_sets}")
|
||||
print(f"User File: {user_file}")
|
||||
if MULTI_TENANT:
|
||||
print(f"Tenant ID: {fields.get('tenant_id', 'N/A')}")
|
||||
print("-" * 80)
|
||||
@ -576,6 +583,90 @@ class VespaDebugging:
        # List documents for a tenant.
        list_documents(n, self.tenant_id)

    def list_connector(self, cc_pair_id: int, n: int = 10) -> None:
        # List documents for a specific connector-credential pair in the tenant
        logger.info(
            f"Listing documents for tenant={self.tenant_id}, cc_pair_id={cc_pair_id}"
        )

        # Get document IDs for this connector-credential pair
        with get_session_with_tenant(tenant_id=self.tenant_id) as session:
            # First get the connector_id from the cc_pair_id
            cc_pair = (
                session.query(ConnectorCredentialPair)
                .filter(ConnectorCredentialPair.id == cc_pair_id)
                .first()
            )

            if not cc_pair:
                print(f"No connector-credential pair found with ID {cc_pair_id}")
                return

            connector_id = cc_pair.connector_id

            # Now get document IDs for this connector
            doc_ids_data = (
                session.query(DocumentByConnectorCredentialPair.id)
                .filter(DocumentByConnectorCredentialPair.connector_id == connector_id)
                .distinct()
                .all()
            )

            doc_ids = [doc_id[0] for doc_id in doc_ids_data]

        if not doc_ids:
            print(f"No documents found for connector-credential pair ID {cc_pair_id}")
            return

        print(
            f"Found {len(doc_ids)} documents for connector-credential pair ID {cc_pair_id}"
        )

        # Limit to the first n document IDs
        target_doc_ids = doc_ids[:n]
        print(f"Retrieving details for first {len(target_doc_ids)} documents")
        # Search for each document in Vespa
        for doc_id in target_doc_ids:
            docs = search_for_document(self.index_name, doc_id, self.tenant_id)
            if not docs:
                print(f"No chunks found in Vespa for document ID: {doc_id}")
                continue

            print(f"Document ID: {doc_id}")
            print(f"Found {len(docs)} chunks in Vespa")

            # Print each chunk with all fields except embeddings
            for i, doc in enumerate(docs):
                print(f"  Chunk {i+1}:")
                fields = doc.get("fields", {})

                # Print all fields except embeddings
                for field_name, field_value in sorted(fields.items()):
                    # Skip embedding fields
                    if "embedding" in field_name:
                        continue

                    # Format the output based on field type
                    if isinstance(field_value, dict) or isinstance(field_value, list):
                        # Truncate dictionaries and lists
                        truncated = (
                            str(field_value)[:50] + "..."
                            if len(str(field_value)) > 50
                            else str(field_value)
                        )
                        print(f"  {field_name}: {truncated}")
                    else:
                        # Truncate strings and other values
                        str_value = str(field_value)
                        truncated = (
                            str_value[:50] + "..." if len(str_value) > 50 else str_value
                        )
                        print(f"  {field_name}: {truncated}")

                print("-" * 40)  # Separator between chunks

            print("=" * 80)  # Separator between documents

    def compare_chunk_count(self, document_id: str) -> tuple[int, int]:
        docs = search_for_document(self.index_name, document_id, max_hits=None)
        number_of_chunks_we_think_exist = get_number_of_chunks_we_think_exist(
@ -770,6 +861,7 @@ def main() -> None:
            "config",
            "connect",
            "list_docs",
            "list_connector",
            "search",
            "update",
            "delete",
@ -781,6 +873,7 @@ def main() -> None:
    )
    parser.add_argument("--tenant-id", help="Tenant ID")
    parser.add_argument("--connector-id", type=int, help="Connector ID")
    parser.add_argument("--cc-pair-id", type=int, help="Connector-Credential Pair ID")
    parser.add_argument(
        "--n", type=int, default=10, help="Number of documents to retrieve"
    )
@ -809,6 +902,10 @@ def main() -> None:
        vespa_debug.check_connectivity()
    elif args.action == "list_docs":
        vespa_debug.list_documents(args.n)
    elif args.action == "list_connector":
        if args.cc_pair_id is None:
            parser.error("--cc-pair-id is required for list_connector action")
        vespa_debug.list_connector(args.cc_pair_id, args.n)
    elif args.action == "search":
        if not args.query or args.connector_id is None:
            parser.error("--query and --connector-id are required for search action")
@ -825,9 +922,9 @@ def main() -> None:
            parser.error("--doc-id and --connector-id are required for delete action")
        vespa_debug.delete_document(args.connector_id, args.doc_id)
    elif args.action == "get_acls":
        if args.connector_id is None:
            parser.error("--connector-id is required for get_acls action")
        vespa_debug.acls(args.connector_id, args.n)
        if args.cc_pair_id is None:
            parser.error("--cc-pair-id is required for get_acls action")
        vespa_debug.acls(args.cc_pair_id, args.n)


if __name__ == "__main__":
@ -72,6 +72,19 @@ def run_jobs() -> None:
        "--queues=connector_indexing",
    ]

    cmd_worker_user_files_indexing = [
        "celery",
        "-A",
        "onyx.background.celery.versioned_apps.indexing",
        "worker",
        "--pool=threads",
        "--concurrency=1",
        "--prefetch-multiplier=1",
        "--loglevel=INFO",
        "--hostname=user_files_indexing@%n",
        "--queues=user_files_indexing",
    ]

    cmd_worker_monitoring = [
        "celery",
        "-A",
@ -110,6 +123,13 @@ def run_jobs() -> None:
        cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    )

    worker_user_files_indexing_process = subprocess.Popen(
        cmd_worker_user_files_indexing,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    worker_monitoring_process = subprocess.Popen(
        cmd_worker_monitoring,
        stdout=subprocess.PIPE,
@ -134,6 +154,10 @@ def run_jobs() -> None:
    worker_indexing_thread = threading.Thread(
        target=monitor_process, args=("INDEX", worker_indexing_process)
    )
    worker_user_files_indexing_thread = threading.Thread(
        target=monitor_process,
        args=("USER_FILES_INDEX", worker_user_files_indexing_process),
    )
    worker_monitoring_thread = threading.Thread(
        target=monitor_process, args=("MONITORING", worker_monitoring_process)
    )
@ -143,6 +167,7 @@ def run_jobs() -> None:
    worker_light_thread.start()
    worker_heavy_thread.start()
    worker_indexing_thread.start()
    worker_user_files_indexing_thread.start()
    worker_monitoring_thread.start()
    beat_thread.start()

@ -150,6 +175,7 @@ def run_jobs() -> None:
    worker_light_thread.join()
    worker_heavy_thread.join()
    worker_indexing_thread.join()
    worker_user_files_indexing_thread.join()
    worker_monitoring_thread.join()
    beat_thread.join()

@ -93,6 +93,8 @@ def generate_dummy_chunk(

    return DocMetadataAwareIndexChunk.from_index_chunk(
        index_chunk=chunk,
        user_file=None,
        user_folder=None,
        access=DocumentAccess.build(
            user_emails=user_emails,
            user_groups=user_groups,
@ -65,6 +65,18 @@ autorestart=true
startsecs=10
stopasgroup=true

[program:celery_worker_user_files_indexing]
command=celery -A onyx.background.celery.versioned_apps.indexing worker
    --loglevel=INFO
    --hostname=user_files_indexing@%%n
    -Q user_files_indexing
stdout_logfile=/var/log/celery_worker_user_files_indexing.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true

[program:celery_worker_monitoring]
command=celery -A onyx.background.celery.versioned_apps.monitoring worker
    --loglevel=INFO
@ -108,6 +120,7 @@ command=tail -qF
    /var/log/celery_worker_light.log
    /var/log/celery_worker_heavy.log
    /var/log/celery_worker_indexing.log
    /var/log/celery_worker_user_files_indexing.log
    /var/log/celery_worker_monitoring.log
    /var/log/slack_bot.log
stdout_logfile=/dev/stdout
@ -3,7 +3,7 @@ import os
ADMIN_USER_NAME = "admin_user"

API_SERVER_PROTOCOL = os.getenv("API_SERVER_PROTOCOL") or "http"
API_SERVER_HOST = os.getenv("API_SERVER_HOST") or "localhost"
API_SERVER_HOST = os.getenv("API_SERVER_HOST") or "127.0.0.1"
API_SERVER_PORT = os.getenv("API_SERVER_PORT") or "8080"
API_SERVER_URL = f"{API_SERVER_PROTOCOL}://{API_SERVER_HOST}:{API_SERVER_PORT}"
MAX_DELAY = 60
@ -166,18 +166,24 @@ class DocumentManager:
        }

        # Left this here for debugging purposes.
        # import json
        # for doc in retrieved_docs.values():
        # printable_doc = doc.copy()
        # print(printable_doc.keys())
        # printable_doc.pop("embeddings")
        # printable_doc.pop("title_embedding")
        # print(json.dumps(printable_doc, indent=2))
        import json

        print("DEBUGGING DOCUMENTS")
        print(retrieved_docs)
        for doc in retrieved_docs.values():
            printable_doc = doc.copy()
            print(printable_doc.keys())
            printable_doc.pop("embeddings")
            printable_doc.pop("title_embedding")
            print(json.dumps(printable_doc, indent=2))

        for document in cc_pair.documents:
            retrieved_doc = retrieved_docs.get(document.id)
            if not retrieved_doc:
                if not verify_deleted:
                    print(f"Document not found: {document.id}")
                    print(retrieved_docs.keys())
                    print(retrieved_docs.values())
                    raise ValueError(f"Document not found: {document.id}")
                continue
            if verify_deleted:
@ -139,11 +139,23 @@ class DocumentSetManager:
                break

            if time.time() - start > MAX_DELAY:
                not_synced_doc_sets = [
                    doc_set for doc_set in doc_sets if not doc_set.is_up_to_date
                ]
                raise TimeoutError(
                    f"Document sets were not synced within the {MAX_DELAY} seconds"
                    f"Document sets were not synced within the {MAX_DELAY} seconds. "
                    f"Remaining unsynced document sets: {len(not_synced_doc_sets)}. "
                    f"IDs: {[doc_set.id for doc_set in not_synced_doc_sets]}"
                )
            else:
                print("Document sets were not synced yet, waiting...")
                not_synced_doc_sets = [
                    doc_set for doc_set in doc_sets if not doc_set.is_up_to_date
                ]
                print(
                    f"Document sets were not synced yet, waiting... "
                    f"{len(not_synced_doc_sets)}/{len(doc_sets)} document sets still syncing. "
                    f"IDs: {[doc_set.id for doc_set in not_synced_doc_sets]}"
                )

            time.sleep(2)

@ -176,4 +176,3 @@ ENV NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY=${NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY}
# expose via cli

CMD ["node", "server.js"]

@ -37,6 +37,7 @@ const nextConfig = {
        pathname: "/s2/favicons/**",
      },
    ],
    unoptimized: true, // Disable image optimization to avoid requiring Sharp
  },
  async headers() {
    return [
BIN web/public/Amazon.png (Normal file) | After: 34 KiB | Before: 9.4 KiB
BIN web/public/Google.png (Normal file) | After: 14 KiB | Before: 6.4 KiB
BIN web/public/Productboard.png (Normal file) | After: 1.2 KiB | Before: 718 B
BIN web/public/discord.png (Normal file) | After: 6.2 KiB | Before: 4.0 KiB
@ -64,10 +64,10 @@ import { debounce } from "lodash";
import { LLMProviderView } from "../configuration/llm/interfaces";
import StarterMessagesList from "./StarterMessageList";

import { Switch, SwitchField } from "@/components/ui/switch";
import { SwitchField } from "@/components/ui/switch";
import { generateIdenticon } from "@/components/assistants/AssistantIcon";
import { BackButton } from "@/components/BackButton";
import { Checkbox, CheckboxField } from "@/components/ui/checkbox";
import { Checkbox } from "@/components/ui/checkbox";
import { AdvancedOptionsToggle } from "@/components/AdvancedOptionsToggle";
import { MinimalUserSnapshot } from "@/lib/types";
import { useUserGroups } from "@/lib/hooks";
@ -76,13 +76,31 @@ import {
  Option as DropdownOption,
} from "@/components/Dropdown";
import { SourceChip } from "@/app/chat/input/ChatInputBar";
import { TagIcon, UserIcon, XIcon, InfoIcon } from "lucide-react";
import {
  TagIcon,
  UserIcon,
  FileIcon,
  FolderIcon,
  InfoIcon,
  BookIcon,
} from "lucide-react";
import { LLMSelector } from "@/components/llm/LLMSelector";
import useSWR from "swr";
import { errorHandlingFetcher } from "@/lib/fetcher";
import { ConfirmEntityModal } from "@/components/modals/ConfirmEntityModal";
import Title from "@/components/ui/title";

import { FilePickerModal } from "@/app/chat/my-documents/components/FilePicker";
import { useDocumentsContext } from "@/app/chat/my-documents/DocumentsContext";
import {
  FileResponse,
  FolderResponse,
} from "@/app/chat/my-documents/DocumentsContext";
import { RadioGroup } from "@/components/ui/radio-group";
import { RadioGroupItemField } from "@/components/ui/RadioGroupItemField";
import { SEARCH_TOOL_ID } from "@/app/chat/tools/constants";
import TextView from "@/components/chat/TextView";
import { MinimalOnyxDocument } from "@/lib/search/interfaces";
import { TabToggle } from "@/components/ui/TabToggle";

function findSearchTool(tools: ToolSnapshot[]) {
  return tools.find((tool) => tool.in_code_tool_id === SEARCH_TOOL_ID);
@ -147,6 +165,9 @@ export function AssistantEditor({
|
||||
"#6FFFFF",
|
||||
];
|
||||
|
||||
const [presentingDocument, setPresentingDocument] =
|
||||
useState<MinimalOnyxDocument | null>(null);
|
||||
const [filePickerModalOpen, setFilePickerModalOpen] = useState(false);
|
||||
const [showAdvancedOptions, setShowAdvancedOptions] = useState(false);
|
||||
|
||||
// state to persist across formik reformatting
|
||||
@ -221,6 +242,16 @@ export function AssistantEditor({
|
||||
enabledToolsMap[tool.id] = personaCurrentToolIds.includes(tool.id);
|
||||
});
|
||||
|
||||
const {
|
||||
selectedFiles,
|
||||
selectedFolders,
|
||||
addSelectedFile,
|
||||
removeSelectedFile,
|
||||
addSelectedFolder,
|
||||
removeSelectedFolder,
|
||||
clearSelectedItems,
|
||||
} = useDocumentsContext();
|
||||
|
||||
const [showVisibilityWarning, setShowVisibilityWarning] = useState(false);
|
||||
|
||||
const initialValues = {
|
||||
@ -259,6 +290,9 @@ export function AssistantEditor({
|
||||
(u) => u.id !== existingPersona.owner?.id
|
||||
) ?? [],
|
||||
selectedGroups: existingPersona?.groups ?? [],
|
||||
user_file_ids: existingPersona?.user_file_ids ?? [],
|
||||
user_folder_ids: existingPersona?.user_folder_ids ?? [],
|
||||
knowledge_source: "user_files",
|
||||
is_default_persona: existingPersona?.is_default_persona ?? false,
|
||||
};
|
||||
|
||||
@ -352,6 +386,10 @@ export function AssistantEditor({
|
||||
}
|
||||
}
|
||||
};
|
||||
const canShowKnowledgeSource =
|
||||
ccPairs.length > 0 &&
|
||||
searchTool &&
|
||||
!(user?.role != "admin" && documentSets.length === 0);
|
||||
|
||||
return (
|
||||
<div className="mx-auto max-w-4xl">
|
||||
@ -368,7 +406,26 @@ export function AssistantEditor({
|
||||
<BackButton />
|
||||
</div>
|
||||
)}
|
||||
{filePickerModalOpen && (
|
||||
<FilePickerModal
|
||||
setPresentingDocument={setPresentingDocument}
|
||||
isOpen={filePickerModalOpen}
|
||||
onClose={() => {
|
||||
setFilePickerModalOpen(false);
|
||||
}}
|
||||
onSave={() => {
|
||||
setFilePickerModalOpen(false);
|
||||
}}
|
||||
buttonContent="Add to Assistant"
|
||||
/>
|
||||
)}
|
||||
|
||||
{presentingDocument && (
|
||||
<TextView
|
||||
presentingDocument={presentingDocument}
|
||||
onClose={() => setPresentingDocument(null)}
|
||||
/>
|
||||
)}
|
||||
{labelToDelete && (
|
||||
<ConfirmEntityModal
|
||||
entityType="label"
|
||||
@ -434,6 +491,7 @@ export function AssistantEditor({
|
||||
label_ids: Yup.array().of(Yup.number()),
|
||||
selectedUsers: Yup.array().of(Yup.object()),
|
||||
selectedGroups: Yup.array().of(Yup.number()),
|
||||
knowledge_source: Yup.string().required(),
|
||||
is_default_persona: Yup.boolean().required(),
|
||||
})
|
||||
.test(
|
||||
@ -522,9 +580,12 @@ export function AssistantEditor({
|
||||
? new Date(values.search_start_date)
|
||||
: null,
|
||||
num_chunks: numChunks,
|
||||
user_file_ids: selectedFiles.map((file) => file.id),
|
||||
user_folder_ids: selectedFolders.map((folder) => folder.id),
|
||||
};
|
||||
|
||||
let personaResponse;
|
||||
|
||||
if (isUpdate) {
|
||||
personaResponse = await updatePersona(
|
||||
existingPersona.id,
|
||||
@ -792,10 +853,7 @@ export function AssistantEditor({
|
||||
<Separator />
|
||||
<div className="flex gap-x-2 py-2 flex justify-start">
|
||||
<div>
|
||||
<div
|
||||
className="flex items-start gap-x-2
|
||||
"
|
||||
>
|
||||
<div className="flex items-start gap-x-2">
|
||||
<p className="block font-medium text-sm">
|
||||
Knowledge
|
||||
</p>
|
||||
@ -834,92 +892,170 @@ export function AssistantEditor({
|
||||
</TooltipProvider>
|
||||
</div>
|
||||
</div>
|
||||
<p className="text-sm text-neutral-700 dark:text-neutral-400">
|
||||
Attach additional unique knowledge to this assistant
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
{ccPairs.length > 0 &&
|
||||
searchTool &&
|
||||
values.enabled_tools_map[searchTool.id] &&
|
||||
!(user?.role != "admin" && documentSets.length === 0) && (
|
||||
<CollapsibleSection>
|
||||
<div className="mt-2">
|
||||
{ccPairs.length > 0 && (
|
||||
<>
|
||||
<Label small>Document Sets</Label>
|
||||
<div>
|
||||
<SubLabel>
|
||||
<>
|
||||
Select which{" "}
|
||||
{!user || user.role === "admin" ? (
|
||||
<Link
|
||||
href="/admin/documents/sets"
|
||||
className="font-semibold underline hover:underline text-text"
|
||||
target="_blank"
|
||||
>
|
||||
Document Sets
|
||||
</Link>
|
||||
) : (
|
||||
"Document Sets"
|
||||
)}{" "}
|
||||
this Assistant should use to inform its
|
||||
responses. If none are specified, the
|
||||
Assistant will reference all available
|
||||
documents.
|
||||
</>
|
||||
</SubLabel>
|
||||
{searchTool && values.enabled_tools_map[searchTool.id] && (
|
||||
<div>
|
||||
{canShowKnowledgeSource && (
|
||||
<>
|
||||
<div className="mt-1.5 mb-2.5">
|
||||
<div className="flex gap-2.5">
|
||||
<div
|
||||
className={`w-[150px] h-[110px] rounded-lg border flex flex-col items-center justify-center cursor-pointer transition-all ${
|
||||
values.knowledge_source === "user_files"
|
||||
? "border-2 border-blue-500 bg-blue-50 dark:bg-blue-950/20"
|
||||
: "border-gray-200 hover:border-gray-300 dark:border-gray-700 dark:hover:border-gray-600"
|
||||
}`}
|
||||
onClick={() =>
|
||||
setFieldValue(
|
||||
"knowledge_source",
|
||||
"user_files"
|
||||
)
|
||||
}
|
||||
>
|
||||
<div className="text-blue-500 mb-2">
|
||||
<FileIcon size={24} />
|
||||
</div>
|
||||
<p className="font-medium text-xs">
|
||||
User Knowledge
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{documentSets.length > 0 ? (
|
||||
<FieldArray
|
||||
name="document_set_ids"
|
||||
render={(arrayHelpers: ArrayHelpers) => (
|
||||
<div>
|
||||
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
|
||||
{documentSets.map((documentSet) => (
|
||||
<DocumentSetSelectable
|
||||
key={documentSet.id}
|
||||
documentSet={documentSet}
|
||||
isSelected={values.document_set_ids.includes(
|
||||
documentSet.id
|
||||
)}
|
||||
onSelect={() => {
|
||||
const index =
|
||||
values.document_set_ids.indexOf(
|
||||
documentSet.id
|
||||
);
|
||||
if (index !== -1) {
|
||||
arrayHelpers.remove(index);
|
||||
} else {
|
||||
arrayHelpers.push(
|
||||
documentSet.id
|
||||
);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
/>
|
||||
) : (
|
||||
<p className="text-sm">
|
||||
<Link
|
||||
href="/admin/documents/sets/new"
|
||||
className="text-primary hover:underline"
|
||||
>
|
||||
+ Create Document Set
|
||||
</Link>
|
||||
<div
|
||||
className={`w-[150px] h-[110px] rounded-lg border flex flex-col items-center justify-center cursor-pointer transition-all ${
|
||||
values.knowledge_source === "team_knowledge"
|
||||
? "border-2 border-blue-500 bg-blue-50 dark:bg-blue-950/20"
|
||||
: "border-gray-200 hover:border-gray-300 dark:border-gray-700 dark:hover:border-gray-600"
|
||||
}`}
|
||||
onClick={() =>
|
||||
setFieldValue(
|
||||
"knowledge_source",
|
||||
"team_knowledge"
|
||||
)
|
||||
}
|
||||
>
|
||||
<div className="text-blue-500 mb-2">
|
||||
<BookIcon size={24} />
|
||||
</div>
|
||||
<p className="font-medium text-xs">
|
||||
Team Knowledge
|
||||
</p>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
</CollapsibleSection>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
|
||||
{values.knowledge_source === "user_files" &&
|
||||
!existingPersona?.is_default_persona &&
|
||||
!admin && (
|
||||
<div className="text-sm flex flex-col items-start">
|
||||
<SubLabel>
|
||||
Click below to add documents or folders from the
|
||||
My Document feature
|
||||
</SubLabel>
|
||||
{(selectedFiles.length > 0 ||
|
||||
selectedFolders.length > 0) && (
|
||||
<div className="flex flex-wrap mb-2 max-w-sm gap-2">
|
||||
{selectedFiles.map((file) => (
|
||||
<SourceChip
|
||||
key={file.id}
|
||||
onRemove={() => {}}
|
||||
title={file.name}
|
||||
icon={<FileIcon size={16} />}
|
||||
/>
|
||||
))}
|
||||
{selectedFolders.map((folder) => (
|
||||
<SourceChip
|
||||
key={folder.id}
|
||||
onRemove={() => {}}
|
||||
title={folder.name}
|
||||
icon={<FolderIcon size={16} />}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<button
|
||||
onClick={() => setFilePickerModalOpen(true)}
|
||||
className="text-primary hover:underline"
|
||||
>
|
||||
+ Add User Files
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{values.knowledge_source === "team_knowledge" &&
|
||||
ccPairs.length > 0 && (
|
||||
<div className="mt-4">
|
||||
<div>
|
||||
<SubLabel>
|
||||
<>
|
||||
Select which{" "}
|
||||
{!user || user.role === "admin" ? (
|
||||
<Link
|
||||
href="/admin/documents/sets"
|
||||
className="font-semibold underline hover:underline text-text"
|
||||
target="_blank"
|
||||
>
|
||||
Document Sets
|
||||
</Link>
|
||||
) : (
|
||||
"Team Document Sets"
|
||||
)}{" "}
|
||||
this Assistant should use to inform its
|
||||
responses. If none are specified, the
|
||||
Assistant will reference all available
|
||||
documents.
|
||||
</>
|
||||
</SubLabel>
|
||||
</div>
|
||||
|
||||
{documentSets.length > 0 ? (
|
||||
<FieldArray
|
||||
name="document_set_ids"
|
||||
render={(arrayHelpers: ArrayHelpers) => (
|
||||
<div>
|
||||
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
|
||||
{documentSets.map((documentSet) => (
|
||||
<DocumentSetSelectable
|
||||
key={documentSet.id}
|
||||
documentSet={documentSet}
|
||||
isSelected={values.document_set_ids.includes(
|
||||
documentSet.id
|
||||
)}
|
||||
onSelect={() => {
|
||||
const index =
|
||||
values.document_set_ids.indexOf(
|
||||
documentSet.id
|
||||
);
|
||||
if (index !== -1) {
|
||||
arrayHelpers.remove(index);
|
||||
} else {
|
||||
arrayHelpers.push(documentSet.id);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
/>
|
||||
) : (
|
||||
<p className="text-sm">
|
||||
<Link
|
||||
href="/admin/documents/sets/new"
|
||||
className="text-primary hover:underline"
|
||||
>
|
||||
+ Create Document Set
|
||||
</Link>
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
<Separator />
|
||||
<div className="py-2">
|
||||
|
106
web/src/app/admin/assistants/assistantFileUtils.ts
Normal file
@ -0,0 +1,106 @@
import {
  FileResponse,
  FolderResponse,
} from "@/app/chat/my-documents/DocumentsContext";

export interface AssistantFileChanges {
  filesToShare: number[];
  filesToUnshare: number[];
  foldersToShare: number[];
  foldersToUnshare: number[];
}

export function calculateFileChanges(
  existingFileIds: number[],
  existingFolderIds: number[],
  selectedFiles: FileResponse[],
  selectedFolders: FolderResponse[]
): AssistantFileChanges {
  const selectedFileIds = selectedFiles.map((file) => file.id);
  const selectedFolderIds = selectedFolders.map((folder) => folder.id);

  return {
    filesToShare: selectedFileIds.filter((id) => !existingFileIds.includes(id)),
    filesToUnshare: existingFileIds.filter(
      (id) => !selectedFileIds.includes(id)
    ),
    foldersToShare: selectedFolderIds.filter(
      (id) => !existingFolderIds.includes(id)
    ),
    foldersToUnshare: existingFolderIds.filter(
      (id) => !selectedFolderIds.includes(id)
    ),
  };
}

export async function shareFiles(
  assistantId: number,
  fileIds: number[]
): Promise<void> {
  for (const fileId of fileIds) {
    await fetch(`/api/user/file/${fileId}/share`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ assistant_id: assistantId }),
    });
  }
}

export async function unshareFiles(
  assistantId: number,
  fileIds: number[]
): Promise<void> {
  for (const fileId of fileIds) {
    await fetch(`/api/user/file/${fileId}/unshare`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ assistant_id: assistantId }),
    });
  }
}

export async function shareFolders(
  assistantId: number,
  folderIds: number[]
): Promise<void> {
  for (const folderId of folderIds) {
    await fetch(`/api/user/folder/${folderId}/share`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ assistant_id: assistantId }),
    });
  }
}

export async function unshareFolders(
  assistantId: number,
  folderIds: number[]
): Promise<void> {
  for (const folderId of folderIds) {
    await fetch(`/api/user/folder/${folderId}/unshare`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ assistant_id: assistantId }),
    });
  }
}

export async function updateAssistantFiles(
  assistantId: number,
  changes: AssistantFileChanges
): Promise<void> {
  await Promise.all([
    shareFiles(assistantId, changes.filesToShare),
    unshareFiles(assistantId, changes.filesToUnshare),
    shareFolders(assistantId, changes.foldersToShare),
    unshareFolders(assistantId, changes.foldersToUnshare),
  ]);
}
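For orientation, a minimal hypothetical sketch of how these helpers could be wired together when saving an assistant; the function name and the ID values below are placeholders, not part of this PR:

// Hypothetical usage sketch (not from the diff): diff the previously shared
// files/folders against the current picker selection, then sync via the API.
import {
  calculateFileChanges,
  updateAssistantFiles,
} from "@/app/admin/assistants/assistantFileUtils";
import {
  FileResponse,
  FolderResponse,
} from "@/app/chat/my-documents/DocumentsContext";

async function syncAssistantKnowledge(
  assistantId: number,
  existingFileIds: number[],
  existingFolderIds: number[],
  selectedFiles: FileResponse[],
  selectedFolders: FolderResponse[]
): Promise<void> {
  // Work out which files/folders were newly selected vs. deselected.
  const changes = calculateFileChanges(
    existingFileIds,
    existingFolderIds,
    selectedFiles,
    selectedFolders
  );
  // Fires the share/unshare requests for files and folders in parallel.
  await updateAssistantFiles(assistantId, changes);
}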
@ -45,6 +45,8 @@ export interface Persona {
  icon_color?: string;
  uploaded_image_id?: string;
  labels?: PersonaLabel[];
  user_file_ids: number[];
  user_folder_ids: number[];
}

export interface PersonaLabel {

@ -29,6 +29,8 @@ interface PersonaUpsertRequest {
  is_default_persona: boolean;
  display_priority: number | null;
  label_ids: number[] | null;
  user_file_ids: number[] | null;
  user_folder_ids: number[] | null;
}

export interface PersonaUpsertParameters {
@ -56,6 +58,8 @@ export interface PersonaUpsertParameters {
  uploaded_image: File | null;
  is_default_persona: boolean;
  label_ids: number[] | null;
  user_file_ids: number[];
  user_folder_ids: number[];
}

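To make the new fields concrete, a small hypothetical sketch showing only the additions to the upsert payload; the IDs are placeholders, the rest of the request is omitted, and the sketch assumes module-local access since PersonaUpsertRequest is not exported:

// Hypothetical illustration of the fields added in this PR only.
const fileScopedFields: Partial<PersonaUpsertRequest> = {
  user_file_ids: [101, 102], // placeholder file IDs from the My Documents picker
  user_folder_ids: [7], // placeholder folder ID
};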
export const createPersonaLabel = (name: string) => {
@ -114,7 +118,10 @@ function buildPersonaUpsertRequest(
    icon_shape,
    remove_image,
    search_start_date,
    user_file_ids,
    user_folder_ids,
  } = creationRequest;

  return {
    name,
    description,
@ -145,6 +152,8 @@ function buildPersonaUpsertRequest(
    starter_messages: creationRequest.starter_messages ?? null,
    display_priority: null,
    label_ids: creationRequest.label_ids ?? null,
    user_file_ids: user_file_ids ?? null,
    user_folder_ids: user_folder_ids ?? null,
  };
}

@ -175,7 +184,6 @@ export async function createPersona(
      return null;
    }
  }

  const createPersonaResponse = await fetch("/api/persona", {
    method: "POST",
    headers: {
@ -345,4 +353,6 @@ export const defaultPersona: Persona = {
  owner: null,
  icon_shape: 50910,
  icon_color: "#FF6F6F",
  user_file_ids: [],
  user_folder_ids: [],
};
@ -36,6 +36,12 @@ export interface WellKnownLLMProviderDescriptor {
  groups: number[];
}

export interface LLMModelDescriptor {
  modelName: string;
  provider: string;
  maxTokens: number;
}

export interface LLMProvider {
  name: string;
  provider: string;
@ -49,6 +55,7 @@ export interface LLMProvider {
  groups: number[];
  display_model_names: string[] | null;
  deployment_name: string | null;
  model_token_limits: { [key: string]: number } | null;
  default_vision_model: string | null;
  is_default_vision_provider: boolean | null;
}
@ -74,6 +81,7 @@ export interface LLMProviderDescriptor {
  is_public: boolean;
  groups: number[];
  display_model_names: string[] | null;
  model_token_limits: { [key: string]: number } | null;
}

export const getProviderIcon = (providerName: string, modelName?: string) => {
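As a quick illustration of the LLMModelDescriptor shape introduced above, a hypothetical value; the model name, provider key, and token limit are placeholders, not values taken from this PR:

// Hypothetical example object conforming to the new interface.
const exampleModel: LLMModelDescriptor = {
  modelName: "gpt-4o", // placeholder model name
  provider: "openai", // placeholder provider key
  maxTokens: 128000, // placeholder context window size
};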
@ -434,7 +434,7 @@ export default function AddConnector({
|
||||
>
|
||||
{(formikProps) => {
|
||||
return (
|
||||
<div className="mx-auto mb-8 w-full">
|
||||
<div className="mx-auto w-full">
|
||||
{popup}
|
||||
|
||||
{uploading && (
|
||||
|
@ -221,6 +221,7 @@ border border-border dark:border-neutral-700
|
||||
<TableCell>
|
||||
{timeAgo(ccPairsIndexingStatus?.last_success) || "-"}
|
||||
</TableCell>
|
||||
|
||||
<TableCell>{getActivityBadge()}</TableCell>
|
||||
{isPaidEnterpriseFeaturesEnabled && (
|
||||
<TableCell>
|
||||
@ -251,12 +252,19 @@ border border-border dark:border-neutral-700
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
{isEditable && (
|
||||
<CustomTooltip content="Manage Connector">
|
||||
<FiSettings
|
||||
className="cursor-pointer"
|
||||
onClick={handleManageClick}
|
||||
/>
|
||||
</CustomTooltip>
|
||||
<TooltipProvider>
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<FiSettings
|
||||
className="cursor-pointer"
|
||||
onClick={handleManageClick}
|
||||
/>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent>
|
||||
<p>Manage Connector</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</TooltipProvider>
|
||||
)}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
|
@ -21,18 +21,20 @@ import { HistorySidebar } from "../chat/sessionSidebar/HistorySidebar";
|
||||
import { useAssistants } from "@/components/context/AssistantsContext";
|
||||
import AssistantModal from "./mine/AssistantModal";
|
||||
import { useSidebarShortcut } from "@/lib/browserUtilities";
|
||||
import { UserSettingsModal } from "../chat/modal/UserSettingsModal";
|
||||
import { usePopup } from "@/components/admin/connectors/Popup";
|
||||
import { useUser } from "@/components/user/UserProvider";
|
||||
|
||||
interface SidebarWrapperProps<T extends object> {
|
||||
initiallyToggled: boolean;
|
||||
size?: "sm" | "lg";
|
||||
children: ReactNode;
|
||||
}
|
||||
|
||||
export default function SidebarWrapper<T extends object>({
|
||||
initiallyToggled,
|
||||
size = "sm",
|
||||
children,
|
||||
}: SidebarWrapperProps<T>) {
|
||||
const { sidebarInitiallyVisible: initiallyToggled } = useChatContext();
|
||||
const [sidebarVisible, setSidebarVisible] = useState(initiallyToggled);
|
||||
const [showDocSidebar, setShowDocSidebar] = useState(false); // State to track if sidebar is open
|
||||
// Used to maintain a "time out" for history sidebar so our existing refs can have time to process change
|
||||
@ -61,6 +63,7 @@ export default function SidebarWrapper<T extends object>({
|
||||
}, 200);
|
||||
};
|
||||
|
||||
const { popup, setPopup } = usePopup();
|
||||
const settings = useContext(SettingsContext);
|
||||
useSidebarVisibility({
|
||||
sidebarVisible,
|
||||
@ -70,13 +73,18 @@ export default function SidebarWrapper<T extends object>({
|
||||
mobile: settings?.isMobile,
|
||||
});
|
||||
|
||||
const { user } = useUser();
|
||||
const [showAssistantsModal, setShowAssistantsModal] = useState(false);
|
||||
const router = useRouter();
|
||||
const [userSettingsToggled, setUserSettingsToggled] = useState(false);
|
||||
|
||||
const { llmProviders } = useChatContext();
|
||||
useSidebarShortcut(router, toggleSidebar);
|
||||
|
||||
return (
|
||||
<div className="flex relative overflow-x-hidden overscroll-contain flex-col w-full h-screen">
|
||||
{popup}
|
||||
|
||||
{showAssistantsModal && (
|
||||
<AssistantModal hideModal={() => setShowAssistantsModal(false)} />
|
||||
)}
|
||||
@ -114,9 +122,19 @@ export default function SidebarWrapper<T extends object>({
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
{userSettingsToggled && (
|
||||
<UserSettingsModal
|
||||
setPopup={setPopup}
|
||||
llmProviders={llmProviders}
|
||||
onClose={() => setUserSettingsToggled(false)}
|
||||
defaultModel={user?.preferences?.default_model!}
|
||||
/>
|
||||
)}
|
||||
|
||||
<div className="absolute px-2 left-0 w-full top-0">
|
||||
<FunctionalHeader
|
||||
removeHeight={true}
|
||||
toggleUserSettings={() => setUserSettingsToggled(true)}
|
||||
sidebarToggled={sidebarVisible}
|
||||
toggleSidebar={toggleSidebar}
|
||||
page="chat"
|
||||
@ -135,13 +153,7 @@ export default function SidebarWrapper<T extends object>({
|
||||
${sidebarVisible ? "w-[250px]" : "w-[0px]"}`}
|
||||
/>
|
||||
|
||||
<div
|
||||
className={`mt-4 w-full ${
|
||||
size == "lg" ? "max-w-4xl" : "max-w-3xl"
|
||||
} mx-auto`}
|
||||
>
|
||||
{children}
|
||||
</div>
|
||||
<div className={` w-full mx-auto`}>{children}</div>
|
||||
</div>
|
||||
</div>
|
||||
<FixedLogo backgroundToggled={sidebarVisible || showDocSidebar} />
|
||||
|
@ -24,6 +24,7 @@ import {
|
||||
constructSubQuestions,
|
||||
DocumentsResponse,
|
||||
AgenticMessageResponseIDInfo,
|
||||
UserKnowledgeFilePacket,
|
||||
} from "./interfaces";
|
||||
|
||||
import Prism from "prismjs";
|
||||
@ -35,7 +36,6 @@ import {
|
||||
buildChatUrl,
|
||||
buildLatestMessageChain,
|
||||
createChatSession,
|
||||
deleteAllChatSessions,
|
||||
getCitedDocumentsFromMessage,
|
||||
getHumanAndAIMessageFromMessageNumber,
|
||||
getLastSuccessfulMessageId,
|
||||
@ -66,7 +66,6 @@ import {
|
||||
} from "react";
|
||||
import { usePopup } from "@/components/admin/connectors/Popup";
|
||||
import { SEARCH_PARAM_NAMES, shouldSubmitOnLoad } from "./searchParams";
|
||||
import { useDocumentSelection } from "./useDocumentSelection";
|
||||
import { LlmDescriptor, useFilters, useLlmManager } from "@/lib/hooks";
|
||||
import { ChatState, FeedbackType, RegenerationState } from "./types";
|
||||
import { DocumentResults } from "./documentSidebar/DocumentResults";
|
||||
@ -87,6 +86,7 @@ import {
|
||||
SubQuestionPiece,
|
||||
AgentAnswerPiece,
|
||||
RefinedAnswerImprovement,
|
||||
MinimalOnyxDocument,
|
||||
} from "@/lib/search/interfaces";
|
||||
import { buildFilters } from "@/lib/search/utils";
|
||||
import { SettingsContext } from "@/components/settings/SettingsProvider";
|
||||
@ -100,14 +100,13 @@ import { ChatInputBar } from "./input/ChatInputBar";
|
||||
import { useChatContext } from "@/components/context/ChatContext";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { ChatPopup } from "./ChatPopup";
|
||||
|
||||
import FunctionalHeader from "@/components/chat/Header";
|
||||
import { useSidebarVisibility } from "@/components/chat/hooks";
|
||||
import {
|
||||
PRO_SEARCH_TOGGLED_COOKIE_NAME,
|
||||
SIDEBAR_TOGGLED_COOKIE_NAME,
|
||||
} from "@/components/resizable/constants";
|
||||
import FixedLogo from "../../components/logo/FixedLogo";
|
||||
import FixedLogo from "@/components/logo/FixedLogo";
|
||||
|
||||
import ExceptionTraceModal from "@/components/modals/ExceptionTraceModal";
|
||||
|
||||
@ -134,6 +133,16 @@ import { UserSettingsModal } from "./modal/UserSettingsModal";
|
||||
import { AgenticMessage } from "./message/AgenticMessage";
|
||||
import AssistantModal from "../assistants/mine/AssistantModal";
|
||||
import { useSidebarShortcut } from "@/lib/browserUtilities";
|
||||
import { FilePickerModal } from "./my-documents/components/FilePicker";
|
||||
|
||||
import { SourceMetadata } from "@/lib/search/interfaces";
|
||||
import { ValidSources } from "@/lib/types";
|
||||
import {
|
||||
FileUploadResponse,
|
||||
FileResponse,
|
||||
FolderResponse,
|
||||
useDocumentsContext,
|
||||
} from "./my-documents/DocumentsContext";
|
||||
import { ChatSearchModal } from "./chat_search/ChatSearchModal";
|
||||
import { ErrorBanner } from "./message/Resubmit";
|
||||
import MinimalMarkdown from "@/components/chat/MinimalMarkdown";
|
||||
@ -147,11 +156,15 @@ export function ChatPage({
|
||||
documentSidebarInitialWidth,
|
||||
sidebarVisible,
|
||||
firstMessage,
|
||||
initialFolders,
|
||||
initialFiles,
|
||||
}: {
|
||||
toggle: (toggled?: boolean) => void;
|
||||
documentSidebarInitialWidth?: number;
|
||||
sidebarVisible: boolean;
|
||||
firstMessage?: string;
|
||||
initialFolders?: any;
|
||||
initialFiles?: any;
|
||||
}) {
|
||||
const router = useRouter();
|
||||
const searchParams = useSearchParams();
|
||||
@ -168,11 +181,27 @@ export function ChatPage({
|
||||
proSearchToggled,
|
||||
} = useChatContext();
|
||||
|
||||
const {
|
||||
selectedFiles,
|
||||
selectedFolders,
|
||||
addSelectedFile,
|
||||
addSelectedFolder,
|
||||
removeSelectedFolder,
|
||||
clearSelectedItems,
|
||||
folders: userFolders,
|
||||
files: allUserFiles,
|
||||
uploadFile,
|
||||
removeSelectedFile,
|
||||
currentMessageFiles,
|
||||
setCurrentMessageFiles,
|
||||
} = useDocumentsContext();
|
||||
|
||||
const defaultAssistantIdRaw = searchParams.get(SEARCH_PARAM_NAMES.PERSONA_ID);
|
||||
const defaultAssistantId = defaultAssistantIdRaw
|
||||
? parseInt(defaultAssistantIdRaw)
|
||||
: undefined;
|
||||
|
||||
// Function declarations need to be outside of blocks in strict mode
|
||||
function useScreenSize() {
|
||||
const [screenSize, setScreenSize] = useState({
|
||||
width: typeof window !== "undefined" ? window.innerWidth : 0,
|
||||
@ -201,6 +230,8 @@ export function ChatPage({
|
||||
const settings = useContext(SettingsContext);
|
||||
const enterpriseSettings = settings?.enterpriseSettings;
|
||||
|
||||
const [viewingFilePicker, setViewingFilePicker] = useState(false);
|
||||
const [toggleDocSelection, setToggleDocSelection] = useState(false);
|
||||
const [documentSidebarVisible, setDocumentSidebarVisible] = useState(false);
|
||||
const [proSearchEnabled, setProSearchEnabled] = useState(proSearchToggled);
|
||||
const toggleProSearch = () => {
|
||||
@ -297,16 +328,6 @@ export function ChatPage({
|
||||
SEARCH_PARAM_NAMES.TEMPERATURE
|
||||
);
|
||||
|
||||
const defaultTemperature = search_param_temperature
|
||||
? parseFloat(search_param_temperature)
|
||||
: selectedAssistant?.tools.some(
|
||||
(tool) =>
|
||||
tool.in_code_tool_id === SEARCH_TOOL_ID ||
|
||||
tool.in_code_tool_id === INTERNET_SEARCH_TOOL_ID
|
||||
)
|
||||
? 0
|
||||
: 0.7;
|
||||
|
||||
const setSelectedAssistantFromId = (assistantId: number) => {
|
||||
// NOTE: also intentionally look through available assistants here, so that
|
||||
// even if the user has hidden an assistant they can still go back to it
|
||||
@ -320,7 +341,7 @@ export function ChatPage({
|
||||
useState<Persona | null>(null);
|
||||
|
||||
const [presentingDocument, setPresentingDocument] =
|
||||
useState<OnyxDocument | null>(null);
|
||||
useState<MinimalOnyxDocument | null>(null);
|
||||
|
||||
// Current assistant is decided based on this ordering
|
||||
// 1. Alternative assistant (assistant selected explicitly by user)
|
||||
@ -350,9 +371,14 @@ export function ChatPage({
|
||||
|
||||
const noAssistants = liveAssistant == null || liveAssistant == undefined;
|
||||
|
||||
const availableSources = ccPairs.map((ccPair) => ccPair.source);
|
||||
const uniqueSources = Array.from(new Set(availableSources));
|
||||
const sources = uniqueSources.map((source) => getSourceMetadata(source));
|
||||
const availableSources: ValidSources[] = useMemo(() => {
|
||||
return ccPairs.map((ccPair) => ccPair.source);
|
||||
}, [ccPairs]);
|
||||
|
||||
const sources: SourceMetadata[] = useMemo(() => {
|
||||
const uniqueSources = Array.from(new Set(availableSources));
|
||||
return uniqueSources.map((source) => getSourceMetadata(source));
|
||||
}, [availableSources]);
|
||||
|
||||
const stopGenerating = () => {
|
||||
const currentSession = currentSessionId();
|
||||
@ -426,7 +452,6 @@ export function ChatPage({
|
||||
const isChatSessionSwitch = existingChatSessionId !== priorChatSessionId;
|
||||
if (isChatSessionSwitch) {
|
||||
// de-select documents
|
||||
clearSelectedDocuments();
|
||||
|
||||
// reset all filters
|
||||
filterManager.setSelectedDocumentSets([]);
|
||||
@ -440,6 +465,7 @@ export function ChatPage({
|
||||
// if switching from one chat to another, then need to scroll again
|
||||
// if we're creating a brand new chat, then don't need to scroll
|
||||
if (chatSessionIdRef.current !== null) {
|
||||
clearSelectedDocuments();
|
||||
setHasPerformedInitialScroll(false);
|
||||
}
|
||||
}
|
||||
@ -466,7 +492,6 @@ export function ChatPage({
|
||||
return;
|
||||
}
|
||||
|
||||
clearSelectedDocuments();
|
||||
setIsFetchingChatMessages(true);
|
||||
const response = await fetch(
|
||||
`/api/chat/get-chat-session/${existingChatSessionId}`
|
||||
@ -549,6 +574,37 @@ export function ChatPage({
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [existingChatSessionId, searchParams.get(SEARCH_PARAM_NAMES.PERSONA_ID)]);
|
||||
|
||||
useEffect(() => {
|
||||
const userFolderId = searchParams.get(SEARCH_PARAM_NAMES.USER_FOLDER_ID);
|
||||
const allMyDocuments = searchParams.get(
|
||||
SEARCH_PARAM_NAMES.ALL_MY_DOCUMENTS
|
||||
);
|
||||
|
||||
if (userFolderId) {
|
||||
const userFolder = userFolders.find(
|
||||
(folder) => folder.id === parseInt(userFolderId)
|
||||
);
|
||||
if (userFolder) {
|
||||
addSelectedFolder(userFolder);
|
||||
}
|
||||
} else if (allMyDocuments === "true" || allMyDocuments === "1") {
|
||||
// Clear any previously selected folders
|
||||
|
||||
clearSelectedItems();
|
||||
|
||||
// Add all user folders to the current context
|
||||
userFolders.forEach((folder) => {
|
||||
addSelectedFolder(folder);
|
||||
});
|
||||
}
|
||||
}, [
|
||||
userFolders,
|
||||
searchParams.get(SEARCH_PARAM_NAMES.USER_FOLDER_ID),
|
||||
searchParams.get(SEARCH_PARAM_NAMES.ALL_MY_DOCUMENTS),
|
||||
addSelectedFolder,
|
||||
clearSelectedItems,
|
||||
]);
|
||||
|
||||
const [message, setMessage] = useState(
|
||||
searchParams.get(SEARCH_PARAM_NAMES.USER_PROMPT) || ""
|
||||
);
|
||||
@ -793,22 +849,17 @@ export function ChatPage({
|
||||
const currentSessionChatState = currentChatState();
|
||||
const currentSessionRegenerationState = currentRegenerationState();
|
||||
|
||||
// uploaded files
|
||||
const [currentMessageFiles, setCurrentMessageFiles] = useState<
|
||||
FileDescriptor[]
|
||||
>([]);
|
||||
|
||||
// for document display
|
||||
// NOTE: -1 is a special designation that means the latest AI message
|
||||
const [selectedMessageForDocDisplay, setSelectedMessageForDocDisplay] =
|
||||
useState<number | null>(null);
|
||||
|
||||
const { aiMessage } = selectedMessageForDocDisplay
|
||||
const { aiMessage, humanMessage } = selectedMessageForDocDisplay
|
||||
? getHumanAndAIMessageFromMessageNumber(
|
||||
messageHistory,
|
||||
selectedMessageForDocDisplay
|
||||
)
|
||||
: { aiMessage: null };
|
||||
: { aiMessage: null, humanMessage: null };
|
||||
|
||||
const [chatSessionSharedStatus, setChatSessionSharedStatus] =
|
||||
useState<ChatSessionSharedStatus>(ChatSessionSharedStatus.Private);
|
||||
@ -834,13 +885,6 @@ export function ChatPage({
|
||||
);
|
||||
}
|
||||
}, [submittedMessage, currentSessionChatState]);
|
||||
|
||||
const [
|
||||
selectedDocuments,
|
||||
toggleDocumentSelection,
|
||||
clearSelectedDocuments,
|
||||
selectedDocumentTokens,
|
||||
] = useDocumentSelection();
|
||||
// just choose a conservative default, this will be updated in the
|
||||
// background on initial load / on persona change
|
||||
const [maxTokens, setMaxTokens] = useState<number>(4096);
|
||||
@ -1310,6 +1354,7 @@ export function ChatPage({
|
||||
let includeAgentic = false;
|
||||
let secondLevelMessageId: number | null = null;
|
||||
let isAgentic: boolean = false;
|
||||
let files: FileDescriptor[] = [];
|
||||
|
||||
let initialFetchDetails: null | {
|
||||
user_message_id: number;
|
||||
@ -1341,7 +1386,9 @@ export function ChatPage({
|
||||
filterManager.selectedSources,
|
||||
filterManager.selectedDocumentSets,
|
||||
filterManager.timeRange,
|
||||
filterManager.selectedTags
|
||||
filterManager.selectedTags,
|
||||
selectedFiles.map((file) => file.id),
|
||||
selectedFolders.map((folder) => folder.id)
|
||||
),
|
||||
selectedDocumentIds: selectedDocuments
|
||||
.filter(
|
||||
@ -1351,6 +1398,11 @@ export function ChatPage({
|
||||
.map((document) => document.db_doc_id as number),
|
||||
queryOverride,
|
||||
forceSearch,
|
||||
userFolderIds: selectedFolders.map((folder) => folder.id),
|
||||
userFileIds: selectedFiles
|
||||
.filter((file) => file.id !== undefined && file.id !== null)
|
||||
.map((file) => file.id),
|
||||
|
||||
regenerate: regenerationRequest !== undefined,
|
||||
modelProvider:
|
||||
modelOverride?.name || llmManager.currentLlm.name || undefined,
|
||||
@ -1414,7 +1466,7 @@ export function ChatPage({
|
||||
: user_message_id,
|
||||
message: currMessage,
|
||||
type: "user",
|
||||
files: currentMessageFiles,
|
||||
files: files,
|
||||
toolCall: null,
|
||||
parentMessageId: parentMessage?.messageId || SYSTEM_MESSAGE_ID,
|
||||
},
|
||||
@ -1473,6 +1525,15 @@ export function ChatPage({
|
||||
second_level_generating = true;
|
||||
}
|
||||
}
|
||||
if (Object.hasOwn(packet, "user_files")) {
|
||||
const userFiles = (packet as UserKnowledgeFilePacket).user_files;
|
||||
// Ensure files are unique by id
|
||||
const newUserFiles = userFiles.filter(
|
||||
(newFile) =>
|
||||
!files.some((existingFile) => existingFile.id === newFile.id)
|
||||
);
|
||||
files = files.concat(newUserFiles);
|
||||
}
|
||||
if (Object.hasOwn(packet, "is_agentic")) {
|
||||
isAgentic = (packet as any).is_agentic;
|
||||
}
|
||||
@ -1676,7 +1737,7 @@ export function ChatPage({
|
||||
: initialFetchDetails.user_message_id!,
|
||||
message: currMessage,
|
||||
type: "user",
|
||||
files: currentMessageFiles,
|
||||
files: files,
|
||||
toolCall: null,
|
||||
parentMessageId: error ? null : lastSuccessfulMessageId,
|
||||
childrenMessageIds: [
|
||||
@ -1853,38 +1914,18 @@ export function ChatPage({
|
||||
return;
|
||||
}
|
||||
|
||||
const tempFileDescriptors = acceptedFiles.map((file) => ({
|
||||
id: uuidv4(),
|
||||
type: file.type.startsWith("image/")
|
||||
? ChatFileType.IMAGE
|
||||
: ChatFileType.DOCUMENT,
|
||||
isUploading: true,
|
||||
}));
|
||||
|
||||
// only show loading spinner for reasonably large files
|
||||
const totalSize = acceptedFiles.reduce((sum, file) => sum + file.size, 0);
|
||||
if (totalSize > 50 * 1024) {
|
||||
setCurrentMessageFiles((prev) => [...prev, ...tempFileDescriptors]);
|
||||
}
|
||||
|
||||
const removeTempFiles = (prev: FileDescriptor[]) => {
|
||||
return prev.filter(
|
||||
(file) => !tempFileDescriptors.some((newFile) => newFile.id === file.id)
|
||||
);
|
||||
};
|
||||
updateChatState("uploading", currentSessionId());
|
||||
|
||||
await uploadFilesForChat(acceptedFiles).then(([files, error]) => {
|
||||
if (error) {
|
||||
setCurrentMessageFiles((prev) => removeTempFiles(prev));
|
||||
setPopup({
|
||||
type: "error",
|
||||
message: error,
|
||||
});
|
||||
} else {
|
||||
setCurrentMessageFiles((prev) => [...removeTempFiles(prev), ...files]);
|
||||
}
|
||||
});
|
||||
const [uploadedFiles, error] = await uploadFilesForChat(acceptedFiles);
|
||||
if (error) {
|
||||
setPopup({
|
||||
type: "error",
|
||||
message: error,
|
||||
});
|
||||
}
|
||||
|
||||
setCurrentMessageFiles((prev) => [...prev, ...uploadedFiles]);
|
||||
|
||||
updateChatState("input", currentSessionId());
|
||||
};
|
||||
|
||||
@ -1948,7 +1989,10 @@ export function ChatPage({
|
||||
useEffect(() => {
|
||||
if (liveAssistant) {
|
||||
const hasSearchTool = liveAssistant.tools.some(
|
||||
(tool) => tool.in_code_tool_id === SEARCH_TOOL_ID
|
||||
(tool) =>
|
||||
tool.in_code_tool_id === SEARCH_TOOL_ID &&
|
||||
liveAssistant.user_file_ids?.length == 0 &&
|
||||
liveAssistant.user_folder_ids?.length == 0
|
||||
);
|
||||
setRetrievalEnabled(hasSearchTool);
|
||||
if (!hasSearchTool) {
|
||||
@ -1960,7 +2004,10 @@ export function ChatPage({
|
||||
const [retrievalEnabled, setRetrievalEnabled] = useState(() => {
|
||||
if (liveAssistant) {
|
||||
return liveAssistant.tools.some(
|
||||
(tool) => tool.in_code_tool_id === SEARCH_TOOL_ID
|
||||
(tool) =>
|
||||
tool.in_code_tool_id === SEARCH_TOOL_ID &&
|
||||
liveAssistant.user_file_ids?.length == 0 &&
|
||||
liveAssistant.user_folder_ids?.length == 0
|
||||
);
|
||||
}
|
||||
return false;
|
||||
@ -1978,6 +2025,12 @@ export function ChatPage({
|
||||
|
||||
const innerSidebarElementRef = useRef<HTMLDivElement>(null);
|
||||
const [settingsToggled, setSettingsToggled] = useState(false);
|
||||
|
||||
const [selectedDocuments, setSelectedDocuments] = useState<OnyxDocument[]>(
|
||||
[]
|
||||
);
|
||||
const [selectedDocumentTokens, setSelectedDocumentTokens] = useState(0);
|
||||
|
||||
const currentPersona = alternativeAssistant || liveAssistant;
|
||||
|
||||
const HORIZON_DISTANCE = 800;
|
||||
@ -2054,6 +2107,42 @@ export function ChatPage({
|
||||
useEffect(() => {
|
||||
abortControllersRef.current = abortControllers;
|
||||
}, [abortControllers]);
|
||||
useEffect(() => {
|
||||
const calculateTokensAndUpdateSearchMode = async () => {
|
||||
if (selectedFiles.length > 0 || selectedFolders.length > 0) {
|
||||
try {
|
||||
// Prepare the query parameters for the API call
|
||||
const fileIds = selectedFiles.map((file: FileResponse) => file.id);
|
||||
const folderIds = selectedFolders.map(
|
||||
(folder: FolderResponse) => folder.id
|
||||
);
|
||||
|
||||
// Build the query string
|
||||
const queryParams = new URLSearchParams();
|
||||
fileIds.forEach((id) =>
|
||||
queryParams.append("file_ids", id.toString())
|
||||
);
|
||||
folderIds.forEach((id) =>
|
||||
queryParams.append("folder_ids", id.toString())
|
||||
);
|
||||
|
||||
// Make the API call to get token estimate
|
||||
const response = await fetch(
|
||||
`/api/user/file/token-estimate?${queryParams.toString()}`
|
||||
);
|
||||
|
||||
if (!response.ok) {
|
||||
console.error("Failed to fetch token estimate");
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error calculating tokens:", error);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
calculateTokensAndUpdateSearchMode();
|
||||
}, [selectedFiles, selectedFolders, llmManager.currentLlm]);
|
||||
|
||||
useSidebarShortcut(router, toggleSidebar);
|
||||
|
||||
@ -2073,6 +2162,7 @@ export function ChatPage({
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// We call onSubmit, passing a `messageOverride`
|
||||
onSubmit({
|
||||
messageIdToResend: lastUserMsg.messageId,
|
||||
@ -2122,6 +2212,20 @@ export function ChatPage({
|
||||
</>
|
||||
);
|
||||
|
||||
const clearSelectedDocuments = () => {
|
||||
setSelectedDocuments([]);
|
||||
setSelectedDocumentTokens(0);
|
||||
clearSelectedItems();
|
||||
};
|
||||
|
||||
const toggleDocumentSelection = (document: OnyxDocument) => {
|
||||
setSelectedDocuments((prev) =>
|
||||
prev.some((d) => d.document_id === document.document_id)
|
||||
? prev.filter((d) => d.document_id !== document.document_id)
|
||||
: [...prev, document]
|
||||
);
|
||||
};
|
||||
|
||||
return (
|
||||
<>
|
||||
<HealthCheckBanner />
|
||||
@ -2168,6 +2272,18 @@ export function ChatPage({
|
||||
/>
|
||||
)}
|
||||
|
||||
{toggleDocSelection && (
|
||||
<FilePickerModal
|
||||
setPresentingDocument={setPresentingDocument}
|
||||
buttonContent="Set as Context"
|
||||
isOpen={true}
|
||||
onClose={() => setToggleDocSelection(false)}
|
||||
onSave={() => {
|
||||
setToggleDocSelection(false);
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
|
||||
<ChatSearchModal
|
||||
open={isChatSearchModalOpen}
|
||||
onCloseModal={() => setIsChatSearchModalOpen(false)}
|
||||
@ -2189,6 +2305,7 @@ export function ChatPage({
|
||||
? true
|
||||
: false
|
||||
}
|
||||
humanMessage={humanMessage}
|
||||
setPresentingDocument={setPresentingDocument}
|
||||
modal={true}
|
||||
ref={innerSidebarElementRef}
|
||||
@ -2344,6 +2461,7 @@ export function ChatPage({
|
||||
`}
|
||||
>
|
||||
<DocumentResults
|
||||
humanMessage={humanMessage}
|
||||
agenticMessage={
|
||||
aiMessage?.sub_questions?.length! > 0 ||
|
||||
messageHistory.find(
|
||||
@ -2527,6 +2645,9 @@ export function ChatPage({
|
||||
key={messageReactComponentKey}
|
||||
>
|
||||
<HumanMessage
|
||||
setPresentingDocument={
|
||||
setPresentingDocument
|
||||
}
|
||||
disableSwitchingForStreaming={
|
||||
(nextMessage &&
|
||||
nextMessage.is_generating) ||
|
||||
@ -2624,6 +2745,18 @@ export function ChatPage({
|
||||
? messageHistory[i + 1]
|
||||
: undefined;
|
||||
|
||||
const attachedFileDescriptors =
|
||||
previousMessage?.files.filter(
|
||||
(file) =>
|
||||
file.type == ChatFileType.USER_KNOWLEDGE
|
||||
);
|
||||
const userFiles = allUserFiles?.filter((file) =>
|
||||
attachedFileDescriptors?.some(
|
||||
(descriptor) =>
|
||||
descriptor.id === file.file_id
|
||||
)
|
||||
);
|
||||
|
||||
return (
|
||||
<div
|
||||
className="text-text"
|
||||
@ -2812,6 +2945,7 @@ export function ChatPage({
|
||||
/>
|
||||
) : (
|
||||
<AIMessage
|
||||
userKnowledgeFiles={userFiles}
|
||||
docs={
|
||||
message?.documents &&
|
||||
message?.documents.length > 0
|
||||
@ -3010,6 +3144,7 @@ export function ChatPage({
|
||||
messageHistory[messageHistory.length - 1]
|
||||
?.type != "user")) && (
|
||||
<HumanMessage
|
||||
setPresentingDocument={setPresentingDocument}
|
||||
key={-2}
|
||||
messageId={-1}
|
||||
content={submittedMessage}
|
||||
@ -3102,21 +3237,23 @@ export function ChatPage({
|
||||
clearSelectedDocuments();
|
||||
}}
|
||||
retrievalEnabled={retrievalEnabled}
|
||||
toggleDocSelection={() =>
|
||||
setToggleDocSelection(true)
|
||||
}
|
||||
showConfigureAPIKey={() =>
|
||||
setShowApiKeyModal(true)
|
||||
}
|
||||
chatState={currentSessionChatState}
|
||||
stopGenerating={stopGenerating}
|
||||
selectedDocuments={selectedDocuments}
|
||||
// assistant stuff
|
||||
selectedAssistant={liveAssistant}
|
||||
setAlternativeAssistant={setAlternativeAssistant}
|
||||
alternativeAssistant={alternativeAssistant}
|
||||
// end assistant stuff
|
||||
message={message}
|
||||
setMessage={setMessage}
|
||||
stopGenerating={stopGenerating}
|
||||
onSubmit={onSubmit}
|
||||
files={currentMessageFiles}
|
||||
chatState={currentSessionChatState}
|
||||
alternativeAssistant={alternativeAssistant}
|
||||
selectedAssistant={
|
||||
selectedAssistant || liveAssistant
|
||||
}
|
||||
setAlternativeAssistant={setAlternativeAssistant}
|
||||
setFiles={setCurrentMessageFiles}
|
||||
handleFileUpload={handleImageUpload}
|
||||
textAreaRef={textAreaRef}
|
||||
@ -3188,7 +3325,6 @@ export function ChatPage({
|
||||
</div>
|
||||
<FixedLogo backgroundToggled={sidebarVisible || showHistorySidebar} />
|
||||
</div>
|
||||
{/* Right Sidebar - DocumentSidebar */}
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { SourceIcon } from "@/components/SourceIcon";
|
||||
import { OnyxDocument } from "@/lib/search/interfaces";
|
||||
import { MinimalOnyxDocument, OnyxDocument } from "@/lib/search/interfaces";
|
||||
import { FiTag } from "react-icons/fi";
|
||||
import { DocumentSelector } from "./DocumentSelector";
|
||||
import { buildDocumentSummaryDisplay } from "@/components/search/DocumentDisplay";
|
||||
@ -18,7 +18,7 @@ interface DocumentDisplayProps {
|
||||
handleSelect: (documentId: string) => void;
|
||||
tokenLimitReached: boolean;
|
||||
hideSelection?: boolean;
|
||||
setPresentingDocument: Dispatch<SetStateAction<OnyxDocument | null>>;
|
||||
setPresentingDocument: Dispatch<SetStateAction<MinimalOnyxDocument | null>>;
|
||||
}
|
||||
|
||||
export function DocumentMetadataBlock({
|
||||
|
@ -1,7 +1,7 @@
import { OnyxDocument } from "@/lib/search/interfaces";
import { MinimalOnyxDocument, OnyxDocument } from "@/lib/search/interfaces";
import { ChatDocumentDisplay } from "./ChatDocumentDisplay";
import { removeDuplicateDocs } from "@/lib/documentUtils";
import { Message } from "../interfaces";
import { ChatFileType, Message } from "../interfaces";
import {
  Dispatch,
  ForwardedRef,
@ -11,9 +11,14 @@ import {
  useState,
} from "react";
import { XIcon } from "@/components/icons/icons";

import {
  FileSourceCard,
  FileSourceCardInResults,
} from "../message/SourcesDisplay";
import { useDocumentsContext } from "../my-documents/DocumentsContext";
interface DocumentResultsProps {
  agenticMessage: boolean;
  humanMessage: Message | null;
  closeSidebar: () => void;
  selectedMessage: Message | null;
  selectedDocuments: OnyxDocument[] | null;
@ -25,7 +30,7 @@ interface DocumentResultsProps {
  isOpen: boolean;
  isSharedChat?: boolean;
  modal: boolean;
  setPresentingDocument: Dispatch<SetStateAction<OnyxDocument | null>>;
  setPresentingDocument: Dispatch<SetStateAction<MinimalOnyxDocument | null>>;
  removeHeader?: boolean;
}

@ -33,6 +38,7 @@ export const DocumentResults = forwardRef<HTMLDivElement, DocumentResultsProps>(
  (
    {
      agenticMessage,
      humanMessage,
      closeSidebar,
      modal,
      selectedMessage,
@ -62,7 +68,14 @@ export const DocumentResults = forwardRef<HTMLDivElement, DocumentResultsProps>(

      return () => clearTimeout(timer);
    }, [selectedDocuments]);
    const { files: allUserFiles } = useDocumentsContext();

    const humanFileDescriptors = humanMessage?.files.filter(
      (file) => file.type == ChatFileType.USER_KNOWLEDGE
    );
    const userFiles = allUserFiles?.filter((file) =>
      humanFileDescriptors?.some((descriptor) => descriptor.id === file.file_id)
    );
    const selectedDocumentIds =
      selectedDocuments?.map((document) => document.document_id) || [];

@ -72,7 +85,6 @@ export const DocumentResults = forwardRef<HTMLDivElement, DocumentResultsProps>(
    const tokenLimitReached = selectedDocumentTokens > maxTokens - 75;

    const hasSelectedDocuments = selectedDocumentIds.length > 0;

    return (
      <>
        <div
@ -113,7 +125,27 @@ export const DocumentResults = forwardRef<HTMLDivElement, DocumentResultsProps>(
          )}

          <div className="overflow-y-auto h-fit mb-8 pb-8 sm:mx-0 flex-grow gap-y-0 default-scrollbar dark-scrollbar flex flex-col">
            {dedupedDocuments.length > 0 ? (
            {userFiles && userFiles.length > 0 ? (
              <div className=" gap-y-2 flex flex-col pt-2 mx-3">
                {userFiles?.map((file, index) => (
                  <FileSourceCardInResults
                    key={index}
                    relevantDocument={dedupedDocuments.find(
                      (doc) =>
                        doc.document_id ===
                        `FILE_CONNECTOR__${file.file_id}`
                    )}
                    document={file}
                    setPresentingDocument={() =>
                      setPresentingDocument({
                        document_id: file.document_id,
                        semantic_identifier: file.file_id || null,
                      })
                    }
                  />
                ))}
              </div>
            ) : dedupedDocuments.length > 0 ? (
              dedupedDocuments.map((document, ind) => (
                <div
                  key={document.document_id}
@ -140,9 +172,7 @@ export const DocumentResults = forwardRef<HTMLDivElement, DocumentResultsProps>(
                  />
                </div>
              ))
            ) : (
              <div className="mx-3" />
            )}
            ) : null}
          </div>
        </div>
      </div>
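In the results sidebar, each attached user file is rendered with FileSourceCardInResults, and the retrieved document backing it is found by the id convention visible above: documents ingested from the file connector appear to carry ids of the form `FILE_CONNECTOR__<file_id>`. A small hedged sketch of that lookup, with simplified stand-in types for OnyxDocument and the context's file records:

```typescript
// Sketch of the lookup used above; types are simplified assumptions.
interface RetrievedDoc {
  document_id: string;
}

interface UserFile {
  file_id: string;
}

function findBackingDocument(
  file: UserFile,
  dedupedDocuments: RetrievedDoc[]
): RetrievedDoc | undefined {
  // Match the user file to its indexed counterpart via the id convention.
  return dedupedDocuments.find(
    (doc) => doc.document_id === `FILE_CONNECTOR__${file.file_id}`
  );
}
```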
@ -8,7 +8,8 @@ export async function createFolder(folderName: string): Promise<number> {
    body: JSON.stringify({ folder_name: folderName }),
  });
  if (!response.ok) {
    throw new Error("Failed to create folder");
    const errorData = await response.json();
    throw new Error(errorData.detail || "Failed to create folder");
  }
  const data = await response.json();
  return data;
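This change surfaces the backend's own error message: on a non-OK response the body is parsed and its `detail` field (the conventional FastAPI error field) becomes the thrown Error's message instead of a generic string. A hedged usage sketch of a caller that relies on this; the import path is only implied by the file's location and may differ:

```typescript
// Illustrative caller, assuming createFolder rejects with the backend's
// `detail` message as shown above. The import path is an assumption.
import { createFolder } from "./documentsService";

async function onCreateFolderClick(name: string): Promise<number | null> {
  try {
    return await createFolder(name);
  } catch (error) {
    // error.message now carries the server-provided detail when present
    // (e.g. a duplicate-name rejection) rather than a generic string.
    const message =
      error instanceof Error ? error.message : "Failed to create folder";
    console.error(message);
    return null;
  }
}
```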
@ -27,7 +27,7 @@ import { Hoverable } from "@/components/Hoverable";
import { ChatState } from "../types";
import UnconfiguredProviderText from "@/components/chat/UnconfiguredProviderText";
import { useAssistants } from "@/components/context/AssistantsContext";
import { CalendarIcon, TagIcon, XIcon } from "lucide-react";
import { CalendarIcon, TagIcon, XIcon, FolderIcon } from "lucide-react";
import { FilterPopup } from "@/components/search/filtering/FilterPopup";
import { DocumentSet, Tag } from "@/lib/types";
import { SourceIcon } from "@/components/SourceIcon";
@ -35,11 +35,13 @@ import { getFormattedDateRangeString } from "@/lib/dateUtils";
import { truncateString } from "@/lib/utils";
import { buildImgUrl } from "../files/images/utils";
import { useUser } from "@/components/user/UserProvider";
import { useDocumentSelection } from "../useDocumentSelection";
import { AgenticToggle } from "./AgenticToggle";
import { SettingsContext } from "@/components/settings/SettingsProvider";
import { LoadingIndicator } from "react-select/dist/declarations/src/components/indicators";
import { FidgetSpinner } from "react-loader-spinner";
import { LoadingAnimation } from "@/components/Loading";
import { useDocumentsContext } from "../my-documents/DocumentsContext";

const MAX_INPUT_HEIGHT = 200;
export const SourceChip2 = ({
@ -172,6 +174,7 @@ export const SourceChip = ({
);

interface ChatInputBarProps {
  toggleDocSelection: () => void;
  removeDocs: () => void;
  showConfigureAPIKey: () => void;
  selectedDocuments: OnyxDocument[];
@ -186,7 +189,6 @@ interface ChatInputBarProps {
  selectedAssistant: Persona;
  setAlternativeAssistant: (alternativeAssistant: Persona | null) => void;
  toggleDocumentSidebar: () => void;
  files: FileDescriptor[];
  setFiles: (files: FileDescriptor[]) => void;
  handleFileUpload: (files: File[]) => void;
  textAreaRef: React.RefObject<HTMLTextAreaElement>;
@ -200,6 +202,7 @@ interface ChatInputBarProps {
}

export function ChatInputBar({
  toggleDocSelection,
  retrievalEnabled,
  removeDocs,
  toggleDocumentSidebar,
@ -216,7 +219,6 @@ export function ChatInputBar({
  selectedAssistant,
  setAlternativeAssistant,

  files,
  setFiles,
  handleFileUpload,
  textAreaRef,
@ -229,6 +231,15 @@ export function ChatInputBar({
  setProSearchEnabled,
}: ChatInputBarProps) {
  const { user } = useUser();
  const {
    selectedFiles,
    selectedFolders,
    removeSelectedFile,
    removeSelectedFolder,
    currentMessageFiles,
    setCurrentMessageFiles,
  } = useDocumentsContext();

  const settings = useContext(SettingsContext);
  useEffect(() => {
    const textarea = textAreaRef.current;
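ChatInputBar now reads its file state from the shared DocumentsContext rather than `files`/`setFiles` props. A hedged sketch of what that context value might look like, inferred only from the fields destructured in this diff; the real provider in `../my-documents/DocumentsContext` likely exposes more, and the field types here are assumptions:

```typescript
// Hypothetical DocumentsContext value, reconstructed from the destructurings
// in this PR. All shapes below are assumptions for illustration only.
interface FileResponse {
  id: number;
  name: string;
  file_id: string;
  document_id: string;
}

interface FolderResponse {
  id: number;
  name: string;
}

interface FileDescriptor {
  id: string;
  type: string;
  name?: string;
}

interface DocumentsContextValue {
  files: FileResponse[]; // all of the user's knowledge files
  selectedFiles: FileResponse[]; // files attached to the next message
  selectedFolders: FolderResponse[]; // folders attached to the next message
  removeSelectedFile: (file: FileResponse) => void;
  removeSelectedFolder: (folder: FolderResponse) => void;
  currentMessageFiles: FileDescriptor[]; // ad-hoc uploads for this message
  setCurrentMessageFiles: (files: FileDescriptor[]) => void;
}
```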
@ -628,7 +639,9 @@ export function ChatInputBar({
  />

  {(selectedDocuments.length > 0 ||
    files.length > 0 ||
    selectedFiles.length > 0 ||
    selectedFolders.length > 0 ||
    currentMessageFiles.length > 0 ||
    filterManager.timeRange ||
    filterManager.selectedDocumentSets.length > 0 ||
    filterManager.selectedTags.length > 0 ||
@ -651,6 +664,22 @@ export function ChatInputBar({
      />
    ))}

    {selectedFiles.map((file) => (
      <SourceChip
        key={file.id}
        icon={<FileIcon size={16} />}
        title={file.name}
        onRemove={() => removeSelectedFile(file)}
      />
    ))}
    {selectedFolders.map((folder) => (
      <SourceChip
        key={folder.id}
        icon={<FolderIcon size={16} />}
        title={folder.name}
        onRemove={() => removeSelectedFolder(folder)}
      />
    ))}
    {filterManager.timeRange && (
      <SourceChip
        truncateTitle={false}
@ -680,7 +709,6 @@ export function ChatInputBar({
        }}
      />
    ))}

    {filterManager.selectedSources.length > 0 &&
      filterManager.selectedSources.map((source, index) => (
        <SourceChip
@ -701,7 +729,6 @@ export function ChatInputBar({
        }}
      />
    ))}

    {selectedDocuments.length > 0 && (
      <SourceChip
        key="selected-documents"
@ -713,8 +740,7 @@ export function ChatInputBar({
        onRemove={removeDocs}
      />
    )}

    {files.map((file, index) =>
    {currentMessageFiles.map((file, index) =>
      file.type === ChatFileType.IMAGE ? (
        <SourceChip
          key={`file-${index}`}
@ -730,8 +756,8 @@ export function ChatInputBar({
          }
          title={file.name || "File" + file.id}
          onRemove={() => {
            setFiles(
              files.filter(
            setCurrentMessageFiles(
              currentMessageFiles.filter(
                (fileInFilter) => fileInFilter.id !== file.id
              )
            );
@ -743,8 +769,8 @@ export function ChatInputBar({
          icon={<FileIcon className="text-red-500" size={16} />}
          title={file.name || "File"}
          onRemove={() => {
            setFiles(
              files.filter(
            setCurrentMessageFiles(
              currentMessageFiles.filter(
                (fileInFilter) => fileInFilter.id !== file.id
              )
            );
@ -763,20 +789,9 @@ export function ChatInputBar({
  name="File"
  Icon={FiPlusCircle}
  onClick={() => {
    const input = document.createElement("input");
    input.type = "file";
    input.multiple = true;
    input.onchange = (event: any) => {
      const files = Array.from(
        event?.target?.files || []
      ) as File[];
      if (files.length > 0) {
        handleFileUpload(files);
      }
    };
    input.click();
    toggleDocSelection();
  }}
  tooltipContent={"Upload files"}
  tooltipContent={"Upload files and attach user files"}
/>

<LLMPopover
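The final hunk swaps the inline hidden `<input type="file">` picker for `toggleDocSelection()`, so the "File" action now opens the shared selection flow where uploads and existing user files both live. For reference, the removed pattern reshaped as a small promise-based helper; this is a sketch of the old approach, not code from the PR:

```typescript
// The pattern being removed above, as a reusable helper: programmatically
// open the browser file picker and resolve with the chosen files.
// Note: if the user cancels the dialog, onchange never fires and the
// promise stays pending, which is one reason a dedicated picker UI is nicer.
function pickFiles(): Promise<File[]> {
  return new Promise((resolve) => {
    const input = document.createElement("input");
    input.type = "file";
    input.multiple = true;
    input.onchange = () => resolve(Array.from(input.files ?? []));
    input.click();
  });
}

// Roughly how the removed onClick behaved:
//   pickFiles().then((files) => {
//     if (files.length > 0) handleFileUpload(files);
//   });
```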