add user files (#4152)

This commit is contained in:
pablonyx
2025-03-31 14:06:59 -07:00
committed by Weves
parent ccd372cc4a
commit 3a3b2a2f8d
166 changed files with 12892 additions and 1048 deletions

View File

@@ -5,17 +5,19 @@ Usage:
python vespa_debug_tool.py --action <action> [options]
Actions:
config : Print Vespa configuration
connect : Check Vespa connectivity
list_docs : List documents
search : Search documents
update : Update a document
delete : Delete a document
get_acls : Get document ACLs
config : Print Vespa configuration
connect : Check Vespa connectivity
list_docs : List documents
list_connector : List documents for a specific connector-credential pair
search : Search documents
update : Update a document
delete : Delete a document
get_acls : Get document ACLs
Options:
--tenant-id : Tenant ID
--connector-id : Connector ID
--cc-pair-id : Connector-Credential Pair ID
--n : Number of documents (default 10)
--query : Search query
--doc-id : Document ID
@@ -23,6 +25,7 @@ Options:
Example:
python vespa_debug_tool.py --action list_docs --tenant-id my_tenant --connector-id 1 --n 5
python vespa_debug_tool.py --action list_connector --tenant-id my_tenant --cc-pair-id 1 --n 5
"""
import argparse
import json
@@ -59,7 +62,6 @@ from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import VESPA_APP_CONTAINER_URL
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
from onyx.utils.logger import setup_logger
@@ -108,8 +110,8 @@ def build_vespa_filters(
if not include_hidden:
filter_str += f"AND !({HIDDEN}=true) "
if filters.tenant_id and MULTI_TENANT:
filter_str += f'AND ({TENANT_ID} contains "{filters.tenant_id}") '
# if filters.tenant_id and MULTI_TENANT:
# filter_str += f'AND ({TENANT_ID} contains "{filters.tenant_id}") '
if filters.access_control_list is not None:
acl_str = _build_or_filters(ACCESS_CONTROL_LIST, filters.access_control_list)
@@ -269,8 +271,8 @@ def search_for_document(
if document_id is not None:
conditions.append(f'document_id contains "{document_id}"')
if tenant_id is not None:
conditions.append(f'tenant_id contains "{tenant_id}"')
# if tenant_id is not None:
# conditions.append(f'tenant_id contains "{tenant_id}"')
if conditions:
yql_query += " where " + " and ".join(conditions)
@@ -336,8 +338,8 @@ def list_documents(n: int = 10, tenant_id: Optional[str] = None) -> None:
# List documents from any source, filtered by tenant if provided.
logger.info(f"Listing up to {n} documents for tenant={tenant_id or 'ALL'}")
yql = "select * from sources * where true"
if tenant_id:
yql += f" and tenant_id contains '{tenant_id}'"
# if tenant_id:
# yql += f" and tenant_id contains '{tenant_id}'"
documents = query_vespa(yql, tenant_id=tenant_id, limit=n)
print(f"Total documents found: {len(documents)}")
logger.info(f"Total documents found: {len(documents)}")
@@ -444,12 +446,15 @@ def get_document_acls(
response = vespa_client.get(document_url)
if response.status_code == 200:
fields = response.json().get("fields", {})
document_id = fields.get("document_id") or fields.get(
"documentid", "Unknown"
)
acls = fields.get("access_control_list", {})
title = fields.get("title", "")
source_type = fields.get("source_type", "")
doc_sets = fields.get("document_sets", [])
user_file = fields.get("user_file", None)
source_links_raw = fields.get("source_links", "{}")
try:
source_links = json.loads(source_links_raw)
@@ -462,6 +467,8 @@ def get_document_acls(
print(f"Source Links: {source_links}")
print(f"Title: {title}")
print(f"Source Type: {source_type}")
print(f"Document Sets: {doc_sets}")
print(f"User File: {user_file}")
if MULTI_TENANT:
print(f"Tenant ID: {fields.get('tenant_id', 'N/A')}")
print("-" * 80)
@@ -576,6 +583,90 @@ class VespaDebugging:
# List documents for a tenant.
list_documents(n, self.tenant_id)
def list_connector(self, cc_pair_id: int, n: int = 10) -> None:
    """List documents indexed for a connector-credential pair in this tenant.

    Resolves the cc_pair to its connector, collects the distinct document IDs
    recorded for that connector, then fetches each document's chunks from
    Vespa and prints every field except embeddings, truncating values to
    50 characters.

    Args:
        cc_pair_id: ID of the ConnectorCredentialPair to inspect.
        n: Maximum number of documents to retrieve details for (default 10).
    """
    logger.info(
        f"Listing documents for tenant={self.tenant_id}, cc_pair_id={cc_pair_id}"
    )
    # Get document IDs for this connector-credential pair
    with get_session_with_tenant(tenant_id=self.tenant_id) as session:
        # First get the connector_id from the cc_pair_id
        cc_pair = (
            session.query(ConnectorCredentialPair)
            .filter(ConnectorCredentialPair.id == cc_pair_id)
            .first()
        )
        if not cc_pair:
            print(f"No connector-credential pair found with ID {cc_pair_id}")
            return
        connector_id = cc_pair.connector_id
        # Now get document IDs for this connector.
        # NOTE(review): filters on connector_id only, not credential_id —
        # presumably document rows are keyed per connector; confirm against
        # the DocumentByConnectorCredentialPair model.
        doc_ids_data = (
            session.query(DocumentByConnectorCredentialPair.id)
            .filter(DocumentByConnectorCredentialPair.connector_id == connector_id)
            .distinct()
            .all()
        )
    # Each row is a 1-tuple; unwrap to plain IDs.
    doc_ids = [doc_id[0] for doc_id in doc_ids_data]
    if not doc_ids:
        print(f"No documents found for connector-credential pair ID {cc_pair_id}")
        return
    print(
        f"Found {len(doc_ids)} documents for connector-credential pair ID {cc_pair_id}"
    )
    # Limit to the first n document IDs
    target_doc_ids = doc_ids[:n]
    print(f"Retrieving details for first {len(target_doc_ids)} documents")
    # Search for each document in Vespa
    for doc_id in target_doc_ids:
        docs = search_for_document(self.index_name, doc_id, self.tenant_id)
        if not docs:
            print(f"No chunks found in Vespa for document ID: {doc_id}")
            continue
        print(f"Document ID: {doc_id}")
        print(f"Found {len(docs)} chunks in Vespa")
        # Print each chunk with all fields except embeddings
        for i, doc in enumerate(docs, start=1):
            print(f" Chunk {i}:")
            fields = doc.get("fields", {})
            for field_name, field_value in sorted(fields.items()):
                # Skip embedding fields
                if "embedding" in field_name:
                    continue
                # Dicts, lists, and scalars all render the same way here
                # (stringify then truncate), so one path suffices — the
                # original duplicated this logic across two identical
                # branches.
                str_value = str(field_value)
                truncated = (
                    str_value[:50] + "..." if len(str_value) > 50 else str_value
                )
                print(f" {field_name}: {truncated}")
            print("-" * 40)  # Separator between chunks
        print("=" * 80)  # Separator between documents
def compare_chunk_count(self, document_id: str) -> tuple[int, int]:
docs = search_for_document(self.index_name, document_id, max_hits=None)
number_of_chunks_we_think_exist = get_number_of_chunks_we_think_exist(
@@ -770,6 +861,7 @@ def main() -> None:
"config",
"connect",
"list_docs",
"list_connector",
"search",
"update",
"delete",
@@ -781,6 +873,7 @@ def main() -> None:
)
parser.add_argument("--tenant-id", help="Tenant ID")
parser.add_argument("--connector-id", type=int, help="Connector ID")
parser.add_argument("--cc-pair-id", type=int, help="Connector-Credential Pair ID")
parser.add_argument(
"--n", type=int, default=10, help="Number of documents to retrieve"
)
@@ -809,6 +902,10 @@ def main() -> None:
vespa_debug.check_connectivity()
elif args.action == "list_docs":
vespa_debug.list_documents(args.n)
elif args.action == "list_connector":
if args.cc_pair_id is None:
parser.error("--cc-pair-id is required for list_connector action")
vespa_debug.list_connector(args.cc_pair_id, args.n)
elif args.action == "search":
if not args.query or args.connector_id is None:
parser.error("--query and --connector-id are required for search action")
@@ -825,9 +922,9 @@ def main() -> None:
parser.error("--doc-id and --connector-id are required for delete action")
vespa_debug.delete_document(args.connector_id, args.doc_id)
elif args.action == "get_acls":
if args.connector_id is None:
parser.error("--connector-id is required for get_acls action")
vespa_debug.acls(args.connector_id, args.n)
if args.cc_pair_id is None:
parser.error("--cc-pair-id is required for get_acls action")
vespa_debug.acls(args.cc_pair_id, args.n)
if __name__ == "__main__":

View File

@@ -72,6 +72,19 @@ def run_jobs() -> None:
"--queues=connector_indexing",
]
cmd_worker_user_files_indexing = [
"celery",
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=user_files_indexing@%n",
"--queues=user_files_indexing",
]
cmd_worker_monitoring = [
"celery",
"-A",
@@ -110,6 +123,13 @@ def run_jobs() -> None:
cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
worker_user_files_indexing_process = subprocess.Popen(
cmd_worker_user_files_indexing,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
worker_monitoring_process = subprocess.Popen(
cmd_worker_monitoring,
stdout=subprocess.PIPE,
@@ -134,6 +154,10 @@ def run_jobs() -> None:
worker_indexing_thread = threading.Thread(
target=monitor_process, args=("INDEX", worker_indexing_process)
)
worker_user_files_indexing_thread = threading.Thread(
target=monitor_process,
args=("USER_FILES_INDEX", worker_user_files_indexing_process),
)
worker_monitoring_thread = threading.Thread(
target=monitor_process, args=("MONITORING", worker_monitoring_process)
)
@@ -143,6 +167,7 @@ def run_jobs() -> None:
worker_light_thread.start()
worker_heavy_thread.start()
worker_indexing_thread.start()
worker_user_files_indexing_thread.start()
worker_monitoring_thread.start()
beat_thread.start()
@@ -150,6 +175,7 @@ def run_jobs() -> None:
worker_light_thread.join()
worker_heavy_thread.join()
worker_indexing_thread.join()
worker_user_files_indexing_thread.join()
worker_monitoring_thread.join()
beat_thread.join()

View File

@@ -93,6 +93,8 @@ def generate_dummy_chunk(
return DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=chunk,
user_file=None,
user_folder=None,
access=DocumentAccess.build(
user_emails=user_emails,
user_groups=user_groups,