rkuo-danswer 4fe99d05fd
add timings for syncing (#3798)
* add timings for syncing

* add more logging

* more debugging

* refactor multipass/db check out of VespaIndex

* circular imports?

* more debugging

* add logs

* various improvements

* additional logs to narrow down issue

* use global httpx pool for the main vespa flows in celery. Use in more places eventually.

* cleanup debug logging, etc

* remove debug logging

* this should use the secondary index

* mypy

* missed some logging

* review fixes

* refactor get_default_document_index to use search settings

* more missed logging

* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
2025-01-29 23:24:44 +00:00
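One of the commit notes above mentions moving the main Vespa flows in Celery onto a global httpx pool. As a rough illustration of that idea only (the helper name and limit values below are invented for this sketch, not Onyx's actual code), a shared module-level client might look like this:

import httpx

# Reuse one client so all Vespa calls share a single connection pool instead of
# opening a fresh connection per request. Names and limits are illustrative.
_VESPA_HTTP_CLIENT: httpx.Client | None = None


def get_vespa_http_client() -> httpx.Client:
    global _VESPA_HTTP_CLIENT
    if _VESPA_HTTP_CLIENT is None:
        _VESPA_HTTP_CLIENT = httpx.Client(
            timeout=30.0,
            limits=httpx.Limits(max_connections=50, max_keepalive_connections=20),
        )
    return _VESPA_HTTP_CLIENT

Celery tasks would then call the shared accessor rather than constructing a new client for every request.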

107 lines
3.6 KiB
Python
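"""Read-only endpoints for inspecting how a document is indexed: its overall
size (chunk and token counts) and the content of individual chunks."""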

from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi import Query
from sqlalchemy.orm import Session

from onyx.auth.users import current_user
from onyx.context.search.models import IndexFilters
from onyx.context.search.preprocessing.access_filters import (
    build_access_filters_for_user,
)
from onyx.db.engine import get_session
from onyx.db.models import User
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.natural_language_processing.utils import get_tokenizer
from onyx.prompts.prompt_utils import build_doc_context_str
from onyx.server.documents.models import ChunkInfo
from onyx.server.documents.models import DocumentInfo

router = APIRouter(prefix="/document")
# The document ID is passed as a query parameter rather than a path parameter,
# since FastAPI would interpret URL-style document IDs as additional path segments.
@router.get("/document-size-info")
def get_document_info(
document_id: str = Query(...),
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> DocumentInfo:
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(search_settings, None)
user_acl_filters = build_access_filters_for_user(user, db_session)
inference_chunks = document_index.id_based_retrieval(
chunk_requests=[VespaChunkRequest(document_id=document_id)],
filters=IndexFilters(access_control_list=user_acl_filters),
)
if not inference_chunks:
raise HTTPException(status_code=404, detail="Document not found")
contents = [chunk.content for chunk in inference_chunks]
combined_contents = "\n".join(contents)
# get actual document context used for LLM
first_chunk = inference_chunks[0]
tokenizer_encode = get_tokenizer(
provider_type=search_settings.provider_type,
model_name=search_settings.model_name,
).encode
full_context_str = build_doc_context_str(
semantic_identifier=first_chunk.semantic_identifier,
source_type=first_chunk.source_type,
content=combined_contents,
metadata_dict=first_chunk.metadata,
updated_at=first_chunk.updated_at,
ind=0,
)
return DocumentInfo(
num_chunks=len(inference_chunks),
num_tokens=len(tokenizer_encode(full_context_str)),
)
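

# Example request (hypothetical id; the full API prefix depends on how this router
# is mounted in the app):
#   GET /document/document-size-info?document_id=<document-id>
# returns the document's chunk count and the token count of its full LLM context string.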
@router.get("/chunk-info")
def get_chunk_info(
document_id: str = Query(...),
chunk_id: int = Query(...),
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> ChunkInfo:
search_settings = get_current_search_settings(db_session)
document_index = get_default_document_index(search_settings, None)
user_acl_filters = build_access_filters_for_user(user, db_session)
chunk_request = VespaChunkRequest(
document_id=document_id,
min_chunk_ind=chunk_id,
max_chunk_ind=chunk_id,
)
inference_chunks = document_index.id_based_retrieval(
chunk_requests=[chunk_request],
filters=IndexFilters(access_control_list=user_acl_filters),
batch_retrieval=True,
)
if not inference_chunks:
raise HTTPException(status_code=404, detail="Chunk not found")
chunk_content = inference_chunks[0].content
tokenizer_encode = get_tokenizer(
provider_type=search_settings.provider_type,
model_name=search_settings.model_name,
).encode
return ChunkInfo(
content=chunk_content, num_tokens=len(tokenizer_encode(chunk_content))
)
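

# Example request (hypothetical ids; the full API prefix depends on how this router
# is mounted in the app):
#   GET /document/chunk-info?document_id=<document-id>&chunk_id=0
# returns the chunk's content along with its token count.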