Mirror of https://github.com/danswer-ai/danswer.git
* add timings for syncing
* add more logging
* more debugging
* refactor multipass/db check out of VespaIndex
* circular imports?
* more debugging
* add logs
* various improvements
* additional logs to narrow down issue
* use global httpx pool for the main vespa flows in celery. Use in more places eventually.
* cleanup debug logging, etc
* remove debug logging
* this should use the secondary index
* mypy
* missed some logging
* review fixes
* refactor get_default_document_index to use search settings
* more missed logging
* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
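The "global httpx pool" item above refers to replacing per-call httpx.Client instances with one shared, module-level client so Vespa requests reuse a connection pool. A minimal sketch of that pattern, with hypothetical names (get_shared_httpx_client and the pool limits are illustrative assumptions, not the repo's actual helper):

import httpx

_shared_client: httpx.Client | None = None


def get_shared_httpx_client() -> httpx.Client:
    # Lazily create one process-wide client; reusing it keeps connections
    # alive across calls instead of re-opening them per request.
    global _shared_client
    if _shared_client is None:
        _shared_client = httpx.Client(
            limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
            timeout=httpx.Timeout(10.0),
        )
    return _shared_client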
107 lines · 3.6 KiB · Python
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi import Query
from sqlalchemy.orm import Session

from onyx.auth.users import current_user
from onyx.context.search.models import IndexFilters
from onyx.context.search.preprocessing.access_filters import (
    build_access_filters_for_user,
)
from onyx.db.engine import get_session
from onyx.db.models import User
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.natural_language_processing.utils import get_tokenizer
from onyx.prompts.prompt_utils import build_doc_context_str
from onyx.server.documents.models import ChunkInfo
from onyx.server.documents.models import DocumentInfo


router = APIRouter(prefix="/document")


# Have to use a query parameter, as FastAPI interprets URL-type document_ids
# as a different path
@router.get("/document-size-info")
def get_document_info(
    document_id: str = Query(...),
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> DocumentInfo:
    """Return the chunk count and LLM token count for a single document."""
    search_settings = get_current_search_settings(db_session)
    document_index = get_default_document_index(search_settings, None)

    # Only retrieve chunks the requesting user is allowed to see
    user_acl_filters = build_access_filters_for_user(user, db_session)
    inference_chunks = document_index.id_based_retrieval(
        chunk_requests=[VespaChunkRequest(document_id=document_id)],
        filters=IndexFilters(access_control_list=user_acl_filters),
    )

    if not inference_chunks:
        raise HTTPException(status_code=404, detail="Document not found")

    contents = [chunk.content for chunk in inference_chunks]
    combined_contents = "\n".join(contents)

    # Rebuild the actual document context string used for the LLM, so the
    # token count reflects what the model would consume
    first_chunk = inference_chunks[0]
    tokenizer_encode = get_tokenizer(
        provider_type=search_settings.provider_type,
        model_name=search_settings.model_name,
    ).encode
    full_context_str = build_doc_context_str(
        semantic_identifier=first_chunk.semantic_identifier,
        source_type=first_chunk.source_type,
        content=combined_contents,
        metadata_dict=first_chunk.metadata,
        updated_at=first_chunk.updated_at,
        ind=0,
    )

    return DocumentInfo(
        num_chunks=len(inference_chunks),
        num_tokens=len(tokenizer_encode(full_context_str)),
    )
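

# Illustrative usage note: because document_id is a query parameter, URL-style
# ids must be percent-encoded by the caller, e.g.
#   GET /document/document-size-info?document_id=https%3A%2F%2Fexample.com%2Fpage
# The response carries num_chunks and num_tokens, or 404 if the user's ACL
# filters exclude every chunk of the document.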


@router.get("/chunk-info")
def get_chunk_info(
    document_id: str = Query(...),
    chunk_id: int = Query(...),
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> ChunkInfo:
    """Return the content and token count of one chunk of a document."""
    search_settings = get_current_search_settings(db_session)
    document_index = get_default_document_index(search_settings, None)

    # Only retrieve chunks the requesting user is allowed to see
    user_acl_filters = build_access_filters_for_user(user, db_session)
    chunk_request = VespaChunkRequest(
        document_id=document_id,
        min_chunk_ind=chunk_id,
        max_chunk_ind=chunk_id,
    )

    inference_chunks = document_index.id_based_retrieval(
        chunk_requests=[chunk_request],
        filters=IndexFilters(access_control_list=user_acl_filters),
        batch_retrieval=True,
    )

    if not inference_chunks:
        raise HTTPException(status_code=404, detail="Chunk not found")

    chunk_content = inference_chunks[0].content

    tokenizer_encode = get_tokenizer(
        provider_type=search_settings.provider_type,
        model_name=search_settings.model_name,
    ).encode

    return ChunkInfo(
        content=chunk_content, num_tokens=len(tokenizer_encode(chunk_content))
    )
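For quick manual testing, here is a minimal client-side sketch of calling both routes. The base URL, the assumption that the router is mounted without an extra prefix, and the placeholder auth cookie are all assumptions about the deployment, not part of this file:

import httpx

# Assumed deployment details -- adjust host and auth for your setup.
client = httpx.Client(
    base_url="http://localhost:8080",
    cookies={"fastapiusersauth": "<session-token>"},  # placeholder auth
)

# Chunk/token counts for a whole document; httpx percent-encodes the
# URL-style document_id query parameter automatically.
doc_info = client.get(
    "/document/document-size-info",
    params={"document_id": "https://example.com/some-page"},
)
doc_info.raise_for_status()
print(doc_info.json())  # {"num_chunks": ..., "num_tokens": ...}

# Content and token count for a single chunk.
chunk_info = client.get(
    "/document/chunk-info",
    params={"document_id": "https://example.com/some-page", "chunk_id": 0},
)
chunk_info.raise_for_status()
print(chunk_info.json())  # {"content": "...", "num_tokens": ...}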