2024-12-19 09:43:12 -08:00

153 lines
5.2 KiB
Python

from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from sqlalchemy.orm import Session
from onyx.auth.users import api_key_dep
from onyx.configs.constants import DEFAULT_CC_PAIR_ID
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import get_documents_by_cc_pair
from onyx.db.document import get_ingestion_documents
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.models import User
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.factory import get_default_document_index
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
from onyx.server.onyx_api.models import DocMinimalInfo
from onyx.server.onyx_api.models import IngestionDocument
from onyx.server.onyx_api.models import IngestionResult
from onyx.utils.logger import setup_logger
# Module-level logger, obtained from the project's logging helper.
logger = setup_logger()
# The router is mounted under /onyx-api rather than /api to avoid confusion
# with the nginx "api" path routing.
router = APIRouter(prefix="/onyx-api")
@router.get("/connector-docs/{cc_pair_id}")
def get_docs_by_connector_credential_pair(
    cc_pair_id: int,
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
) -> list[DocMinimalInfo]:
    # List the documents associated with one connector-credential pair.
    #
    # cc_pair_id: taken from the URL path.
    # _:          result of the `api_key_dep` dependency; resolved for every
    #             request but otherwise unused here.
    # db_session: SQLAlchemy session supplied by the `get_session` dependency.
    #
    # Returns one DocMinimalInfo (id, semantic id, link) per document returned
    # by `get_documents_by_cc_pair`, in the order that helper yields them.
    #
    # (Kept as comments rather than a docstring so the description FastAPI
    # publishes for this route is unchanged.)
    summaries: list[DocMinimalInfo] = []
    for record in get_documents_by_cc_pair(
        cc_pair_id=cc_pair_id, db_session=db_session
    ):
        summaries.append(
            DocMinimalInfo(
                document_id=record.id,
                semantic_id=record.semantic_id,
                link=record.link,
            )
        )
    return summaries
@router.get("/ingestion")
def get_ingestion_docs(
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
) -> list[DocMinimalInfo]:
    # List the documents returned by `get_ingestion_documents`.
    #
    # _:          result of the `api_key_dep` dependency; resolved for every
    #             request but otherwise unused here.
    # db_session: SQLAlchemy session supplied by the `get_session` dependency.
    #
    # Returns one DocMinimalInfo (id, semantic id, link) per document, in the
    # order the helper yields them.
    #
    # (Kept as comments rather than a docstring so the description FastAPI
    # publishes for this route is unchanged.)
    ingestion_records = get_ingestion_documents(db_session)

    minimal_infos: list[DocMinimalInfo] = []
    for record in ingestion_records:
        info = DocMinimalInfo(
            document_id=record.id,
            semantic_id=record.semantic_id,
            link=record.link,
        )
        minimal_infos.append(info)
    return minimal_infos
@router.post("/ingestion")
def upsert_ingestion_doc(
    doc_info: IngestionDocument,
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
    tenant_id: str = Depends(get_current_tenant_id),
) -> IngestionResult:
    # Index (or re-index) a single document submitted in the request body.
    #
    # doc_info:   request payload; carries the document and an optional
    #             cc_pair_id.
    # _:          result of the `api_key_dep` dependency; resolved for every
    #             request but otherwise unused here.
    # db_session: SQLAlchemy session supplied by the `get_session` dependency.
    # tenant_id:  supplied by `get_current_tenant_id` and forwarded to the
    #             indexing pipelines.
    #
    # Returns an IngestionResult with the document id and `already_existed`,
    # which is derived from the *primary* pipeline run only.
    #
    # Raises HTTPException(400) when the connector-credential pair cannot be
    # found, and RuntimeError when a secondary index name exists but no
    # secondary search settings are configured.
    #
    # (Kept as comments rather than a docstring so the description FastAPI
    # publishes for this route is unchanged.)
    doc_info.document.from_ingestion_api = True
    document = Document.from_base(doc_info.document)

    # TODO once the frontend is updated with this enum, remove this logic
    if document.source == DocumentSource.INGESTION_API:
        document.source = DocumentSource.FILE

    # Any falsy cc_pair_id (None, 0) falls back to the default pair.
    cc_pair = get_connector_credential_pair_from_id(
        cc_pair_id=doc_info.cc_pair_id or DEFAULT_CC_PAIR_ID, db_session=db_session
    )
    if cc_pair is None:
        raise HTTPException(
            status_code=400, detail="Connector-Credential Pair specified does not exist"
        )

    # Need to index for both the primary and secondary index if possible
    curr_ind_name, sec_ind_name = get_both_index_names(db_session)

    curr_doc_index = get_default_document_index(
        primary_index_name=curr_ind_name, secondary_index_name=None
    )

    search_settings = get_current_search_settings(db_session)

    index_embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
        search_settings=search_settings
    )

    indexing_pipeline = build_indexing_pipeline(
        embedder=index_embedding_model,
        document_index=curr_doc_index,
        ignore_time_skip=True,
        db_session=db_session,
        tenant_id=tenant_id,
    )

    # The first element of the result is truthy when the pipeline reports a
    # new document (presumably a count of new docs -- confirm against
    # build_indexing_pipeline); the chunk count is not needed here.
    new_doc, __chunk_count = indexing_pipeline(
        document_batch=[document],
        index_attempt_metadata=IndexAttemptMetadata(
            connector_id=cc_pair.connector_id,
            credential_id=cc_pair.credential_id,
        ),
    )

    # If there's a secondary index being built, index the doc but don't use it for return here
    if sec_ind_name:
        # Target the secondary index itself. Previously this passed
        # curr_ind_name, which sent chunks embedded with the secondary
        # search settings into the current index and left the secondary
        # index without the document.
        sec_doc_index = get_default_document_index(
            primary_index_name=sec_ind_name, secondary_index_name=None
        )

        sec_search_settings = get_secondary_search_settings(db_session)

        if sec_search_settings is None:
            # Should not ever happen
            raise RuntimeError(
                "Secondary index exists but no search settings configured"
            )

        new_index_embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
            search_settings=sec_search_settings
        )

        sec_ind_pipeline = build_indexing_pipeline(
            embedder=new_index_embedding_model,
            document_index=sec_doc_index,
            ignore_time_skip=True,
            db_session=db_session,
            tenant_id=tenant_id,
        )

        # A fresh metadata object is built for this run rather than reusing
        # the primary run's instance.
        sec_ind_pipeline(
            document_batch=[document],
            index_attempt_metadata=IndexAttemptMetadata(
                connector_id=cc_pair.connector_id,
                credential_id=cc_pair.credential_id,
            ),
        )

    return IngestionResult(document_id=document.id, already_existed=not bool(new_doc))