mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-19 16:20:05 +02:00
153 lines
5.2 KiB
Python
153 lines
5.2 KiB
Python
from fastapi import APIRouter
|
|
from fastapi import Depends
|
|
from fastapi import HTTPException
|
|
from sqlalchemy.orm import Session
|
|
|
|
from onyx.auth.users import api_key_dep
|
|
from onyx.configs.constants import DEFAULT_CC_PAIR_ID
|
|
from onyx.configs.constants import DocumentSource
|
|
from onyx.connectors.models import Document
|
|
from onyx.connectors.models import IndexAttemptMetadata
|
|
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
|
from onyx.db.document import get_documents_by_cc_pair
|
|
from onyx.db.document import get_ingestion_documents
|
|
from onyx.db.engine import get_current_tenant_id
|
|
from onyx.db.engine import get_session
|
|
from onyx.db.models import User
|
|
from onyx.db.search_settings import get_current_search_settings
|
|
from onyx.db.search_settings import get_secondary_search_settings
|
|
from onyx.document_index.document_index_utils import get_both_index_names
|
|
from onyx.document_index.factory import get_default_document_index
|
|
from onyx.indexing.embedder import DefaultIndexingEmbedder
|
|
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
|
|
from onyx.server.onyx_api.models import DocMinimalInfo
|
|
from onyx.server.onyx_api.models import IngestionDocument
|
|
from onyx.server.onyx_api.models import IngestionResult
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
# Module-level logger created through the project's logging helper.
logger = setup_logger()

# Every route declared on this router is served under the /onyx-api prefix.
# /api is deliberately not used, to avoid confusion with nginx api path routing.
router = APIRouter(prefix="/onyx-api")
|
|
|
|
|
|
@router.get("/connector-docs/{cc_pair_id}")
def get_docs_by_connector_credential_pair(
    cc_pair_id: int,
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
) -> list[DocMinimalInfo]:
    # List the documents that get_documents_by_cc_pair returns for the given
    # connector-credential pair id, reduced to id / semantic id / link.
    #
    # The api_key_dep dependency is resolved for every request; its result is
    # intentionally unused (hence the `_` parameter name).
    summaries: list[DocMinimalInfo] = []
    for record in get_documents_by_cc_pair(
        cc_pair_id=cc_pair_id, db_session=db_session
    ):
        summaries.append(
            DocMinimalInfo(
                document_id=record.id,
                semantic_id=record.semantic_id,
                link=record.link,
            )
        )
    return summaries
|
|
|
|
|
|
@router.get("/ingestion")
def get_ingestion_docs(
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
) -> list[DocMinimalInfo]:
    # List every document returned by get_ingestion_documents, reduced to
    # id / semantic id / link.
    #
    # The api_key_dep dependency is resolved for every request; its result is
    # intentionally unused (hence the `_` parameter name).
    ingested = get_ingestion_documents(db_session)

    summaries: list[DocMinimalInfo] = []
    for record in ingested:
        summary = DocMinimalInfo(
            document_id=record.id,
            semantic_id=record.semantic_id,
            link=record.link,
        )
        summaries.append(summary)
    return summaries
|
|
|
|
|
|
@router.post("/ingestion")
def upsert_ingestion_doc(
    doc_info: IngestionDocument,
    _: User | None = Depends(api_key_dep),
    db_session: Session = Depends(get_session),
    tenant_id: str = Depends(get_current_tenant_id),
) -> IngestionResult:
    # Index a single document submitted through the ingestion endpoint.
    #
    # Steps:
    #   1. Flag the incoming document as coming from the ingestion API and
    #      convert it to a connector Document.
    #   2. Resolve the connector-credential pair from doc_info.cc_pair_id,
    #      falling back to DEFAULT_CC_PAIR_ID when that value is falsy.
    #   3. Run the document through an indexing pipeline built for the
    #      current index and the current search settings.
    #   4. If a secondary index name exists, run the same document through a
    #      second pipeline built for the secondary index and the secondary
    #      search settings. Its result is not part of the response.
    #
    # Returns:
    #   IngestionResult with the document id and already_existed, which is
    #   True when the first value returned by the primary pipeline is falsy.
    #
    # Raises:
    #   HTTPException (400): the connector-credential pair lookup returned
    #       None.
    #   RuntimeError: a secondary index name exists but no secondary search
    #       settings were found.
    doc_info.document.from_ingestion_api = True

    document = Document.from_base(doc_info.document)

    # TODO once the frontend is updated with this enum, remove this logic
    if document.source == DocumentSource.INGESTION_API:
        document.source = DocumentSource.FILE

    cc_pair = get_connector_credential_pair_from_id(
        cc_pair_id=doc_info.cc_pair_id or DEFAULT_CC_PAIR_ID, db_session=db_session
    )
    if cc_pair is None:
        raise HTTPException(
            status_code=400, detail="Connector-Credential Pair specified does not exist"
        )

    # Need to index for both the primary and secondary index if possible
    curr_ind_name, sec_ind_name = get_both_index_names(db_session)
    curr_doc_index = get_default_document_index(
        primary_index_name=curr_ind_name, secondary_index_name=None
    )

    search_settings = get_current_search_settings(db_session)

    index_embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
        search_settings=search_settings
    )

    indexing_pipeline = build_indexing_pipeline(
        embedder=index_embedding_model,
        document_index=curr_doc_index,
        ignore_time_skip=True,
        db_session=db_session,
        tenant_id=tenant_id,
    )

    # Only the primary pipeline's result feeds the response; the chunk count
    # is not needed here.
    new_doc, __chunk_count = indexing_pipeline(
        document_batch=[document],
        index_attempt_metadata=IndexAttemptMetadata(
            connector_id=cc_pair.connector_id,
            credential_id=cc_pair.credential_id,
        ),
    )

    # If there's a secondary index being built, index the doc but don't use it for return here
    if sec_ind_name:
        # The secondary pipeline must write to the secondary index. Passing
        # curr_ind_name here would embed the document with the secondary
        # settings but store it in the current index a second time, leaving
        # the secondary index without the document.
        sec_doc_index = get_default_document_index(
            primary_index_name=sec_ind_name, secondary_index_name=None
        )

        sec_search_settings = get_secondary_search_settings(db_session)

        if sec_search_settings is None:
            # Should not ever happen
            raise RuntimeError(
                "Secondary index exists but no search settings configured"
            )

        new_index_embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
            search_settings=sec_search_settings
        )

        sec_ind_pipeline = build_indexing_pipeline(
            embedder=new_index_embedding_model,
            document_index=sec_doc_index,
            ignore_time_skip=True,
            db_session=db_session,
            tenant_id=tenant_id,
        )

        sec_ind_pipeline(
            document_batch=[document],
            index_attempt_metadata=IndexAttemptMetadata(
                connector_id=cc_pair.connector_id,
                credential_id=cc_pair.credential_id,
            ),
        )

    return IngestionResult(document_id=document.id, already_existed=not new_doc)
|