Add ingestion metrics (#256)

Author: Yuhong Sun
Date: 2023-07-29 16:37:22 -07:00
Committed by: GitHub
Parent: eec4e21bad
Commit: 87fe6f7575
5 changed files with 72 additions and 25 deletions

View File

@@ -0,0 +1,32 @@
"""Remove Document IDs
Revision ID: d7111c1238cd
Revises: 465f78d9b7f9
Create Date: 2023-07-29 15:06:25.126169
"""
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "d7111c1238cd"
down_revision = "465f78d9b7f9"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.drop_column("index_attempt", "document_ids")
def downgrade() -> None:
op.add_column(
"index_attempt",
sa.Column(
"document_ids",
postgresql.ARRAY(sa.VARCHAR()),
autoincrement=False,
nullable=True,
),
)
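
For reference, a minimal sketch of applying or reverting this revision through Alembic's Python command API; the alembic.ini path and the use of the API rather than the usual alembic CLI are assumptions for illustration only.

from alembic import command
from alembic.config import Config

# path to the project's alembic.ini is an assumption here
cfg = Config("alembic.ini")

# apply this revision (drops index_attempt.document_ids)
command.upgrade(cfg, "d7111c1238cd")

# roll back to the previous revision (re-adds the nullable VARCHAR[] column)
command.downgrade(cfg, "465f78d9b7f9")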

View File

@@ -115,6 +115,16 @@ def run_indexing_jobs(db_session: Session) -> None:
f"with config: '{attempt.connector.connector_specific_config}', and " f"with config: '{attempt.connector.connector_specific_config}', and "
f"with credentials: '{attempt.credential_id}'" f"with credentials: '{attempt.credential_id}'"
) )
run_time = time.time()
run_time_str = datetime.utcfromtimestamp(run_time).strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"Connector Starting UTC Time: {run_time_str}")
# "official" timestamp for this run
# used for setting time bounds when fetching updates from apps and
# is stored in the DB as the last successful run time if this run succeeds
run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
mark_attempt_in_progress(attempt, db_session) mark_attempt_in_progress(attempt, db_session)
db_connector = attempt.connector db_connector = attempt.connector
@@ -148,11 +158,6 @@ def run_indexing_jobs(db_session: Session) -> None:
         net_doc_change = 0
         try:
-            # "official" timestamp for this run
-            # used for setting time bounds when fetching updates from apps + is
-            # stored in the DB as the last successful run time if this run succeeds
-            run_time = time.time()
-            run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
-
             if task == InputType.LOAD_STATE:
                 assert isinstance(runnable_connector, LoadConnector)
                 doc_batch_generator = runnable_connector.load_from_state()
@@ -175,17 +180,20 @@ def run_indexing_jobs(db_session: Session) -> None:
                 # Event types cannot be handled by a background type, leave these untouched
                 continue
 
-            document_ids: list[str] = []
+            document_count = 0
+            chunk_count = 0
             for doc_batch in doc_batch_generator:
                 index_user_id = (
                     None if db_credential.public_doc else db_credential.user_id
                 )
-                net_doc_change += indexing_pipeline(
+                new_docs, total_batch_chunks = indexing_pipeline(
                     documents=doc_batch, user_id=index_user_id
                 )
-                document_ids.extend([doc.id for doc in doc_batch])
+                net_doc_change += new_docs
+                chunk_count += total_batch_chunks
+                document_count += len(doc_batch)
 
-            mark_attempt_succeeded(attempt, document_ids, db_session)
+            mark_attempt_succeeded(attempt, db_session)
             update_connector_credential_pair(
                 connector_id=db_connector.id,
                 credential_id=db_credential.id,
@@ -195,10 +203,18 @@ def run_indexing_jobs(db_session: Session) -> None:
                 db_session=db_session,
             )
 
-            logger.info(f"Indexed {len(document_ids)} documents")
+            logger.info(
+                f"Indexed or updated {document_count} total documents for a total of {chunk_count} chunks"
+            )
+            logger.info(
+                f"Connector successfully finished, elapsed time: {time.time() - run_time} seconds"
+            )
         except Exception as e:
             logger.exception(f"Indexing job with id {attempt.id} failed due to {e}")
+            logger.info(
+                f"Failed connector elapsed time: {time.time() - run_time} seconds"
+            )
             mark_attempt_failed(attempt, db_session, failure_reason=str(e))
             update_connector_credential_pair(
                 connector_id=db_connector.id,
@@ -214,7 +230,8 @@ def update_loop(delay: int = 10) -> None:
     engine = get_sqlalchemy_engine()
     while True:
         start = time.time()
-        logger.info(f"Running update, current time: {time.ctime(start)}")
+        start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
+        logger.info(f"Running update, current UTC time: {start_time_utc}")
         try:
             with Session(engine, expire_on_commit=False) as db_session:
                 create_indexing_jobs(db_session)
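
A condensed sketch of the metrics bookkeeping introduced above, with the connector plumbing stubbed out. Here indexing_pipeline stands for any callable returning (net_new_docs, chunk_count) as in this commit; the function name and the trivial lambda in the usage line are illustrative only.

import time
from datetime import datetime, timezone


def run_one_attempt(doc_batch_generator, indexing_pipeline, user_id=None):
    """Stripped-down version of the indexing loop, showing only the new metrics."""
    run_time = time.time()  # "official" timestamp for this run
    run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)  # stored on success

    net_doc_change = 0  # net new documents across the indices
    document_count = 0  # documents indexed or updated
    chunk_count = 0  # chunks produced across all batches

    for doc_batch in doc_batch_generator:
        new_docs, total_batch_chunks = indexing_pipeline(
            documents=doc_batch, user_id=user_id
        )
        net_doc_change += new_docs
        chunk_count += total_batch_chunks
        document_count += len(doc_batch)

    elapsed = time.time() - run_time
    print(
        f"Indexed or updated {document_count} total documents "
        f"for a total of {chunk_count} chunks in {elapsed:.1f} seconds"
    )
    return run_dt, net_doc_change


# example usage: two batches, pretending each document yields two chunks
batches = [["doc-a", "doc-b"], ["doc-c"]]
run_one_attempt(batches, lambda documents, user_id: (len(documents), 2 * len(documents)))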

View File

@@ -18,7 +18,9 @@ logger = setup_logger()
 class IndexingPipelineProtocol(Protocol):
-    def __call__(self, documents: list[Document], user_id: UUID | None) -> int:
+    def __call__(
+        self, documents: list[Document], user_id: UUID | None
+    ) -> tuple[int, int]:
         ...
@@ -30,7 +32,10 @@ def _indexing_pipeline(
     keyword_index: KeywordIndex,
     documents: list[Document],
     user_id: UUID | None,
-) -> int:
+) -> tuple[int, int]:
+    """Takes different pieces of the indexing pipeline and applies it to a batch of documents
+    Note that the documents should already be batched at this point so that it does not inflate the
+    memory requirements"""
     # TODO: make entire indexing pipeline async to not block the entire process
     # when running on async endpoints
     chunks = list(chain(*[chunker.chunk(document) for document in documents]))
@@ -42,7 +47,7 @@ def _indexing_pipeline(
         logger.warning("Document count change from keyword/vector indices don't align")
     net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
     logger.info(f"Indexed {net_new_docs} new documents")
-    return net_new_docs
+    return net_new_docs, len(chunks)
 
 
 def build_indexing_pipeline(
@@ -52,7 +57,7 @@ def build_indexing_pipeline(
     vector_index: VectorIndex | None = None,
     keyword_index: KeywordIndex | None = None,
 ) -> IndexingPipelineProtocol:
-    """Builds a pipline which takes in a list of docs and indexes them.
+    """Builds a pipline which takes in a list (batch) of docs and indexes them.
 
     Default uses _ chunker, _ embedder, and qdrant for the datastore"""
     if chunker is None:
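
Call sites now unpack a pair instead of a single count. A minimal sketch of a callable satisfying the updated IndexingPipelineProtocol; the toy Document class and the two-chunks-per-document assumption are made up for illustration.

from dataclasses import dataclass
from uuid import UUID


@dataclass
class Document:
    # stand-in for danswer's Document model; only the id matters here
    id: str
    text: str


def toy_pipeline(documents: list[Document], user_id: UUID | None) -> tuple[int, int]:
    # pretend every document is brand new and splits into two chunks
    net_new_docs = len(documents)
    chunk_count = 2 * len(documents)
    return net_new_docs, chunk_count


# mirrors the updated call site in the background job
new_docs, total_batch_chunks = toy_pipeline(
    documents=[Document("a", "hello"), Document("b", "world")], user_id=None
)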

View File

@@ -56,11 +56,9 @@ def mark_attempt_in_progress(
 def mark_attempt_succeeded(
     index_attempt: IndexAttempt,
-    docs_indexed: list[str],
     db_session: Session,
 ) -> None:
     index_attempt.status = IndexingStatus.SUCCESS
-    index_attempt.document_ids = docs_indexed
     db_session.add(index_attempt)
     db_session.commit()
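
With the docs_indexed argument gone, the success path only flips the status; no document ID list is stored on the attempt anymore. A hedged sketch of how the two helpers might be driven from the background job; wrap_attempt is illustrative and the import path is assumed from the upstream module layout.

from sqlalchemy.orm import Session

# module path is an assumption; these are the helpers edited above
from danswer.db.index_attempt import mark_attempt_failed, mark_attempt_succeeded


def wrap_attempt(attempt, db_session: Session, do_indexing) -> None:
    """Record success or failure around the actual indexing work."""
    try:
        do_indexing()
        # only the status flip remains on success
        mark_attempt_succeeded(attempt, db_session)
    except Exception as e:
        mark_attempt_failed(attempt, db_session, failure_reason=str(e))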

View File

@@ -4,6 +4,9 @@ from typing import Any
 from typing import List
 from uuid import UUID
 
+from danswer.auth.schemas import UserRole
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import InputType
 from fastapi_users.db import SQLAlchemyBaseOAuthAccountTableUUID
 from fastapi_users.db import SQLAlchemyBaseUserTableUUID
 from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyBaseAccessTokenTableUUID
@@ -21,10 +24,6 @@ from sqlalchemy.orm import Mapped
 from sqlalchemy.orm import mapped_column
 from sqlalchemy.orm import relationship
 
-from danswer.auth.schemas import UserRole
-from danswer.configs.constants import DocumentSource
-from danswer.connectors.models import InputType
-
 
 class IndexingStatus(str, PyEnum):
     NOT_STARTED = "not_started"
@@ -161,9 +160,6 @@ class IndexAttempt(Base):
         nullable=True,
     )
     status: Mapped[IndexingStatus] = mapped_column(Enum(IndexingStatus))
-    document_ids: Mapped[list[str] | None] = mapped_column(
-        postgresql.ARRAY(String()), default=None
-    )  # only filled if status = "complete"
     error_msg: Mapped[str | None] = mapped_column(
         String(), default=None
     )  # only filled if status = "failed"
@@ -189,7 +185,6 @@ class IndexAttempt(Base):
f"<IndexAttempt(id={self.id!r}, " f"<IndexAttempt(id={self.id!r}, "
f"connector_id={self.connector_id!r}, " f"connector_id={self.connector_id!r}, "
f"status={self.status!r}, " f"status={self.status!r}, "
f"document_ids={self.document_ids!r}, "
f"error_msg={self.error_msg!r})>" f"error_msg={self.error_msg!r})>"
f"time_created={self.time_created!r}, " f"time_created={self.time_created!r}, "
f"time_updated={self.time_updated!r}, " f"time_updated={self.time_updated!r}, "