Add ingestion metrics (#256)

Yuhong Sun
2023-07-29 16:37:22 -07:00
committed by GitHub
parent eec4e21bad
commit 87fe6f7575
5 changed files with 72 additions and 25 deletions


@@ -0,0 +1,32 @@
"""Remove Document IDs
Revision ID: d7111c1238cd
Revises: 465f78d9b7f9
Create Date: 2023-07-29 15:06:25.126169
"""
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "d7111c1238cd"
down_revision = "465f78d9b7f9"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.drop_column("index_attempt", "document_ids")
def downgrade() -> None:
op.add_column(
"index_attempt",
sa.Column(
"document_ids",
postgresql.ARRAY(sa.VARCHAR()),
autoincrement=False,
nullable=True,
),
)
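
For reference, the revision above can be applied or rolled back through Alembic's Python API as well as the usual CLI. A minimal sketch, assuming an alembic.ini that points at this project's migration environment (the path is illustrative, not taken from this commit):

# Minimal sketch: apply or roll back revision d7111c1238cd programmatically.
# The alembic.ini path is an assumption; adjust it to the repository layout.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "d7111c1238cd")    # upgrade(): drops index_attempt.document_ids
command.downgrade(cfg, "465f78d9b7f9")  # downgrade(): restores the ARRAY(VARCHAR) column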


@@ -115,6 +115,16 @@ def run_indexing_jobs(db_session: Session) -> None:
f"with config: '{attempt.connector.connector_specific_config}', and "
f"with credentials: '{attempt.credential_id}'"
)
run_time = time.time()
run_time_str = datetime.utcfromtimestamp(run_time).strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"Connector Starting UTC Time: {run_time_str}")
# "official" timestamp for this run
# used for setting time bounds when fetching updates from apps and
# is stored in the DB as the last successful run time if this run succeeds
run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
mark_attempt_in_progress(attempt, db_session)
db_connector = attempt.connector
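
The lines added above keep two views of the same instant: run_time_str is a human-readable UTC string for the log, while run_dt is the timezone-aware datetime used as the time bound and stored as the last successful run time. A standalone sketch of the same stdlib calls:

# Standalone sketch: one epoch timestamp, rendered two ways.
import time
from datetime import datetime, timezone

run_time = time.time()  # float seconds since the Unix epoch
run_time_str = datetime.utcfromtimestamp(run_time).strftime("%Y-%m-%d %H:%M:%S")
run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)

# Both describe the same moment; run_dt just carries tzinfo=UTC explicitly.
assert run_dt.strftime("%Y-%m-%d %H:%M:%S") == run_time_str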
@@ -148,11 +158,6 @@ def run_indexing_jobs(db_session: Session) -> None:
net_doc_change = 0
try:
# "official" timestamp for this run
# used for setting time bounds when fetching updates from apps + is
# stored in the DB as the last successful run time if this run succeeds
run_time = time.time()
run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
if task == InputType.LOAD_STATE:
assert isinstance(runnable_connector, LoadConnector)
doc_batch_generator = runnable_connector.load_from_state()
@@ -175,17 +180,20 @@ def run_indexing_jobs(db_session: Session) -> None:
# Event types cannot be handled by a background type, leave these untouched
continue
document_ids: list[str] = []
document_count = 0
chunk_count = 0
for doc_batch in doc_batch_generator:
index_user_id = (
None if db_credential.public_doc else db_credential.user_id
)
net_doc_change += indexing_pipeline(
new_docs, total_batch_chunks = indexing_pipeline(
documents=doc_batch, user_id=index_user_id
)
document_ids.extend([doc.id for doc in doc_batch])
net_doc_change += new_docs
chunk_count += total_batch_chunks
document_count += len(doc_batch)
mark_attempt_succeeded(attempt, document_ids, db_session)
mark_attempt_succeeded(attempt, db_session)
update_connector_credential_pair(
connector_id=db_connector.id,
credential_id=db_credential.id,
@@ -195,10 +203,18 @@ def run_indexing_jobs(db_session: Session) -> None:
db_session=db_session,
)
logger.info(f"Indexed {len(document_ids)} documents")
logger.info(
f"Indexed or updated {document_count} total documents for a total of {chunk_count} chunks"
)
logger.info(
f"Connector successfully finished, elapsed time: {time.time() - run_time} seconds"
)
except Exception as e:
logger.exception(f"Indexing job with id {attempt.id} failed due to {e}")
logger.info(
f"Failed connector elapsed time: {time.time() - run_time} seconds"
)
mark_attempt_failed(attempt, db_session, failure_reason=str(e))
update_connector_credential_pair(
connector_id=db_connector.id,
@@ -214,7 +230,8 @@ def update_loop(delay: int = 10) -> None:
engine = get_sqlalchemy_engine()
while True:
start = time.time()
logger.info(f"Running update, current time: {time.ctime(start)}")
start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"Running update, current UTC time: {start_time_utc}")
try:
with Session(engine, expire_on_commit=False) as db_session:
create_indexing_jobs(db_session)
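
Taken together, the hunks above replace the per-document ID bookkeeping with three counters (net new documents, total documents, total chunks) plus an elapsed-time log. A self-contained sketch of that accumulation pattern; fake_pipeline is a stand-in for the real indexing_pipeline, which also writes to the indices:

# Stand-in sketch of the per-batch metric accumulation introduced above.
import time
from collections.abc import Iterable


def fake_pipeline(documents: list[str]) -> tuple[int, int]:
    # pretend every document is new and yields two chunks
    return len(documents), 2 * len(documents)


def index_batches(doc_batch_generator: Iterable[list[str]]) -> tuple[int, int, int]:
    run_time = time.time()
    net_doc_change = 0
    document_count = 0
    chunk_count = 0
    for doc_batch in doc_batch_generator:
        new_docs, total_batch_chunks = fake_pipeline(doc_batch)
        net_doc_change += new_docs
        chunk_count += total_batch_chunks
        document_count += len(doc_batch)
    print(
        f"Indexed or updated {document_count} total documents "
        f"for a total of {chunk_count} chunks"
    )
    print(f"Connector successfully finished, elapsed time: {time.time() - run_time} seconds")
    return net_doc_change, document_count, chunk_count


index_batches([["a", "b"], ["c"]])  # 3 documents, 6 chunks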


@@ -18,7 +18,9 @@ logger = setup_logger()
class IndexingPipelineProtocol(Protocol):
def __call__(self, documents: list[Document], user_id: UUID | None) -> int:
def __call__(
self, documents: list[Document], user_id: UUID | None
) -> tuple[int, int]:
...
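
With this change, anything typed as IndexingPipelineProtocol must return a pair: net new documents and total chunks produced. A minimal conformance sketch with a stand-in Document type (names are illustrative, not danswer's):

# Minimal sketch: a callable that satisfies the updated protocol.
from dataclasses import dataclass
from typing import Protocol
from uuid import UUID


@dataclass
class Document:  # stand-in for the project's Document model
    id: str
    text: str


class IndexingPipelineProtocol(Protocol):
    def __call__(
        self, documents: list[Document], user_id: UUID | None
    ) -> tuple[int, int]:
        ...


def noop_pipeline(documents: list[Document], user_id: UUID | None) -> tuple[int, int]:
    # indexes nothing: zero net new documents, zero chunks
    return 0, 0


pipeline: IndexingPipelineProtocol = noop_pipeline
new_docs, chunks = pipeline(documents=[Document("1", "hello")], user_id=None)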
@@ -30,7 +32,10 @@ def _indexing_pipeline(
keyword_index: KeywordIndex,
documents: list[Document],
user_id: UUID | None,
) -> int:
) -> tuple[int, int]:
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
Note that the documents should already be batched at this point so that it does not inflate the
memory requirements"""
# TODO: make entire indexing pipeline async to not block the entire process
# when running on async endpoints
chunks = list(chain(*[chunker.chunk(document) for document in documents]))
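
The new docstring's point about pre-batched input is worth illustrating: the caller, not the pipeline, keeps batches small so the chunk list built here stays bounded. A hedged sketch of one way to batch a document stream before handing it to a pipeline (the batch size and helper name are assumptions, not taken from this commit):

# Illustrative batching helper; INDEX_BATCH_SIZE here is an assumed constant.
from collections.abc import Iterable, Iterator
from itertools import islice

INDEX_BATCH_SIZE = 16


def batch_documents(docs: Iterable[str], batch_size: int = INDEX_BATCH_SIZE) -> Iterator[list[str]]:
    it = iter(docs)
    while batch := list(islice(it, batch_size)):
        yield batch  # each batch is small enough to chunk in memory


for doc_batch in batch_documents(f"doc-{i}" for i in range(40)):
    print(len(doc_batch))  # 16, 16, 8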
@@ -42,7 +47,7 @@ def _indexing_pipeline(
logger.warning("Document count change from keyword/vector indices don't align")
net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
logger.info(f"Indexed {net_new_docs} new documents")
return net_new_docs
return net_new_docs, len(chunks)
def build_indexing_pipeline(
@@ -52,7 +57,7 @@ def build_indexing_pipeline(
vector_index: VectorIndex | None = None,
keyword_index: KeywordIndex | None = None,
) -> IndexingPipelineProtocol:
"""Builds a pipline which takes in a list of docs and indexes them.
"""Builds a pipline which takes in a list (batch) of docs and indexes them.
Default uses _ chunker, _ embedder, and qdrant for the datastore"""
if chunker is None:
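
build_indexing_pipeline therefore has to hand back a callable matching the two-value contract above. A generic sketch of that builder shape using functools.partial with stand-in components; this mirrors the pattern, not necessarily the project's exact implementation:

# Generic builder sketch: bind default components, return a pipeline callable
# that reports (net_new_docs, total_chunks). All components here are stand-ins.
from functools import partial
from typing import Callable
from uuid import UUID

Chunker = Callable[[str], list[str]]


def _pipeline(chunker: Chunker, documents: list[str], user_id: UUID | None) -> tuple[int, int]:
    chunks = [chunk for doc in documents for chunk in chunker(doc)]
    # pretend every document in the batch is new
    return len(documents), len(chunks)


def default_chunker(doc: str) -> list[str]:
    # trivial default: whitespace "chunks"
    return doc.split()


def build_pipeline(chunker: Chunker | None = None) -> Callable[..., tuple[int, int]]:
    if chunker is None:
        chunker = default_chunker
    return partial(_pipeline, chunker)


pipeline = build_pipeline()
print(pipeline(documents=["hello world", "danswer"], user_id=None))  # (2, 3)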


@@ -56,11 +56,9 @@ def mark_attempt_in_progress(
def mark_attempt_succeeded(
index_attempt: IndexAttempt,
docs_indexed: list[str],
db_session: Session,
) -> None:
index_attempt.status = IndexingStatus.SUCCESS
index_attempt.document_ids = docs_indexed
db_session.add(index_attempt)
db_session.commit()


@@ -4,6 +4,9 @@ from typing import Any
from typing import List
from uuid import UUID
from danswer.auth.schemas import UserRole
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import InputType
from fastapi_users.db import SQLAlchemyBaseOAuthAccountTableUUID
from fastapi_users.db import SQLAlchemyBaseUserTableUUID
from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyBaseAccessTokenTableUUID
@@ -21,10 +24,6 @@ from sqlalchemy.orm import Mapped
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from danswer.auth.schemas import UserRole
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import InputType
class IndexingStatus(str, PyEnum):
NOT_STARTED = "not_started"
@@ -161,9 +160,6 @@ class IndexAttempt(Base):
nullable=True,
)
status: Mapped[IndexingStatus] = mapped_column(Enum(IndexingStatus))
document_ids: Mapped[list[str] | None] = mapped_column(
postgresql.ARRAY(String()), default=None
) # only filled if status = "complete"
error_msg: Mapped[str | None] = mapped_column(
String(), default=None
) # only filled if status = "failed"
@@ -189,7 +185,6 @@ class IndexAttempt(Base):
f"<IndexAttempt(id={self.id!r}, "
f"connector_id={self.connector_id!r}, "
f"status={self.status!r}, "
f"document_ids={self.document_ids!r}, "
f"error_msg={self.error_msg!r})>"
f"time_created={self.time_created!r}, "
f"time_updated={self.time_updated!r}, "