Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-09-27 20:38:32 +02:00)
Add ingestion metrics (#256)
backend/alembic/versions/d7111c1238cd_remove_document_ids.py (new file, +32 lines)
@@ -0,0 +1,32 @@
+"""Remove Document IDs
+
+Revision ID: d7111c1238cd
+Revises: 465f78d9b7f9
+Create Date: 2023-07-29 15:06:25.126169
+
+"""
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "d7111c1238cd"
+down_revision = "465f78d9b7f9"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.drop_column("index_attempt", "document_ids")
+
+
+def downgrade() -> None:
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "document_ids",
+            postgresql.ARRAY(sa.VARCHAR()),
+            autoincrement=False,
+            nullable=True,
+        ),
+    )
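This revision only drops and restores the index_attempt.document_ids column. A minimal sketch of applying or reverting it programmatically with Alembic's command API (assuming a standard alembic.ini for the backend; the config path is an assumption, not from this diff):

from alembic import command
from alembic.config import Config

# Hypothetical config path; point this at the backend's actual alembic.ini.
alembic_cfg = Config("alembic.ini")

# Upgrade to this revision: drops index_attempt.document_ids.
command.upgrade(alembic_cfg, "d7111c1238cd")

# Downgrade one step: restores the nullable VARCHAR[] column.
command.downgrade(alembic_cfg, "465f78d9b7f9")

The usual CLI invocations (alembic upgrade head / alembic downgrade -1) achieve the same result.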
@@ -115,6 +115,16 @@ def run_indexing_jobs(db_session: Session) -> None:
             f"with config: '{attempt.connector.connector_specific_config}', and "
             f"with credentials: '{attempt.credential_id}'"
         )
+
+        run_time = time.time()
+        run_time_str = datetime.utcfromtimestamp(run_time).strftime("%Y-%m-%d %H:%M:%S")
+        logger.info(f"Connector Starting UTC Time: {run_time_str}")
+
+        # "official" timestamp for this run
+        # used for setting time bounds when fetching updates from apps and
+        # is stored in the DB as the last successful run time if this run succeeds
+        run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
+
         mark_attempt_in_progress(attempt, db_session)

         db_connector = attempt.connector
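The hunk above records two views of the same start time: a naive-UTC string used only for the log message, and a timezone-aware datetime (run_dt) that is stored as the connector's last successful run time if the run succeeds. A standard-library-only sketch of that distinction:

import time
from datetime import datetime, timezone

run_time = time.time()  # epoch seconds as a float

# Human-readable UTC string, used only in the log line.
run_time_str = datetime.utcfromtimestamp(run_time).strftime("%Y-%m-%d %H:%M:%S")

# Timezone-aware datetime, safe to persist and compare across timezones.
run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)

print(run_time_str)        # e.g. 2023-07-29 15:06:25
print(run_dt.isoformat())  # e.g. 2023-07-29T15:06:25.126169+00:00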
@@ -148,11 +158,6 @@ def run_indexing_jobs(db_session: Session) -> None:

         net_doc_change = 0
         try:
-            # "official" timestamp for this run
-            # used for setting time bounds when fetching updates from apps + is
-            # stored in the DB as the last successful run time if this run succeeds
-            run_time = time.time()
-            run_dt = datetime.fromtimestamp(run_time, tz=timezone.utc)
            if task == InputType.LOAD_STATE:
                 assert isinstance(runnable_connector, LoadConnector)
                 doc_batch_generator = runnable_connector.load_from_state()
@@ -175,17 +180,20 @@ def run_indexing_jobs(db_session: Session) -> None:
                 # Event types cannot be handled by a background type, leave these untouched
                 continue

-            document_ids: list[str] = []
+            document_count = 0
+            chunk_count = 0
             for doc_batch in doc_batch_generator:
                 index_user_id = (
                     None if db_credential.public_doc else db_credential.user_id
                 )
-                net_doc_change += indexing_pipeline(
+                new_docs, total_batch_chunks = indexing_pipeline(
                     documents=doc_batch, user_id=index_user_id
                 )
-                document_ids.extend([doc.id for doc in doc_batch])
+                net_doc_change += new_docs
+                chunk_count += total_batch_chunks
+                document_count += len(doc_batch)

-            mark_attempt_succeeded(attempt, document_ids, db_session)
+            mark_attempt_succeeded(attempt, db_session)
             update_connector_credential_pair(
                 connector_id=db_connector.id,
                 credential_id=db_credential.id,
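Since indexing_pipeline now returns a (net new documents, chunks indexed) pair per batch, the loop above accumulates three run-level metrics instead of collecting document IDs. A consolidated, free-standing sketch of that accumulation (the helper name and parameters are illustrative, not the repo's):

def accumulate_ingestion_metrics(doc_batch_generator, indexing_pipeline, user_id=None):
    # Sum per-batch pipeline results into run-level totals.
    net_doc_change = 0   # net new documents across keyword/vector indices
    document_count = 0   # documents indexed or updated
    chunk_count = 0      # chunks written
    for doc_batch in doc_batch_generator:
        new_docs, total_batch_chunks = indexing_pipeline(
            documents=doc_batch, user_id=user_id
        )
        net_doc_change += new_docs
        chunk_count += total_batch_chunks
        document_count += len(doc_batch)
    return net_doc_change, document_count, chunk_count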
@@ -195,10 +203,18 @@ def run_indexing_jobs(db_session: Session) -> None:
                 db_session=db_session,
             )

-            logger.info(f"Indexed {len(document_ids)} documents")
+            logger.info(
+                f"Indexed or updated {document_count} total documents for a total of {chunk_count} chunks"
+            )
+            logger.info(
+                f"Connector successfully finished, elapsed time: {time.time() - run_time} seconds"
+            )

         except Exception as e:
             logger.exception(f"Indexing job with id {attempt.id} failed due to {e}")
+            logger.info(
+                f"Failed connector elapsed time: {time.time() - run_time} seconds"
+            )
             mark_attempt_failed(attempt, db_session, failure_reason=str(e))
             update_connector_credential_pair(
                 connector_id=db_connector.id,
@@ -214,7 +230,8 @@ def update_loop(delay: int = 10) -> None:
     engine = get_sqlalchemy_engine()
     while True:
         start = time.time()
-        logger.info(f"Running update, current time: {time.ctime(start)}")
+        start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
+        logger.info(f"Running update, current UTC time: {start_time_utc}")
         try:
             with Session(engine, expire_on_commit=False) as db_session:
                 create_indexing_jobs(db_session)
@@ -18,7 +18,9 @@ logger = setup_logger()


 class IndexingPipelineProtocol(Protocol):
-    def __call__(self, documents: list[Document], user_id: UUID | None) -> int:
+    def __call__(
+        self, documents: list[Document], user_id: UUID | None
+    ) -> tuple[int, int]:
         ...


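The Protocol change above means any pipeline implementation must now return a (net new documents, chunk count) tuple. A minimal, self-contained sketch of how this structural type is satisfied (toy Document and pipeline for illustration only, not the repo's classes):

from typing import Protocol
from uuid import UUID


class Document:
    ...  # stand-in for danswer.connectors.models.Document


class IndexingPipelineProtocol(Protocol):
    def __call__(
        self, documents: list[Document], user_id: UUID | None
    ) -> tuple[int, int]:
        ...


def toy_pipeline(documents: list[Document], user_id: UUID | None) -> tuple[int, int]:
    # Pretend every document is new and yields exactly one chunk.
    return len(documents), len(documents)


pipeline: IndexingPipelineProtocol = toy_pipeline  # structurally compatible
new_docs, chunks = pipeline(documents=[Document(), Document()], user_id=None)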
@@ -30,7 +32,10 @@ def _indexing_pipeline(
     keyword_index: KeywordIndex,
     documents: list[Document],
     user_id: UUID | None,
-) -> int:
+) -> tuple[int, int]:
+    """Takes different pieces of the indexing pipeline and applies it to a batch of documents
+    Note that the documents should already be batched at this point so that it does not inflate the
+    memory requirements"""
     # TODO: make entire indexing pipeline async to not block the entire process
     # when running on async endpoints
     chunks = list(chain(*[chunker.chunk(document) for document in documents]))
@@ -42,7 +47,7 @@ def _indexing_pipeline(
         logger.warning("Document count change from keyword/vector indices don't align")
     net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
     logger.info(f"Indexed {net_new_docs} new documents")
-    return net_new_docs
+    return net_new_docs, len(chunks)


 def build_indexing_pipeline(
@@ -52,7 +57,7 @@ def build_indexing_pipeline(
     vector_index: VectorIndex | None = None,
     keyword_index: KeywordIndex | None = None,
 ) -> IndexingPipelineProtocol:
-    """Builds a pipline which takes in a list of docs and indexes them.
+    """Builds a pipline which takes in a list (batch) of docs and indexes them.

     Default uses _ chunker, _ embedder, and qdrant for the datastore"""
     if chunker is None:
@@ -56,11 +56,9 @@ def mark_attempt_in_progress(

 def mark_attempt_succeeded(
     index_attempt: IndexAttempt,
-    docs_indexed: list[str],
     db_session: Session,
 ) -> None:
     index_attempt.status = IndexingStatus.SUCCESS
-    index_attempt.document_ids = docs_indexed
     db_session.add(index_attempt)
     db_session.commit()

@@ -4,6 +4,9 @@ from typing import Any
 from typing import List
 from uuid import UUID

+from danswer.auth.schemas import UserRole
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import InputType
 from fastapi_users.db import SQLAlchemyBaseOAuthAccountTableUUID
 from fastapi_users.db import SQLAlchemyBaseUserTableUUID
 from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyBaseAccessTokenTableUUID
@@ -21,10 +24,6 @@ from sqlalchemy.orm import Mapped
 from sqlalchemy.orm import mapped_column
 from sqlalchemy.orm import relationship

-from danswer.auth.schemas import UserRole
-from danswer.configs.constants import DocumentSource
-from danswer.connectors.models import InputType
-

 class IndexingStatus(str, PyEnum):
     NOT_STARTED = "not_started"
@@ -161,9 +160,6 @@ class IndexAttempt(Base):
         nullable=True,
     )
     status: Mapped[IndexingStatus] = mapped_column(Enum(IndexingStatus))
-    document_ids: Mapped[list[str] | None] = mapped_column(
-        postgresql.ARRAY(String()), default=None
-    )  # only filled if status = "complete"
     error_msg: Mapped[str | None] = mapped_column(
         String(), default=None
     )  # only filled if status = "failed"
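With document_ids gone from the model (and from the table, via the Alembic revision above), per-attempt document IDs are no longer persisted; IndexAttempt rows carry status, error_msg, and timestamps. A small illustrative query sketch in SQLAlchemy 2.0 style (session setup assumed; not code from this diff):

from sqlalchemy import select

# Fetch successful attempts; document-level detail now lives only in the
# keyword/vector indices, not on the IndexAttempt row.
successful_attempts = db_session.scalars(
    select(IndexAttempt).where(IndexAttempt.status == IndexingStatus.SUCCESS)
).all()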
@@ -189,7 +185,6 @@
             f"<IndexAttempt(id={self.id!r}, "
             f"connector_id={self.connector_id!r}, "
             f"status={self.status!r}, "
-            f"document_ids={self.document_ids!r}, "
             f"error_msg={self.error_msg!r})>"
             f"time_created={self.time_created!r}, "
             f"time_updated={self.time_updated!r}, "