Make docs indexed cnt more accurate (#579)

This commit is contained in:
Chris Weaver 2023-10-16 20:18:19 -07:00 committed by GitHub
parent bb9a18b22c
commit 37e9ccf864
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 88 additions and 7 deletions

View File

@ -0,0 +1,35 @@
"""Add index for getting documents just by connector id / credential id
Revision ID: 7f99be1cb9f5
Revises: 78dbe7e38469
Create Date: 2023-10-15 22:48:15.487762
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "7f99be1cb9f5"
down_revision = "78dbe7e38469"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_index(
op.f(
"ix_document_by_connector_credential_pair_pkey__connector_id__credential_id"
),
"document_by_connector_credential_pair",
["connector_id", "credential_id"],
unique=False,
)
def downgrade() -> None:
op.drop_index(
op.f(
"ix_document_by_connector_credential_pair_pkey__connector_id__credential_id"
),
table_name="document_by_connector_credential_pair",
)

View File

@ -5,6 +5,7 @@ from uuid import UUID
from sqlalchemy import and_
from sqlalchemy import delete
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session
@ -53,6 +54,37 @@ def get_document_connector_cnts(
return db_session.execute(stmt).all() # type: ignore
def get_document_cnts_for_cc_pairs(
db_session: Session, cc_pair_identifiers: list[ConnectorCredentialPairIdentifier]
) -> Sequence[tuple[int, int, int]]:
stmt = (
select(
DocumentByConnectorCredentialPair.connector_id,
DocumentByConnectorCredentialPair.credential_id,
func.count(),
)
.where(
or_(
*[
and_(
DocumentByConnectorCredentialPair.connector_id
== cc_pair_identifier.connector_id,
DocumentByConnectorCredentialPair.credential_id
== cc_pair_identifier.credential_id,
)
for cc_pair_identifier in cc_pair_identifiers
]
)
)
.group_by(
DocumentByConnectorCredentialPair.connector_id,
DocumentByConnectorCredentialPair.credential_id,
)
)
return db_session.execute(stmt).all() # type: ignore
def get_acccess_info_for_documents(
db_session: Session,
document_ids: list[str],

View File

@ -50,6 +50,7 @@ from danswer.db.credentials import create_credential
from danswer.db.credentials import delete_google_drive_service_account_credentials
from danswer.db.credentials import fetch_credential_by_id
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
from danswer.db.document import get_document_cnts_for_cc_pairs
from danswer.db.engine import get_session
from danswer.db.feedback import fetch_docs_ranked_by_boost
from danswer.db.feedback import update_document_boost
@ -294,20 +295,31 @@ def get_connector_indexing_status(
# TODO: make this one query
cc_pairs = get_connector_credential_pairs(db_session)
cc_pair_identifiers = [
ConnectorCredentialPairIdentifier(
connector_id=cc_pair.connector_id, credential_id=cc_pair.credential_id
)
for cc_pair in cc_pairs
]
latest_index_attempts = get_latest_index_attempts(
db_session=db_session,
connector_credential_pair_identifiers=[
ConnectorCredentialPairIdentifier(
connector_id=cc_pair.connector_id, credential_id=cc_pair.credential_id
)
for cc_pair in cc_pairs
],
connector_credential_pair_identifiers=cc_pair_identifiers,
)
cc_pair_to_latest_index_attempt = {
(index_attempt.connector_id, index_attempt.credential_id): index_attempt
for index_attempt in latest_index_attempts
}
document_count_info = get_document_cnts_for_cc_pairs(
db_session=db_session,
cc_pair_identifiers=cc_pair_identifiers,
)
cc_pair_to_document_cnt = {
(connector_id, credential_id): cnt
for connector_id, credential_id, cnt in document_count_info
}
for cc_pair in cc_pairs:
connector = cc_pair.connector
credential = cc_pair.credential
@ -324,7 +336,9 @@ def get_connector_indexing_status(
owner=credential.user.email if credential.user else "",
last_status=cc_pair.last_attempt_status,
last_success=cc_pair.last_successful_index_time,
docs_indexed=cc_pair.total_docs_indexed,
docs_indexed=cc_pair_to_document_cnt.get(
(connector.id, credential.id), 0
),
error_msg=latest_index_attempt.error_msg
if latest_index_attempt
else None,