mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-08-03 21:52:29 +02:00
Fix document counts (#3671)
* Various fixes/improvements to document counting * Add new column + index * Avoid double scan * comment fixes * Fix revision history * Fix IT * Fix IT * Fix migration * Rebase
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import contextlib
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
@@ -13,6 +14,7 @@ from sqlalchemy import or_
|
||||
from sqlalchemy import Select
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import tuple_
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from sqlalchemy.engine.util import TransactionalContext
|
||||
from sqlalchemy.exc import OperationalError
|
||||
@@ -226,10 +228,13 @@ def get_document_counts_for_cc_pairs(
|
||||
func.count(),
|
||||
)
|
||||
.where(
|
||||
tuple_(
|
||||
DocumentByConnectorCredentialPair.connector_id,
|
||||
DocumentByConnectorCredentialPair.credential_id,
|
||||
).in_(cc_ids)
|
||||
and_(
|
||||
tuple_(
|
||||
DocumentByConnectorCredentialPair.connector_id,
|
||||
DocumentByConnectorCredentialPair.credential_id,
|
||||
).in_(cc_ids),
|
||||
DocumentByConnectorCredentialPair.has_been_indexed.is_(True),
|
||||
)
|
||||
)
|
||||
.group_by(
|
||||
DocumentByConnectorCredentialPair.connector_id,
|
||||
@@ -382,18 +387,40 @@ def upsert_document_by_connector_credential_pair(
|
||||
id=doc_id,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
has_been_indexed=False,
|
||||
)
|
||||
)
|
||||
for doc_id in document_ids
|
||||
]
|
||||
)
|
||||
# for now, there are no columns to update. If more metadata is added, then this
|
||||
# needs to change to an `on_conflict_do_update`
|
||||
# this must be `on_conflict_do_nothing` rather than `on_conflict_do_update`
|
||||
# since we don't want to update the `has_been_indexed` field for documents
|
||||
# that already exist
|
||||
on_conflict_stmt = insert_stmt.on_conflict_do_nothing()
|
||||
db_session.execute(on_conflict_stmt)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def mark_document_as_indexed_for_cc_pair__no_commit(
|
||||
db_session: Session,
|
||||
connector_id: int,
|
||||
credential_id: int,
|
||||
document_ids: Iterable[str],
|
||||
) -> None:
|
||||
"""Should be called only after a successful index operation for a batch."""
|
||||
db_session.execute(
|
||||
update(DocumentByConnectorCredentialPair)
|
||||
.where(
|
||||
and_(
|
||||
DocumentByConnectorCredentialPair.connector_id == connector_id,
|
||||
DocumentByConnectorCredentialPair.credential_id == credential_id,
|
||||
DocumentByConnectorCredentialPair.id.in_(document_ids),
|
||||
)
|
||||
)
|
||||
.values(has_been_indexed=True)
|
||||
)
|
||||
|
||||
|
||||
def update_docs_updated_at__no_commit(
|
||||
ids_to_new_updated_at: dict[str, datetime],
|
||||
db_session: Session,
|
||||
|
Reference in New Issue
Block a user