Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-09-19 12:03:54 +02:00)
Feature/background processing (#2275)
* first cut at redis
* some new helper functions for the db
* ignore kombu tables in alembic migrations (used by celery)
* multiline commands for readability, add vespa_metadata_sync queue to worker
* typo fix
* fix returning tuple fields
* add constants
* fix _get_access_for_document
* docstrings!
* fix double function declaration and typing
* fix type hinting
* add a global redis pool
* Add get_document function
* use task_logger in various celery tasks
* add celeryconfig.py to simplify configuration. Will be used in a subsequent commit
* Add celery redis helper. used in a subsequent PR
* kombu warning getting spammy since celery is not self managing its queue in Postgres any more
* add last_modified and last_synced to documents
* fix task naming convention
* use celeryconfig.py
* the big one. adds queues and tasks, updates functions to use the queues with priorities, etc
* change vespa index log line to debug
* mypy fixes
* update alembic migration
* fix fence ordering, rename to "monitor", fix fetch_versioned_implementation call
* mypy
* switch to monotonic time
* fix startup dependencies on redis
* rebase alembic migration
* kombu cleanup - fail silently
* mypy
* add redis_host environment override
* update REDIS_HOST env var in docker-compose.dev.yml
* update the rest of the docker files
* harden indexing-status endpoint against db changes happening in the background. Needs further improvement but OK for now.
* allow no task syncs to run because we create certain objects with no entries but initially marked as out of date
* add back writing to vespa on indexing
* update contributing guide
* backporting fixes from background_deletion
* renaming cache to cache_volume
* add redis password to various deployments
* try setting up pr testing for helm
* fix indent
* hopefully this release version actually exists
* fix command line option to --chart-dirs
* fetch-depth 0
* edit values.yaml
* try setting ct working directory
* bypass testing only on change for now
* move files and lint them
* update helm testing
* some issues suggest using --config works
* add vespa repo
* add postgresql repo
* increase timeout
* try amd64 runner
* fix redis password reference
* add comment to helm chart testing workflow
* rename helm testing workflow to disable it
* adding clarifying comments
* address code review
* missed a file
* remove commented warning ... just not needed

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
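One bullet above mentions a global Redis pool. As a rough illustration of that pattern (a minimal sketch, assuming the standard redis-py client; the config import path and the get_redis_client name are hypothetical, not taken from this commit):

import redis

from danswer.configs.app_configs import REDIS_HOST  # assumed setting names
from danswer.configs.app_configs import REDIS_PASSWORD

# A single process-wide pool lets Celery workers and API handlers reuse
# connections instead of opening a new one per call.
_REDIS_POOL = redis.ConnectionPool(
    host=REDIS_HOST,
    port=6379,
    password=REDIS_PASSWORD,
    max_connections=50,
)


def get_redis_client() -> redis.Redis:
    # Clients built from the shared pool are cheap to construct.
    return redis.Redis(connection_pool=_REDIS_POOL)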
@@ -11,6 +11,17 @@ from ee.danswer.db.user_group import fetch_user_groups_for_documents
from ee.danswer.db.user_group import fetch_user_groups_for_user


def _get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    id_to_access = _get_access_for_documents([document_id], db_session)
    if len(id_to_access) == 0:
        return DocumentAccess.build(user_ids=[], user_groups=[], is_public=False)

    return next(iter(id_to_access.values()))


def _get_access_for_documents(
    document_ids: list[str],
    db_session: Session,
@@ -1,28 +1,18 @@
from datetime import timedelta
from typing import Any

from celery.signals import beat_init
from celery.signals import worker_init
from sqlalchemy.orm import Session

from danswer.background.celery.celery_app import celery_app
from danswer.background.task_utils import build_celery_task_wrapper
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
from danswer.configs.constants import POSTGRES_CELERY_WORKER_APP_NAME
from danswer.db.chat import delete_chat_sessions_older_than
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.engine import init_sqlalchemy_engine
from danswer.server.settings.store import load_settings
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
from ee.danswer.background.celery_utils import should_perform_chat_ttl_check
from ee.danswer.background.celery_utils import should_sync_user_groups
from ee.danswer.background.task_name_builders import name_chat_ttl_task
from ee.danswer.background.task_name_builders import name_user_group_sync_task
from ee.danswer.db.user_group import fetch_user_groups
from ee.danswer.server.reporting.usage_export_generation import create_new_usage_report
from ee.danswer.user_groups.sync import sync_user_groups

logger = setup_logger()

@@ -30,17 +20,6 @@ logger = setup_logger()
global_version.set_ee()


@build_celery_task_wrapper(name_user_group_sync_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def sync_user_group_task(user_group_id: int) -> None:
    with Session(get_sqlalchemy_engine()) as db_session:
        # actual sync logic
        try:
            sync_user_groups(user_group_id=user_group_id, db_session=db_session)
        except Exception as e:
            logger.exception(f"Failed to sync user group - {e}")


@build_celery_task_wrapper(name_chat_ttl_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def perform_ttl_management_task(retention_limit_days: int) -> None:

@@ -51,8 +30,6 @@ def perform_ttl_management_task(retention_limit_days: int) -> None:
#####
# Periodic Tasks
#####


@celery_app.task(
    name="check_ttl_management_task",
    soft_time_limit=JOB_TIMEOUT,

@@ -69,24 +46,6 @@ def check_ttl_management_task() -> None:
    )


@celery_app.task(
    name="check_for_user_groups_sync_task",
    soft_time_limit=JOB_TIMEOUT,
)
def check_for_user_groups_sync_task() -> None:
    """Runs periodically to check if any user groups are out of sync.
    Creates a task to sync the user group if needed."""
    with Session(get_sqlalchemy_engine()) as db_session:
        # check if any user groups are not synced
        user_groups = fetch_user_groups(db_session=db_session, only_current=False)
        for user_group in user_groups:
            if should_sync_user_groups(user_group, db_session):
                logger.info(f"User Group {user_group.id} is not synced. Syncing now!")
                sync_user_group_task.apply_async(
                    kwargs=dict(user_group_id=user_group.id),
                )


@celery_app.task(
    name="autogenerate_usage_report_task",
    soft_time_limit=JOB_TIMEOUT,

@@ -101,25 +60,11 @@ def autogenerate_usage_report_task() -> None:
    )


@beat_init.connect
def on_beat_init(sender: Any, **kwargs: Any) -> None:
    init_sqlalchemy_engine(POSTGRES_CELERY_BEAT_APP_NAME)


@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
    init_sqlalchemy_engine(POSTGRES_CELERY_WORKER_APP_NAME)


#####
# Celery Beat (Periodic Tasks) Settings
#####
celery_app.conf.beat_schedule = {
    "check-for-user-group-sync": {
        "task": "check_for_user_groups_sync_task",
        "schedule": timedelta(seconds=5),
    },
-    "autogenerate_usage_report": {
+    "autogenerate-usage-report": {
        "task": "autogenerate_usage_report_task",
        "schedule": timedelta(days=30),  # TODO: change this to config flag
    },
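The commit message mentions a standalone celeryconfig.py and queues with priorities. A minimal sketch of what such a module could contain (broker URLs, the priority_steps value, and the task name below are assumptions; only the vespa_metadata_sync queue name comes from the commit message):

# celeryconfig.py (illustrative sketch, not from this diff)
broker_url = "redis://localhost:6379/0"  # assumed broker location
result_backend = "redis://localhost:6379/1"

# The Redis transport emulates priorities by sharding each queue into
# several lists, one per priority step.
broker_transport_options = {"priority_steps": list(range(10))}
task_default_priority = 5

# Route metadata sync work to its own queue so it cannot starve or be
# starved by other task types.
task_routes = {
    "vespa_metadata_sync_task": {"queue": "vespa_metadata_sync"},  # task name assumed
}

Celery would pick such a module up via celery_app.config_from_object() pointed at its import path.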
@@ -1,27 +1,13 @@
from sqlalchemy.orm import Session

from danswer.db.models import UserGroup
from danswer.db.tasks import check_task_is_live_and_not_timed_out
from danswer.db.tasks import get_latest_task
from danswer.utils.logger import setup_logger
from ee.danswer.background.task_name_builders import name_chat_ttl_task
from ee.danswer.background.task_name_builders import name_user_group_sync_task

logger = setup_logger()


def should_sync_user_groups(user_group: UserGroup, db_session: Session) -> bool:
    if user_group.is_up_to_date:
        return False
    task_name = name_user_group_sync_task(user_group.id)
    latest_sync = get_latest_task(task_name, db_session)

    if latest_sync and check_task_is_live_and_not_timed_out(latest_sync, db_session):
        logger.info("User group sync is already being performed. Skipping.")
        return False
    return True


def should_perform_chat_ttl_check(
    retention_limit_days: int | None, db_session: Session
) -> bool:
@@ -5,6 +5,7 @@ from uuid import UUID
from fastapi import HTTPException
from sqlalchemy import delete
from sqlalchemy import func
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.orm import Session

@@ -81,10 +82,25 @@ def fetch_user_group(db_session: Session, user_group_id: int) -> UserGroup | Non


def fetch_user_groups(
-    db_session: Session, only_current: bool = True
+    db_session: Session, only_up_to_date: bool = True
) -> Sequence[UserGroup]:
    """
    Fetches user groups from the database.

    This function retrieves a sequence of `UserGroup` objects from the database.
    If `only_up_to_date` is set to `True`, it filters the user groups to return only those
    that are marked as up-to-date (`is_up_to_date` is `True`).

    Args:
        db_session (Session): The SQLAlchemy session used to query the database.
        only_up_to_date (bool, optional): Flag to determine whether to filter the results
            to include only up-to-date user groups. Defaults to `True`.

    Returns:
        Sequence[UserGroup]: A sequence of `UserGroup` objects matching the query criteria.
    """
    stmt = select(UserGroup)
-    if only_current:
+    if only_up_to_date:
        stmt = stmt.where(UserGroup.is_up_to_date == True)  # noqa: E712
    return db_session.scalars(stmt).all()

@@ -103,6 +119,42 @@ def fetch_user_groups_for_user(
    return db_session.scalars(stmt).all()


def construct_document_select_by_usergroup(
    user_group_id: int,
) -> Select:
    """This returns a statement that should be executed using
    .yield_per() to minimize overhead. The primary consumers of this function
    are background processing task generators."""
    stmt = (
        select(Document)
        .join(
            DocumentByConnectorCredentialPair,
            Document.id == DocumentByConnectorCredentialPair.id,
        )
        .join(
            ConnectorCredentialPair,
            and_(
                DocumentByConnectorCredentialPair.connector_id
                == ConnectorCredentialPair.connector_id,
                DocumentByConnectorCredentialPair.credential_id
                == ConnectorCredentialPair.credential_id,
            ),
        )
        .join(
            UserGroup__ConnectorCredentialPair,
            UserGroup__ConnectorCredentialPair.cc_pair_id == ConnectorCredentialPair.id,
        )
        .join(
            UserGroup,
            UserGroup__ConnectorCredentialPair.user_group_id == UserGroup.id,
        )
        .where(UserGroup.id == user_group_id)
        .order_by(Document.id)
    )
    stmt = stmt.distinct()
    return stmt
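The docstring above prescribes yield_per for consumers of this statement. A hypothetical consumer (not part of this diff) could stream the results in fixed-size batches like so:

# Illustrative only: stream documents for a group without loading the
# whole result set into memory; the batch size of 100 is an arbitrary choice.
stmt = construct_document_select_by_usergroup(user_group_id=1)
for document in db_session.scalars(stmt.execution_options(yield_per=100)):
    handle(document)  # hypothetical per-document work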

def fetch_documents_for_user_group_paginated(
    db_session: Session,
    user_group_id: int,

@@ -361,6 +413,10 @@ def update_user_group(
    user_group_id: int,
    user_group_update: UserGroupUpdate,
) -> UserGroup:
    """If successful, this can set db_user_group.is_up_to_date = False.
    That will be processed by check_for_vespa_user_groups_sync_task and trigger
    a long-running background sync to Vespa.
    """
    stmt = select(UserGroup).where(UserGroup.id == user_group_id)
    db_user_group = db_session.scalar(stmt)
    if db_user_group is None:
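In outline, the handoff the docstring describes looks like this (the flag name is quoted from the docstring; the surrounding update logic is elided by the hunk, so this is a sketch, not the commit's code):

# After applying the user's changes to the group:
db_user_group.is_up_to_date = False  # mark the group as needing a Vespa sync
db_session.commit()
# The beat-scheduled checker sees is_up_to_date == False on its next pass
# and enqueues the long-running sync task for this group.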
@@ -32,7 +32,7 @@ def list_user_groups(
    db_session: Session = Depends(get_session),
) -> list[UserGroup]:
    if user is None or user.role == UserRole.ADMIN:
-        user_groups = fetch_user_groups(db_session, only_current=False)
+        user_groups = fetch_user_groups(db_session, only_up_to_date=False)
    else:
        user_groups = fetch_user_groups_for_user(
            db_session=db_session,
@@ -1,87 +0,0 @@
from sqlalchemy.orm import Session

from danswer.access.access import get_access_for_documents
from danswer.db.document import prepare_to_modify_documents
from danswer.db.search_settings import get_current_search_settings
from danswer.db.search_settings import get_secondary_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import UpdateRequest
from danswer.utils.logger import setup_logger
from ee.danswer.db.user_group import delete_user_group
from ee.danswer.db.user_group import fetch_documents_for_user_group_paginated
from ee.danswer.db.user_group import fetch_user_group
from ee.danswer.db.user_group import mark_user_group_as_synced

logger = setup_logger()

_SYNC_BATCH_SIZE = 100


def _sync_user_group_batch(
    document_ids: list[str], document_index: DocumentIndex, db_session: Session
) -> None:
    logger.debug(f"Syncing user group access for: {document_ids}")

    # Acquires a lock on the documents so that no other process can modify them
    with prepare_to_modify_documents(db_session=db_session, document_ids=document_ids):
        # get the current access state for these documents
        document_id_to_access = get_access_for_documents(
            document_ids=document_ids, db_session=db_session
        )

        # update Vespa
        document_index.update(
            update_requests=[
                UpdateRequest(
                    document_ids=[document_id],
                    access=document_id_to_access[document_id],
                )
                for document_id in document_ids
            ]
        )

        # Finish the transaction and release the locks
        db_session.commit()


def sync_user_groups(user_group_id: int, db_session: Session) -> None:
    """Syncs the Postgres state of the specified user group out to Vespa."""
    search_settings = get_current_search_settings(db_session)
    secondary_search_settings = get_secondary_search_settings(db_session)

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name,
        secondary_index_name=secondary_search_settings.index_name
        if secondary_search_settings
        else None,
    )

    user_group = fetch_user_group(db_session=db_session, user_group_id=user_group_id)
    if user_group is None:
        raise ValueError(f"User group '{user_group_id}' does not exist")

    cursor = None
    while True:
        # NOTE: this may miss some documents, but that is okay. Any new documents added
        # will be added with the correct group membership
        document_batch, cursor = fetch_documents_for_user_group_paginated(
            db_session=db_session,
            user_group_id=user_group_id,
            last_document_id=cursor,
            limit=_SYNC_BATCH_SIZE,
        )

        _sync_user_group_batch(
            document_ids=[document.id for document in document_batch],
            document_index=document_index,
            db_session=db_session,
        )

        if cursor is None:
            break

    if user_group.is_up_for_deletion:
        delete_user_group(db_session=db_session, user_group=user_group)
    else:
        mark_user_group_as_synced(db_session=db_session, user_group=user_group)