Feature/background processing (#2275)

* first cut at redis

* some new helper functions for the db

* ignore kombu tables in alembic migrations (used by celery)
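
  (A hedged sketch of how this kind of filter is usually wired, via the
  include_object hook in alembic's env.py; the kombu table names and the
  surrounding variables are assumptions, not the actual migration code:)

    # in alembic's env.py
    from alembic import context

    EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}  # assumed names

    def include_object(
        object: object, name: str, type_: str, reflected: bool, compare_to: object
    ) -> bool:
        # keep autogenerate from trying to manage celery/kombu's own tables
        if type_ == "table" and name in EXCLUDE_TABLES:
            return False
        return True

    context.configure(
        connection=connection,  # provided by the surrounding env.py
        target_metadata=target_metadata,
        include_object=include_object,
    )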

* multiline commands for readability, add vespa_metadata_sync queue to worker

* typo fix

* fix returning tuple fields

* add constants

* fix _get_access_for_document

* docstrings!

* fix double function declaration and typing

* fix type hinting

* add a global redis pool
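
  (A minimal sketch of a shared pool with redis-py; names and settings are
  illustrative, not the actual implementation:)

    import redis

    REDIS_HOST = "localhost"
    REDIS_PORT = 6379

    _redis_pool = redis.ConnectionPool(
        host=REDIS_HOST, port=REDIS_PORT, max_connections=50
    )

    def get_redis_client() -> redis.Redis:
        # callers share one pool instead of opening a connection per use
        return redis.Redis(connection_pool=_redis_pool)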

* Add get_document function

* use task_logger in various celery tasks

* add celeryconfig.py to simplify configuration. Will be used in a subsequent commit
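
  (Celery can load such a module via config_from_object; a hedged example of
  the kind of settings it might hold, where the values and module path are
  assumptions:)

    # celeryconfig.py
    broker_url = "redis://localhost:6379/0"
    result_backend = "redis://localhost:6379/1"
    broker_connection_retry_on_startup = True
    task_default_priority = 5
    broker_transport_options = {
        "priority_steps": list(range(10)),
        "queue_order_strategy": "priority",
    }

    # then, in the celery app module:
    # celery_app.config_from_object("danswer.background.celery.celeryconfig")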

* Add celery redis helper. Used in a subsequent PR

* kombu warning is getting spammy since celery is no longer self-managing its queue in Postgres

* add last_modified and last_synced to documents
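
  (Sketch of the idea in SQLAlchemy 2.0 style; the column options are
  assumptions. Presumably a document needs a Vespa sync when last_synced is
  NULL or older than last_modified:)

    from datetime import datetime
    from sqlalchemy import DateTime, func
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

    class Base(DeclarativeBase):
        pass

    class Document(Base):
        __tablename__ = "document"
        id: Mapped[str] = mapped_column(primary_key=True)
        # when the document last changed in Postgres
        last_modified: Mapped[datetime | None] = mapped_column(
            DateTime(timezone=True), index=True, default=func.now()
        )
        # when that change was last pushed to Vespa; NULL means never synced
        last_synced: Mapped[datetime | None] = mapped_column(
            DateTime(timezone=True), index=True
        )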

* fix task naming convention

* use celeryconfig.py

* the big one: adds queues and tasks, updates functions to use the queues with priorities, etc.
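
  (A hedged sketch of queue-plus-priority dispatch in celery; the task and
  queue names here are illustrative only:)

    from celery import Celery

    celery_app = Celery("danswer", broker="redis://localhost:6379/0")

    @celery_app.task(name="vespa_metadata_sync_task")
    def vespa_metadata_sync_task(document_id: str) -> None:
        ...  # push updated metadata for one document to Vespa

    # route the task to its own queue
    celery_app.conf.task_routes = {
        "vespa_metadata_sync_task": {"queue": "vespa_metadata_sync"},
    }

    # enqueue with an explicit priority; on the Redis transport, lower
    # numbers are higher priority by default
    vespa_metadata_sync_task.apply_async(
        kwargs={"document_id": "doc-1"}, priority=1
    )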

* change vespa index log line to debug

* mypy fixes

* update alembic migration

* fix fence ordering, rename to "monitor", fix fetch_versioned_implementation call

* mypy

* switch to monotonic time
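
  (The usual motivation: time.time() can jump when the wall clock is adjusted,
  while time.monotonic() cannot. A toy example of the pattern:)

    import time

    start = time.monotonic()
    # ... do some work ...
    if time.monotonic() - start > 300:
        # a timeout check that survives NTP or DST clock adjustments
        raise TimeoutError("task exceeded its time budget")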

* fix startup dependencies on redis

* rebase alembic migration

* kombu cleanup - fail silently
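
  (Roughly the shape of a fail-silent cleanup; the table names are assumed
  from kombu's SQLAlchemy transport and the helper itself is hypothetical:)

    from sqlalchemy import text
    from sqlalchemy.orm import Session

    def cleanup_kombu_tables(db_session: Session) -> None:
        for table in ("kombu_message", "kombu_queue"):
            try:
                db_session.execute(text(f"DROP TABLE IF EXISTS {table}"))
                db_session.commit()
            except Exception:
                # fail silently; the tables may simply not exist
                db_session.rollback()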

* mypy

* add redis_host environment override
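
  (Presumably an os.environ fallback of this shape; the names other than
  REDIS_HOST are assumptions:)

    import os

    REDIS_HOST = os.environ.get("REDIS_HOST") or "localhost"
    REDIS_PORT = int(os.environ.get("REDIS_PORT") or 6379)
    REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD") or ""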

* update REDIS_HOST env var in docker-compose.dev.yml

* update the rest of the docker files

* harden indexing-status endpoint against db changes happening in the background. Needs further improvement but OK for now.

* allow syncs with no tasks to run, because we create certain objects with no entries that are initially marked as out of date

* add back writing to vespa on indexing

* update contributing guide

* backporting fixes from background_deletion

* renaming cache to cache_volume

* add redis password to various deployments

* try setting up pr testing for helm

* fix indent

* hopefully this release version actually exists

* fix command line option to --chart-dirs

* fetch-depth 0

* edit values.yaml

* try setting ct working directory

* bypass testing only on change for now

* move files and lint them

* update helm testing

* some issues suggest using --config works

* add vespa repo

* add postgresql repo

* increase timeout

* try amd64 runner

* fix redis password reference

* add comment to helm chart testing workflow

* rename helm testing workflow to disable it

* adding clarifying comments

* address code review

* missed a file

* remove commented warning ... just not needed

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
rkuo-danswer committed via GitHub on 2024-09-10 09:28:19 -07:00
parent b7ad810d83 · commit f1c5e80f17
26 changed files with 1428 additions and 350 deletions

View File

@@ -11,6 +11,17 @@ from ee.danswer.db.user_group import fetch_user_groups_for_documents
from ee.danswer.db.user_group import fetch_user_groups_for_user


def _get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    id_to_access = _get_access_for_documents([document_id], db_session)
    if len(id_to_access) == 0:
        return DocumentAccess.build(user_ids=[], user_groups=[], is_public=False)

    return next(iter(id_to_access.values()))


def _get_access_for_documents(
    document_ids: list[str],
    db_session: Session,

View File

@@ -1,28 +1,18 @@
from datetime import timedelta
from typing import Any

from celery.signals import beat_init
from celery.signals import worker_init
from sqlalchemy.orm import Session

from danswer.background.celery.celery_app import celery_app
from danswer.background.task_utils import build_celery_task_wrapper
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
from danswer.configs.constants import POSTGRES_CELERY_WORKER_APP_NAME
from danswer.db.chat import delete_chat_sessions_older_than
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.engine import init_sqlalchemy_engine
from danswer.server.settings.store import load_settings
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
from ee.danswer.background.celery_utils import should_perform_chat_ttl_check
from ee.danswer.background.celery_utils import should_sync_user_groups
from ee.danswer.background.task_name_builders import name_chat_ttl_task
from ee.danswer.background.task_name_builders import name_user_group_sync_task
from ee.danswer.db.user_group import fetch_user_groups
from ee.danswer.server.reporting.usage_export_generation import create_new_usage_report
from ee.danswer.user_groups.sync import sync_user_groups

logger = setup_logger()

@@ -30,17 +20,6 @@ logger = setup_logger()
global_version.set_ee()


@build_celery_task_wrapper(name_user_group_sync_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def sync_user_group_task(user_group_id: int) -> None:
    with Session(get_sqlalchemy_engine()) as db_session:
        # actual sync logic
        try:
            sync_user_groups(user_group_id=user_group_id, db_session=db_session)
        except Exception as e:
            logger.exception(f"Failed to sync user group - {e}")


@build_celery_task_wrapper(name_chat_ttl_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def perform_ttl_management_task(retention_limit_days: int) -> None:

@@ -51,8 +30,6 @@ def perform_ttl_management_task(retention_limit_days: int) -> None:
#####
# Periodic Tasks
#####
@celery_app.task(
    name="check_ttl_management_task",
    soft_time_limit=JOB_TIMEOUT,

@@ -69,24 +46,6 @@ def check_ttl_management_task() -> None:
    )


@celery_app.task(
    name="check_for_user_groups_sync_task",
    soft_time_limit=JOB_TIMEOUT,
)
def check_for_user_groups_sync_task() -> None:
    """Runs periodically to check if any user groups are out of sync
    Creates a task to sync the user group if needed"""
    with Session(get_sqlalchemy_engine()) as db_session:
        # check if any document sets are not synced
        user_groups = fetch_user_groups(db_session=db_session, only_current=False)
        for user_group in user_groups:
            if should_sync_user_groups(user_group, db_session):
                logger.info(f"User Group {user_group.id} is not synced. Syncing now!")
                sync_user_group_task.apply_async(
                    kwargs=dict(user_group_id=user_group.id),
                )


@celery_app.task(
    name="autogenerate_usage_report_task",
    soft_time_limit=JOB_TIMEOUT,

@@ -101,25 +60,11 @@ def autogenerate_usage_report_task() -> None:
    )


@beat_init.connect
def on_beat_init(sender: Any, **kwargs: Any) -> None:
    init_sqlalchemy_engine(POSTGRES_CELERY_BEAT_APP_NAME)


@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
    init_sqlalchemy_engine(POSTGRES_CELERY_WORKER_APP_NAME)


#####
# Celery Beat (Periodic Tasks) Settings
#####
celery_app.conf.beat_schedule = {
    "check-for-user-group-sync": {
        "task": "check_for_user_groups_sync_task",
        "schedule": timedelta(seconds=5),
    },
-    "autogenerate_usage_report": {
+    "autogenerate-usage-report": {
        "task": "autogenerate_usage_report_task",
        "schedule": timedelta(days=30),  # TODO: change this to config flag
    },

View File

@@ -1,27 +1,13 @@
from sqlalchemy.orm import Session

from danswer.db.models import UserGroup
from danswer.db.tasks import check_task_is_live_and_not_timed_out
from danswer.db.tasks import get_latest_task
from danswer.utils.logger import setup_logger
from ee.danswer.background.task_name_builders import name_chat_ttl_task
from ee.danswer.background.task_name_builders import name_user_group_sync_task

logger = setup_logger()


def should_sync_user_groups(user_group: UserGroup, db_session: Session) -> bool:
    if user_group.is_up_to_date:
        return False

    task_name = name_user_group_sync_task(user_group.id)
    latest_sync = get_latest_task(task_name, db_session)

    if latest_sync and check_task_is_live_and_not_timed_out(latest_sync, db_session):
        logger.info("TTL check is already being performed. Skipping.")
        return False

    return True


def should_perform_chat_ttl_check(
    retention_limit_days: int | None, db_session: Session
) -> bool:

View File

@@ -5,6 +5,7 @@ from uuid import UUID
from fastapi import HTTPException
from sqlalchemy import delete
from sqlalchemy import func
+from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.orm import Session

@@ -81,10 +82,25 @@ def fetch_user_group(db_session: Session, user_group_id: int) -> UserGroup | None
def fetch_user_groups(
-    db_session: Session, only_current: bool = True
+    db_session: Session, only_up_to_date: bool = True
) -> Sequence[UserGroup]:
    """
    Fetches user groups from the database.

    This function retrieves a sequence of `UserGroup` objects from the database.
    If `only_up_to_date` is set to `True`, it filters the user groups to return only those
    that are marked as up-to-date (`is_up_to_date` is `True`).

    Args:
        db_session (Session): The SQLAlchemy session used to query the database.
        only_up_to_date (bool, optional): Flag to determine whether to filter the results
            to include only up to date user groups. Defaults to `True`.

    Returns:
        Sequence[UserGroup]: A sequence of `UserGroup` objects matching the query criteria.
    """
    stmt = select(UserGroup)
-    if only_current:
+    if only_up_to_date:
        stmt = stmt.where(UserGroup.is_up_to_date == True)  # noqa: E712
    return db_session.scalars(stmt).all()

@@ -103,6 +119,42 @@ def fetch_user_groups_for_user(
    return db_session.scalars(stmt).all()


def construct_document_select_by_usergroup(
    user_group_id: int,
) -> Select:
    """This returns a statement that should be executed using
    .yield_per() to minimize overhead. The primary consumers of this function
    are background processing task generators."""
    stmt = (
        select(Document)
        .join(
            DocumentByConnectorCredentialPair,
            Document.id == DocumentByConnectorCredentialPair.id,
        )
        .join(
            ConnectorCredentialPair,
            and_(
                DocumentByConnectorCredentialPair.connector_id
                == ConnectorCredentialPair.connector_id,
                DocumentByConnectorCredentialPair.credential_id
                == ConnectorCredentialPair.credential_id,
            ),
        )
        .join(
            UserGroup__ConnectorCredentialPair,
            UserGroup__ConnectorCredentialPair.cc_pair_id == ConnectorCredentialPair.id,
        )
        .join(
            UserGroup,
            UserGroup__ConnectorCredentialPair.user_group_id == UserGroup.id,
        )
        .where(UserGroup.id == user_group_id)
        .order_by(Document.id)
    )
    stmt = stmt.distinct()
    return stmt


def fetch_documents_for_user_group_paginated(
    db_session: Session,
    user_group_id: int,

@@ -361,6 +413,10 @@ def update_user_group(
    user_group_id: int,
    user_group_update: UserGroupUpdate,
) -> UserGroup:
    """If successful, this can set db_user_group.is_up_to_date = False.
    That will be processed by check_for_vespa_user_groups_sync_task and trigger
    a long running background sync to Vespa.
    """
    stmt = select(UserGroup).where(UserGroup.id == user_group_id)
    db_user_group = db_session.scalar(stmt)
    if db_user_group is None:
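
(The construct_document_select_by_usergroup docstring above says the statement
should be executed with yield_per; a minimal consumption sketch, where the
wrapper function and batch size are illustrative rather than part of the diff:)

    from collections.abc import Iterator
    from sqlalchemy.orm import Session

    def iter_user_group_document_ids(
        db_session: Session, user_group_id: int
    ) -> Iterator[str]:
        stmt = construct_document_select_by_usergroup(user_group_id)
        # stream rows in server-side batches instead of loading them all
        stmt = stmt.execution_options(yield_per=64)
        for document in db_session.scalars(stmt):
            yield document.id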

View File

@@ -32,7 +32,7 @@ def list_user_groups(
    db_session: Session = Depends(get_session),
) -> list[UserGroup]:
    if user is None or user.role == UserRole.ADMIN:
-        user_groups = fetch_user_groups(db_session, only_current=False)
+        user_groups = fetch_user_groups(db_session, only_up_to_date=False)
    else:
        user_groups = fetch_user_groups_for_user(
            db_session=db_session,

View File

@@ -1,87 +0,0 @@
from sqlalchemy.orm import Session

from danswer.access.access import get_access_for_documents
from danswer.db.document import prepare_to_modify_documents
from danswer.db.search_settings import get_current_search_settings
from danswer.db.search_settings import get_secondary_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import UpdateRequest
from danswer.utils.logger import setup_logger
from ee.danswer.db.user_group import delete_user_group
from ee.danswer.db.user_group import fetch_documents_for_user_group_paginated
from ee.danswer.db.user_group import fetch_user_group
from ee.danswer.db.user_group import mark_user_group_as_synced

logger = setup_logger()

_SYNC_BATCH_SIZE = 100


def _sync_user_group_batch(
    document_ids: list[str], document_index: DocumentIndex, db_session: Session
) -> None:
    logger.debug(f"Syncing document sets for: {document_ids}")

    # Acquires a lock on the documents so that no other process can modify them
    with prepare_to_modify_documents(db_session=db_session, document_ids=document_ids):
        # get current state of document sets for these documents
        document_id_to_access = get_access_for_documents(
            document_ids=document_ids, db_session=db_session
        )

        # update Vespa
        document_index.update(
            update_requests=[
                UpdateRequest(
                    document_ids=[document_id],
                    access=document_id_to_access[document_id],
                )
                for document_id in document_ids
            ]
        )

        # Finish the transaction and release the locks
        db_session.commit()


def sync_user_groups(user_group_id: int, db_session: Session) -> None:
    """Sync the status of Postgres for the specified user group"""
    search_settings = get_current_search_settings(db_session)
    secondary_search_settings = get_secondary_search_settings(db_session)

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name,
        secondary_index_name=secondary_search_settings.index_name
        if secondary_search_settings
        else None,
    )

    user_group = fetch_user_group(db_session=db_session, user_group_id=user_group_id)
    if user_group is None:
        raise ValueError(f"User group '{user_group_id}' does not exist")

    cursor = None
    while True:
        # NOTE: this may miss some documents, but that is okay. Any new documents added
        # will be added with the correct group membership
        document_batch, cursor = fetch_documents_for_user_group_paginated(
            db_session=db_session,
            user_group_id=user_group_id,
            last_document_id=cursor,
            limit=_SYNC_BATCH_SIZE,
        )

        _sync_user_group_batch(
            document_ids=[document.id for document in document_batch],
            document_index=document_index,
            db_session=db_session,
        )

        if cursor is None:
            break

    if user_group.is_up_for_deletion:
        delete_user_group(db_session=db_session, user_group=user_group)
    else:
        mark_user_group_as_synced(db_session=db_session, user_group=user_group)