@@ -1,9 +1,10 @@
import multiprocessing
import os
import sys
import time
import traceback
from datetime import datetime
from datetime import timezone
from enum import Enum
from http import HTTPStatus
from time import sleep
from typing import Any
@@ -15,6 +16,7 @@ from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from celery.result import AsyncResult
from celery.states import READY_STATES
from pydantic import BaseModel
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
@@ -26,7 +28,9 @@ from onyx.background.celery.tasks.indexing.utils import get_unfenced_index_attem
from onyx.background.celery.tasks.indexing.utils import IndexingCallback
from onyx.background.celery.tasks.indexing.utils import try_creating_indexing_task
from onyx.background.celery.tasks.indexing.utils import validate_indexing_fences
from onyx.background.indexing.job_client import SimpleJob
from onyx.background.indexing.job_client import SimpleJobClient
from onyx.background.indexing.job_client import SimpleJobException
from onyx.background.indexing.run_indexing import run_indexing_entrypoint
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
@@ -70,6 +74,123 @@ from shared_configs.configs import SENTRY_DSN
logger = setup_logger()


class IndexingWatchdogTerminalStatus(str, Enum):
    """The different statuses the watchdog can finish with.

    TODO: create broader success/failure/abort categories
    """

    UNDEFINED = "undefined"

    SUCCEEDED = "succeeded"

    SPAWN_FAILED = "spawn_failed"  # connector spawn failed

    BLOCKED_BY_DELETION = "blocked_by_deletion"
    BLOCKED_BY_STOP_SIGNAL = "blocked_by_stop_signal"
    FENCE_NOT_FOUND = "fence_not_found"  # fence does not exist
    FENCE_READINESS_TIMEOUT = (
        "fence_readiness_timeout"  # fence exists but wasn't ready within the timeout
    )
    FENCE_MISMATCH = "fence_mismatch"  # task and fence metadata mismatch
    TASK_ALREADY_RUNNING = "task_already_running"  # task appears to be running already
    INDEX_ATTEMPT_MISMATCH = (
        "index_attempt_mismatch"  # expected index attempt metadata not found in db
    )

    CONNECTOR_EXCEPTIONED = "connector_exceptioned"  # the connector itself exceptioned
    WATCHDOG_EXCEPTIONED = "watchdog_exceptioned"  # the watchdog exceptioned

    # the watchdog received a termination signal
    TERMINATED_BY_SIGNAL = "terminated_by_signal"

    # the watchdog terminated the task due to no activity
    TERMINATED_BY_ACTIVITY_TIMEOUT = "terminated_by_activity_timeout"

    OUT_OF_MEMORY = "out_of_memory"

    PROCESS_SIGNAL_SIGKILL = "process_signal_sigkill"

    @property
    def code(self) -> int:
        _ENUM_TO_CODE: dict[IndexingWatchdogTerminalStatus, int] = {
            IndexingWatchdogTerminalStatus.PROCESS_SIGNAL_SIGKILL: -9,
            IndexingWatchdogTerminalStatus.OUT_OF_MEMORY: 137,
            IndexingWatchdogTerminalStatus.BLOCKED_BY_DELETION: 248,
            IndexingWatchdogTerminalStatus.BLOCKED_BY_STOP_SIGNAL: 249,
            IndexingWatchdogTerminalStatus.FENCE_NOT_FOUND: 250,
            IndexingWatchdogTerminalStatus.FENCE_READINESS_TIMEOUT: 251,
            IndexingWatchdogTerminalStatus.FENCE_MISMATCH: 252,
            IndexingWatchdogTerminalStatus.TASK_ALREADY_RUNNING: 253,
            IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH: 254,
            IndexingWatchdogTerminalStatus.CONNECTOR_EXCEPTIONED: 255,
        }

        return _ENUM_TO_CODE[self]

    @classmethod
    def from_code(cls, code: int) -> "IndexingWatchdogTerminalStatus":
        _CODE_TO_ENUM: dict[int, IndexingWatchdogTerminalStatus] = {
            -9: IndexingWatchdogTerminalStatus.PROCESS_SIGNAL_SIGKILL,
            248: IndexingWatchdogTerminalStatus.BLOCKED_BY_DELETION,
            249: IndexingWatchdogTerminalStatus.BLOCKED_BY_STOP_SIGNAL,
            250: IndexingWatchdogTerminalStatus.FENCE_NOT_FOUND,
            251: IndexingWatchdogTerminalStatus.FENCE_READINESS_TIMEOUT,
            252: IndexingWatchdogTerminalStatus.FENCE_MISMATCH,
            253: IndexingWatchdogTerminalStatus.TASK_ALREADY_RUNNING,
            254: IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH,
            255: IndexingWatchdogTerminalStatus.CONNECTOR_EXCEPTIONED,
        }

        if code in _CODE_TO_ENUM:
            return _CODE_TO_ENUM[code]

        return IndexingWatchdogTerminalStatus.UNDEFINED
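
    # Illustrative sketch (editor note, not part of the patch): how the two mappings above
    # are intended to round-trip an exit code from the spawned process. Values come from
    # _CODE_TO_ENUM as shown in this hunk; any unmapped code falls back to UNDEFINED.
    #
    #   status = IndexingWatchdogTerminalStatus.from_code(250)
    #   assert status == IndexingWatchdogTerminalStatus.FENCE_NOT_FOUND
    #   assert status.code == 250
    #   assert IndexingWatchdogTerminalStatus.from_code(1) == IndexingWatchdogTerminalStatus.UNDEFINED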


class SimpleJobResult:
    """The data we want to have when the watchdog finishes"""

    def __init__(self) -> None:
        self.status = IndexingWatchdogTerminalStatus.UNDEFINED
        self.connector_source = None
        self.exit_code = None
        self.exception_str = None

    status: IndexingWatchdogTerminalStatus
    connector_source: str | None
    exit_code: int | None
    exception_str: str | None


class ConnectorIndexingContext(BaseModel):
    tenant_id: str | None
    cc_pair_id: int
    search_settings_id: int
    index_attempt_id: int


class ConnectorIndexingLogBuilder:
    def __init__(self, ctx: ConnectorIndexingContext):
        self.ctx = ctx

    def build(self, msg: str, **kwargs: Any) -> str:
        msg_final = (
            f"{msg}: "
            f"tenant_id={self.ctx.tenant_id} "
            f"attempt={self.ctx.index_attempt_id} "
            f"cc_pair={self.ctx.cc_pair_id} "
            f"search_settings={self.ctx.search_settings_id}"
        )

        # Append extra keyword arguments in logfmt style
        if kwargs:
            extra_logfmt = " ".join(f"{key}={value}" for key, value in kwargs.items())
            msg_final = f"{msg_final} {extra_logfmt}"

        return msg_final
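
    # Illustrative usage (editor note, not part of the patch): with a hypothetical context
    # of tenant_id="tenant1", index_attempt_id=4, cc_pair_id=2, search_settings_id=3,
    #
    #   ConnectorIndexingLogBuilder(ctx).build("Indexing watchdog - starting", elapsed="1.00s")
    #
    # returns the logfmt-style string:
    #   "Indexing watchdog - starting: tenant_id=tenant1 attempt=4 cc_pair=2 search_settings=3 elapsed=1.00s"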


def monitor_ccpair_indexing_taskset(
    tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
@@ -496,7 +617,6 @@ def connector_indexing_task(
        f"search_settings={search_settings_id}"
    )

    attempt_found = False
    n_final_progress: int | None = None

    # 20 is the documented default for httpx max_keepalive_connections
@@ -513,19 +633,21 @@ def connector_indexing_task(
    r = get_redis_client(tenant_id=tenant_id)

    if redis_connector.delete.fenced:
        raise RuntimeError(
        raise SimpleJobException(
            f"Indexing will not start because connector deletion is in progress: "
            f"attempt={index_attempt_id} "
            f"cc_pair={cc_pair_id} "
            f"fence={redis_connector.delete.fence_key}"
            f"fence={redis_connector.delete.fence_key}",
            code=IndexingWatchdogTerminalStatus.BLOCKED_BY_DELETION.code,
        )

    if redis_connector.stop.fenced:
        raise RuntimeError(
        raise SimpleJobException(
            f"Indexing will not start because a connector stop signal was detected: "
            f"attempt={index_attempt_id} "
            f"cc_pair={cc_pair_id} "
            f"fence={redis_connector.stop.fence_key}"
            f"fence={redis_connector.stop.fence_key}",
            code=IndexingWatchdogTerminalStatus.BLOCKED_BY_STOP_SIGNAL.code,
        )
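
    # Editor note (assumption, not stated in this hunk): SimpleJobException presumably
    # surfaces its `code` as the spawned process's exit code, which the watchdog can later
    # map back to a terminal status via IndexingWatchdogTerminalStatus.from_code().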

    # this wait is needed to avoid a race condition where
@@ -534,19 +656,24 @@ def connector_indexing_task(
    start = time.monotonic()
    while True:
        if time.monotonic() - start > CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT:
            raise ValueError(
            raise SimpleJobException(
                f"connector_indexing_task - timed out waiting for fence to be ready: "
                f"fence={redis_connector.permissions.fence_key}"
                f"fence={redis_connector.permissions.fence_key}",
                code=IndexingWatchdogTerminalStatus.FENCE_READINESS_TIMEOUT.code,
            )

        if not redis_connector_index.fenced:  # The fence must exist
            raise ValueError(
                f"connector_indexing_task - fence not found: fence={redis_connector_index.fence_key}"
            raise SimpleJobException(
                f"connector_indexing_task - fence not found: fence={redis_connector_index.fence_key}",
                code=IndexingWatchdogTerminalStatus.FENCE_NOT_FOUND.code,
            )

        payload = redis_connector_index.payload  # The payload must exist
        if not payload:
            raise ValueError("connector_indexing_task: payload invalid or not found")
            raise SimpleJobException(
                "connector_indexing_task: payload invalid or not found",
                code=IndexingWatchdogTerminalStatus.FENCE_NOT_FOUND.code,
            )

        if payload.index_attempt_id is None or payload.celery_task_id is None:
            logger.info(
@@ -556,10 +683,11 @@ def connector_indexing_task(
            continue

        if payload.index_attempt_id != index_attempt_id:
            raise ValueError(
            raise SimpleJobException(
                f"connector_indexing_task - id mismatch. Task may be left over from previous run.: "
                f"task_index_attempt={index_attempt_id} "
                f"payload_index_attempt={payload.index_attempt_id}"
                f"payload_index_attempt={payload.index_attempt_id}",
                code=IndexingWatchdogTerminalStatus.FENCE_MISMATCH.code,
            )

        logger.info(
@@ -583,7 +711,14 @@ def connector_indexing_task(
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}"
        )
        return None

        raise SimpleJobException(
            f"Indexing task already running, exiting...: "
            f"index_attempt={index_attempt_id} "
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}",
            code=IndexingWatchdogTerminalStatus.TASK_ALREADY_RUNNING.code,
        )

    payload.started = datetime.now(timezone.utc)
    redis_connector_index.set_fence(payload)
@@ -592,10 +727,10 @@ def connector_indexing_task(
        with get_session_with_tenant(tenant_id) as db_session:
            attempt = get_index_attempt(db_session, index_attempt_id)
            if not attempt:
                raise ValueError(
                    f"Index attempt not found: index_attempt={index_attempt_id}"
                raise SimpleJobException(
                    f"Index attempt not found: index_attempt={index_attempt_id}",
                    code=IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH.code,
                )
            attempt_found = True

            cc_pair = get_connector_credential_pair_from_id(
                db_session=db_session,
@@ -603,16 +738,21 @@ def connector_indexing_task(
            )

            if not cc_pair:
                raise ValueError(f"cc_pair not found: cc_pair={cc_pair_id}")
                raise SimpleJobException(
                    f"cc_pair not found: cc_pair={cc_pair_id}",
                    code=IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH.code,
                )

            if not cc_pair.connector:
                raise ValueError(
                    f"Connector not found: cc_pair={cc_pair_id} connector={cc_pair.connector_id}"
                raise SimpleJobException(
                    f"Connector not found: cc_pair={cc_pair_id} connector={cc_pair.connector_id}",
                    code=IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH.code,
                )

            if not cc_pair.credential:
                raise ValueError(
                    f"Credential not found: cc_pair={cc_pair_id} credential={cc_pair.credential_id}"
                raise SimpleJobException(
                    f"Credential not found: cc_pair={cc_pair_id} credential={cc_pair.credential_id}",
                    code=IndexingWatchdogTerminalStatus.INDEX_ATTEMPT_MISMATCH.code,
                )

        # define a callback class
@@ -650,20 +790,6 @@ def connector_indexing_task(
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}"
        )
        if attempt_found:
            try:
                with get_session_with_tenant(tenant_id) as db_session:
                    mark_attempt_failed(
                        index_attempt_id, db_session, failure_reason=str(e)
                    )
            except Exception:
                logger.exception(
                    "Indexing watchdog - transient exception looking up index attempt: "
                    f"attempt={index_attempt_id} "
                    f"tenant={tenant_id} "
                    f"cc_pair={cc_pair_id} "
                    f"search_settings={search_settings_id}"
                )

        raise e
    finally:
@@ -678,41 +804,49 @@ def connector_indexing_task(
    return n_final_progress


def connector_indexing_task_wrapper(
    index_attempt_id: int,
    cc_pair_id: int,
    search_settings_id: int,
    tenant_id: str | None,
    is_ee: bool,
) -> int | None:
    """Just wraps connector_indexing_task so we can log any exceptions before
    re-raising it."""
    result: int | None = None
def process_job_result(
    job: SimpleJob,
    connector_source: str | None,
    redis_connector_index: RedisConnectorIndex,
    log_builder: ConnectorIndexingLogBuilder,
) -> SimpleJobResult:
    result = SimpleJobResult()
    result.connector_source = connector_source

    try:
        result = connector_indexing_task(
            index_attempt_id,
            cc_pair_id,
            search_settings_id,
            tenant_id,
            is_ee,
        )
    except Exception:
        logger.exception(
            f"connector_indexing_task exceptioned: "
            f"tenant={tenant_id} "
            f"index_attempt={index_attempt_id} "
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}"
        )
    if job.process:
        result.exit_code = job.process.exitcode

        # There is a cloud related bug outside of our code
        # where spawned tasks return with an exit code of 1.
        # Unfortunately, exceptions also return with an exit code of 1,
        # so just raising an exception isn't informative
        # Exiting with 255 makes it possible to distinguish between normal exits
        # and exceptions.
        sys.exit(255)
    if job.status != "error":
        result.status = IndexingWatchdogTerminalStatus.SUCCEEDED
        return result

    ignore_exitcode = False

    # In EKS, there is an edge case where successful tasks return exit
    # code 1 in the cloud due to the set_spawn_method not sticking.
    # We've since worked around this, but the following is a safe way to
    # work around this issue. Basically, we ignore the job error state
    # if the completion signal is OK.
    status_int = redis_connector_index.get_completion()
    if status_int:
        status_enum = HTTPStatus(status_int)
        if status_enum == HTTPStatus.OK:
            ignore_exitcode = True

    if ignore_exitcode:
        result.status = IndexingWatchdogTerminalStatus.SUCCEEDED
        task_logger.warning(
            log_builder.build(
                "Indexing watchdog - spawned task has non-zero exit code "
                "but completion signal is OK. Continuing...",
                exit_code=str(result.exit_code),
            )
        )
    else:
        if result.exit_code is not None:
            result.status = IndexingWatchdogTerminalStatus.from_code(result.exit_code)

        result.exception_str = job.exception()

    return result
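
# Editor note (illustrative, not part of the patch): assuming SimpleJob reports
# status == "error" for any nonzero exit of the spawned process, process_job_result
# resolves results roughly as follows:
#   exit code 0                              -> SUCCEEDED
#   exit code 250, no completion signal      -> FENCE_NOT_FOUND (via from_code)
#   exit code 1, completion signal HTTP 200  -> SUCCEEDED (exit code ignored)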
@@ -730,12 +864,32 @@ def connector_indexing_proxy_task(
    search_settings_id: int,
    tenant_id: str | None,
) -> None:
    """celery tasks are forked, but forking is unstable. This proxies work to a spawned task."""
    """celery out of process task execution strategy is pool=prefork, but it uses fork,
    and forking is inherently unstable.

    To work around this, we use pool=threads and proxy our work to a spawned task.

    TODO(rkuo): refactor this so that there is a single return path where we canonically
    log the result of running this function.
    """
    start = time.monotonic()

    result = SimpleJobResult()

    ctx = ConnectorIndexingContext(
        tenant_id=tenant_id,
        cc_pair_id=cc_pair_id,
        search_settings_id=search_settings_id,
        index_attempt_id=index_attempt_id,
    )

    log_builder = ConnectorIndexingLogBuilder(ctx)

    task_logger.info(
        f"Indexing watchdog - starting: attempt={index_attempt_id} "
        f"cc_pair={cc_pair_id} "
        f"search_settings={search_settings_id} "
        f"mp_start_method={multiprocessing.get_start_method()}"
        log_builder.build(
            "Indexing watchdog - starting",
            mp_start_method=str(multiprocessing.get_start_method()),
        )
    )

    if not self.request.id:
@@ -744,7 +898,7 @@ def connector_indexing_proxy_task(
    client = SimpleJobClient()

    job = client.submit(
        connector_indexing_task_wrapper,
        connector_indexing_task,
        index_attempt_id,
        cc_pair_id,
        search_settings_id,
@@ -754,22 +908,33 @@ def connector_indexing_proxy_task(
    )

    if not job:
        result.status = IndexingWatchdogTerminalStatus.SPAWN_FAILED
        task_logger.info(
            f"Indexing watchdog - spawn failed: attempt={index_attempt_id} "
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}"
            log_builder.build(
                "Indexing watchdog - finished",
                status=str(result.status.value),
                exit_code=str(result.exit_code),
            )
        )
        return

    task_logger.info(
        f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} "
        f"cc_pair={cc_pair_id} "
        f"search_settings={search_settings_id}"
    )
    task_logger.info(log_builder.build("Indexing watchdog - spawn succeeded"))

    redis_connector = RedisConnector(tenant_id, cc_pair_id)
    redis_connector_index = redis_connector.new_index(search_settings_id)

    try:
        with get_session_with_tenant(tenant_id) as db_session:
            index_attempt = get_index_attempt(
                db_session=db_session, index_attempt_id=index_attempt_id
            )
            if not index_attempt:
                raise RuntimeError("Index attempt not found")

            result.connector_source = (
                index_attempt.connector_credential_pair.connector.source.value
            )

        while True:
            sleep(5)
@@ -781,81 +946,27 @@ def connector_indexing_proxy_task(

            # if the job is done, clean up and break
            if job.done():
                exit_code: int | None
                try:
                    if job.status == "error":
                        ignore_exitcode = False

                        exit_code = None
                        if job.process:
                            exit_code = job.process.exitcode

                        # seeing odd behavior where spawned tasks usually return exit code 1 in the cloud,
                        # even though logging clearly indicates successful completion
                        # to work around this, we ignore the job error state if the completion signal is OK
                        status_int = redis_connector_index.get_completion()
                        if status_int:
                            status_enum = HTTPStatus(status_int)
                            if status_enum == HTTPStatus.OK:
                                ignore_exitcode = True

                        if not ignore_exitcode:
                            raise RuntimeError("Spawned task exceptioned.")

                        task_logger.warning(
                            "Indexing watchdog - spawned task has non-zero exit code "
                            "but completion signal is OK. Continuing...: "
                            f"attempt={index_attempt_id} "
                            f"tenant={tenant_id} "
                            f"cc_pair={cc_pair_id} "
                            f"search_settings={search_settings_id} "
                            f"exit_code={exit_code}"
                    result = process_job_result(
                        job, result.connector_source, redis_connector_index, log_builder
                    )
                except Exception:
                    task_logger.error(
                        "Indexing watchdog - spawned task exceptioned: "
                        f"attempt={index_attempt_id} "
                        f"tenant={tenant_id} "
                        f"cc_pair={cc_pair_id} "
                        f"search_settings={search_settings_id} "
                        f"exit_code={exit_code} "
                        f"error={job.exception()}"
                    task_logger.exception(
                        log_builder.build(
                            "Indexing watchdog - spawned task exceptioned"
                        )
                    )

                    raise
                finally:
                    job.release()

                break

            # if a termination signal is detected, clean up and break
            if self.request.id and redis_connector_index.terminating(self.request.id):
                task_logger.warning(
                    "Indexing watchdog - termination signal detected: "
                    f"attempt={index_attempt_id} "
                    f"cc_pair={cc_pair_id} "
                    f"search_settings={search_settings_id}"
                    log_builder.build("Indexing watchdog - termination signal detected")
                )

                try:
                    with get_session_with_tenant(tenant_id) as db_session:
                        mark_attempt_canceled(
                            index_attempt_id,
                            db_session,
                            "Connector termination signal detected",
                        )
                except Exception:
                    # if the DB exceptions, we'll just get an unfriendly failure message
                    # in the UI instead of the cancellation message
                    logger.exception(
                        "Indexing watchdog - transient exception marking index attempt as canceled: "
                        f"attempt={index_attempt_id} "
                        f"tenant={tenant_id} "
                        f"cc_pair={cc_pair_id} "
                        f"search_settings={search_settings_id}"
                    )

                job.cancel()
                result.status = IndexingWatchdogTerminalStatus.TERMINATED_BY_SIGNAL
                break

            # if the spawned task is still running, restart the check once again
@@ -874,19 +985,87 @@ def connector_indexing_proxy_task(
            except Exception:
                # if the DB exceptioned, just restart the check.
                # polling the index attempt status doesn't need to be strongly consistent
                logger.exception(
                    "Indexing watchdog - transient exception looking up index attempt: "
                    f"attempt={index_attempt_id} "
                    f"tenant={tenant_id} "
                    f"cc_pair={cc_pair_id} "
                    f"search_settings={search_settings_id}"
                task_logger.exception(
                    log_builder.build(
                        "Indexing watchdog - transient exception looking up index attempt"
                    )
                )
                continue
    except Exception:
        result.status = IndexingWatchdogTerminalStatus.WATCHDOG_EXCEPTIONED
        result.exception_str = traceback.format_exc()

    # handle exit and reporting
    elapsed = time.monotonic() - start
    if result.exception_str is not None:
        # print with exception
        try:
            with get_session_with_tenant(tenant_id) as db_session:
                failure_reason = (
                    f"Spawned task exceptioned: exit_code={result.exit_code}"
                )
                mark_attempt_failed(
                    ctx.index_attempt_id,
                    db_session,
                    failure_reason=failure_reason,
                    full_exception_trace=result.exception_str,
                )
        except Exception:
            task_logger.exception(
                log_builder.build(
                    "Indexing watchdog - transient exception marking index attempt as failed"
                )
            )

        normalized_exception_str = "None"
        if result.exception_str:
            normalized_exception_str = result.exception_str.replace(
                "\n", "\\n"
            ).replace('"', '\\"')
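
        # Editor note (illustrative): the two replace() calls above collapse a multi-line
        # traceback into a single logfmt-safe token, e.g. each newline becomes the two
        # characters "\n" and an embedded double quote becomes \" so the value can be
        # wrapped in quotes below.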

        task_logger.warning(
            log_builder.build(
                "Indexing watchdog - finished",
                source=result.connector_source,
                status=result.status.value,
                exit_code=str(result.exit_code),
                exception=f'"{normalized_exception_str}"',
                elapsed=f"{elapsed:.2f}s",
            )
        )

        redis_connector_index.set_watchdog(False)
    task_logger.info(
        f"Indexing watchdog - finished: attempt={index_attempt_id} "
        f"cc_pair={cc_pair_id} "
        f"search_settings={search_settings_id}"
        raise RuntimeError(f"Exception encountered: traceback={result.exception_str}")

    # print without exception
    if result.status == IndexingWatchdogTerminalStatus.TERMINATED_BY_SIGNAL:
        try:
            with get_session_with_tenant(tenant_id) as db_session:
                mark_attempt_canceled(
                    index_attempt_id,
                    db_session,
                    "Connector termination signal detected",
                )
        except Exception:
            # if the DB exceptions, we'll just get an unfriendly failure message
            # in the UI instead of the cancellation message
            task_logger.exception(
                log_builder.build(
                    "Indexing watchdog - transient exception marking index attempt as canceled"
                )
            )

        job.cancel()

    task_logger.info(
        log_builder.build(
            "Indexing watchdog - finished",
            source=result.connector_source,
            status=str(result.status.value),
            exit_code=str(result.exit_code),
            elapsed=f"{elapsed:.2f}s",
        )
    )

    redis_connector_index.set_watchdog(False)
    return