fix(scheduler): hard-timeout heavy jobs + periodic stuck-run reaper
At the shared 05:00 anchor all heavy jobs fire together; tpdb/stashdb/performer-driven had no timeout, so a hung connector blocked the whole job and — with max_instances=1 — blocked every future fire of that job until a worker restart (incident 2026-06-02: 6 runs hung 8.7h, movie mirrors 47h stale, tube ingest stalled). - _run_with_timeout wraps tpdb/stashdb/performer-driven in a 30-min hard cap (same ThreadPoolExecutor pattern movie-ingest already uses): on timeout the job returns and frees the scheduler slot; the orphaned thread lives until restart. - _job_reap_stuck: hourly reaper of 'running' >2h rows, registered in the scheduler — the startup-only reaper missed hangs while the worker stayed up for hours. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
24fc790691
commit
08f901712c
1 changed files with 63 additions and 13 deletions
|
|
@ -46,31 +46,65 @@ log = logging.getLogger(__name__)
|
||||||
INTERVAL_ANCHOR = datetime(2026, 1, 1, 5, 0, tzinfo=timezone.utc)
|
INTERVAL_ANCHOR = datetime(2026, 1, 1, 5, 0, tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
# Hard-timeout dla jobów robiących zewnętrzne HTTP (tpdb/stashdb/performer-driven).
|
||||||
|
# Bez tego zawis connectora bez własnego timeoutu blokował job na wiele godzin, a
|
||||||
|
# `max_instances=1` blokował KOLEJNE fire'y do restartu workera (incident 2026-06-02:
|
||||||
|
# 6 runów wisiało 8.7h od wspólnego anchora 05:00). Po timeoucie job WRACA (slot się
|
||||||
|
# zwalnia → następny fire leci); osierocony wątek dożywa do restartu (jak movie-ingest),
|
||||||
|
# a jego 'running' DB-row sprząta periodic reaper (_job_reap_stuck).
|
||||||
|
_JOB_TIMEOUT_SEC = 1800 # 30 min — healthy tpdb/stashdb delta to minuty (po SQL-phash), performer-driven top-N ~10-20 min.
|
||||||
|
|
||||||
|
|
||||||
|
def _run_with_timeout(fn, *, label: str, timeout_sec: int = _JOB_TIMEOUT_SEC) -> None:
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from concurrent.futures import TimeoutError as FutureTimeout
|
||||||
|
|
||||||
|
ex = ThreadPoolExecutor(max_workers=1)
|
||||||
|
try:
|
||||||
|
fut = ex.submit(fn)
|
||||||
|
try:
|
||||||
|
fut.result(timeout=timeout_sec)
|
||||||
|
except FutureTimeout:
|
||||||
|
log.error(
|
||||||
|
"[scheduler] %s HUNG > %ds — zwalniam slot, orphan thread dożyje do restartu",
|
||||||
|
label, timeout_sec,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
log.exception("[scheduler] %s job failed", label)
|
||||||
|
finally:
|
||||||
|
ex.shutdown(wait=False)
|
||||||
|
|
||||||
|
|
||||||
def _job_tpdb() -> None:
|
def _job_tpdb() -> None:
|
||||||
log.info("[scheduler] tpdb delta starting")
|
log.info("[scheduler] tpdb delta starting")
|
||||||
try:
|
_run_with_timeout(lambda: ingest_from_connector(TPDBConnector(), use_delta=True), label="tpdb")
|
||||||
ingest_from_connector(TPDBConnector(), use_delta=True)
|
|
||||||
except Exception:
|
|
||||||
log.exception("[scheduler] tpdb job failed")
|
|
||||||
|
|
||||||
|
|
||||||
def _job_stashdb() -> None:
|
def _job_stashdb() -> None:
|
||||||
log.info("[scheduler] stashdb delta starting")
|
log.info("[scheduler] stashdb delta starting")
|
||||||
try:
|
_run_with_timeout(lambda: ingest_from_connector(StashDBConnector(), use_delta=True), label="stashdb")
|
||||||
ingest_from_connector(StashDBConnector(), use_delta=True)
|
|
||||||
except Exception:
|
|
||||||
log.exception("[scheduler] stashdb job failed")
|
|
||||||
|
|
||||||
|
|
||||||
def _job_performer_driven(top_n: int) -> None:
|
def _job_performer_driven(top_n: int) -> None:
|
||||||
log.info("[scheduler] performer-driven top-%d starting", top_n)
|
log.info("[scheduler] performer-driven top-%d starting", top_n)
|
||||||
try:
|
_run_with_timeout(
|
||||||
run_performer_driven(
|
lambda: run_performer_driven(top_n=top_n, per_performer_limit=50),
|
||||||
top_n=top_n,
|
label="performer-driven",
|
||||||
per_performer_limit=50,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _job_reap_stuck() -> None:
|
||||||
|
"""Periodic reaper — czyści ingest_runs wiszące w 'running' >2h (zombie po zawisach
|
||||||
|
connectorów / kill mid-run). Startup-only reaper nie łapał ich gdy worker długo żył
|
||||||
|
(incident 2026-06-02: zombie wisiały 8.7h). Delayed import — unika cyklu z worker.py."""
|
||||||
|
try:
|
||||||
|
from app.scheduler.worker import reap_stuck_ingest_runs
|
||||||
|
|
||||||
|
reaped = reap_stuck_ingest_runs()
|
||||||
|
if reaped:
|
||||||
|
log.warning("[scheduler] periodic reaper: %d stuck ingest_runs", reaped)
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("[scheduler] performer-driven job failed")
|
log.exception("[scheduler] periodic reaper failed")
|
||||||
|
|
||||||
|
|
||||||
def _job_browse_latest(max_pages: int) -> None:
|
def _job_browse_latest(max_pages: int) -> None:
|
||||||
|
|
@ -300,6 +334,20 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
|
||||||
seconds, refresh_days,
|
seconds, refresh_days,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Periodic reaper — czyści zombie 'running' runy co godzinę. Domyślnie ZAWSZE on
|
||||||
|
# (cfg.get(...,1)), bo startup-only reaper nie łapie zawisów gdy worker długo żyje.
|
||||||
|
reap_hours = cfg.get("reap_stuck_hours", 1)
|
||||||
|
if reap_hours:
|
||||||
|
sched.add_job(
|
||||||
|
_job_reap_stuck,
|
||||||
|
IntervalTrigger(hours=reap_hours, start_date=INTERVAL_ANCHOR),
|
||||||
|
id="reap_stuck",
|
||||||
|
replace_existing=True,
|
||||||
|
max_instances=1,
|
||||||
|
coalesce=True,
|
||||||
|
)
|
||||||
|
log.info("scheduler: reap-stuck every %dh", reap_hours)
|
||||||
|
|
||||||
return sched
|
return sched
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -327,4 +375,6 @@ DEFAULT_CONFIG: dict[str, Any] = {
|
||||||
# Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers|
|
# Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers|
|
||||||
# /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia.
|
# /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia.
|
||||||
"taxonomy_counts_hours": 3,
|
"taxonomy_counts_hours": 3,
|
||||||
|
# Periodic reaper zombie 'running' runów — co 1h (próg 'running'>2h w funkcji).
|
||||||
|
"reap_stuck_hours": 1,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue