goon/app/scheduler/jobs.py
jtrzupek f014a901de feat(scheduler): periodic title+duration dedup (missing-merge tube dupes)
Missing-merge duplicates (same performer + identical normalized title + identical duration-to-the-second) that bulk_dedup misses — tube re-scrapes and cross-tube re-ingests like porn00 pulling a video already present from xnxx (reports 28fe8181/32df33b1). Extracted the proven merge_exact_title_duration logic into app/scheduler/title_duration_dedup.py (script now a thin wrapper), wired a 12h scheduler job (playback-only = what users actually see, GOON_SCHED_TITLE_DEDUP_HOURS). Signal is near-certain (two different videos don't share byte-identical title AND exact duration); no shared performer = not merged (over-match guard). Verified: job registers (jobs=14), backlog currently 0 after the one-shot global merge.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 11:20:48 +02:00

594 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""APScheduler job definitions dla worker'a (M5).
Domyślny harmonogram:
- tpdb — co 6h, delta od ostatniego successful run
- stashdb — co 6h, delta
- performer-driven — co 12h, top-N performerów z bazy (auto-discovers nowe sceny przez
ALL_DIRECT_SCRAPERS — 25 tube'ów per-tube HTTP scraping)
- performer-continuous — tick co N sekund, 1 performer per tick (ORDER BY last_searched_at)
Konfigurację (interwały, włącz/wyłącz) można nadpisać przez env (`GOON_SCHED_*`),
patrz `app/scheduler/config.py`.
Uwaga: APScheduler in-process (BlockingScheduler) — wystarczy dla self-hosted single
worker. Dla multi-worker trzebaby Redis/SQLAlchemy job store + distributed lock.
"""
from __future__ import annotations
import logging
from datetime import datetime, timezone
from typing import Any
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
from app.connectors import get_movie_connectors
from app.connectors.stashdb import StashDBConnector
from app.connectors.tpdb import TPDBConnector
from app.ingest import ingest_from_connector, ingest_movies_from_connector
from app.scheduler.browse_latest import run_browse_latest
from app.scheduler.performer_driven import run_continuous_one_at_a_time, run_performer_driven
log = logging.getLogger(__name__)
# Stała "epoka" dla IntervalTrigger.start_date — kotwica siatki fire-times.
# Bez start_date APScheduler liczy next_run_time = add_job_time + interval, więc każdy
# restart workera (a tych jest dużo — manual deploys, OOM, obraz przebudowany) odsuwa
# kolejny fire o pełen interval. Bug-reporty 2026-05-19 (`93d3c485` "brak freshporno")
# i 2026-05-23 (`2fbf1c73` "Czemu nie ma nowych filmów?") to dokładnie ten case:
# worker restartowany 15× w ciągu 3 dni → movie_ingest (24h) nigdy nie odpalił po
# 2026-05-20 05:29.
#
# Ze stałym start_date w przeszłości next_run_time leży na siatce co N godzin od tej
# kotwicy → restart workera nie zmienia kiedy następny fire. 05:00 UTC = 07:00 PL,
# niski ruch, bez kolizji z ręcznymi deployami w godzinach pracy.
INTERVAL_ANCHOR = datetime(2026, 1, 1, 5, 0, tzinfo=timezone.utc)
# Hard-timeout dla jobów robiących zewnętrzne HTTP (tpdb/stashdb/performer-driven).
# Bez tego zawis connectora bez własnego timeoutu blokował job na wiele godzin, a
# `max_instances=1` blokował KOLEJNE fire'y do restartu workera (incident 2026-06-02:
# 6 runów wisiało 8.7h od wspólnego anchora 05:00). Po timeoucie job WRACA (slot się
# zwalnia → następny fire leci); osierocony wątek dożywa do restartu (jak movie-ingest),
# a jego 'running' DB-row sprząta periodic reaper (_job_reap_stuck).
_JOB_TIMEOUT_SEC = 1800 # 30 min — healthy tpdb/stashdb delta to minuty (po SQL-phash), performer-driven top-N ~10-20 min.
def _run_with_timeout(fn, *, label: str, timeout_sec: int = _JOB_TIMEOUT_SEC) -> None:
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout
ex = ThreadPoolExecutor(max_workers=1)
try:
fut = ex.submit(fn)
try:
fut.result(timeout=timeout_sec)
except FutureTimeout:
log.error(
"[scheduler] %s HUNG > %ds — zwalniam slot, orphan thread dożyje do restartu",
label, timeout_sec,
)
except Exception:
log.exception("[scheduler] %s job failed", label)
finally:
ex.shutdown(wait=False)
def _job_tpdb() -> None:
log.info("[scheduler] tpdb delta starting")
_run_with_timeout(lambda: ingest_from_connector(TPDBConnector(), use_delta=True), label="tpdb")
def _job_stashdb() -> None:
log.info("[scheduler] stashdb delta starting")
_run_with_timeout(lambda: ingest_from_connector(StashDBConnector(), use_delta=True), label="stashdb")
def _job_performer_driven(top_n: int) -> None:
log.info("[scheduler] performer-driven top-%d starting", top_n)
_run_with_timeout(
lambda: run_performer_driven(top_n=top_n, per_performer_limit=50),
label="performer-driven",
)
def _job_reap_stuck() -> None:
"""Periodic reaper — czyści ingest_runs wiszące w 'running' >2h (zombie po zawisach
connectorów / kill mid-run). Startup-only reaper nie łapał ich gdy worker długo żył
(incident 2026-06-02: zombie wisiały 8.7h). Delayed import — unika cyklu z worker.py."""
try:
from app.scheduler.worker import reap_stuck_ingest_runs
reaped = reap_stuck_ingest_runs()
if reaped:
log.warning("[scheduler] periodic reaper: %d stuck ingest_runs", reaped)
except Exception:
log.exception("[scheduler] periodic reaper failed")
def _job_deep_crawl(pages_per_run: int) -> None:
"""Deep-crawl pełnych katalogów browse-tube'ów (Faza 2a — ingest-all). Round-robin
po sitetagu, wznawialny kursor (app/scheduler/deep_crawl.py). Hard-timeout 1h."""
log.info("[scheduler] deep-crawl starting (pages_per_run=%d)", pages_per_run)
from app.scheduler.deep_crawl import run_deep_crawl
_run_with_timeout(
lambda: run_deep_crawl(pages_per_run=pages_per_run),
label="deep-crawl",
timeout_sec=3600,
)
def _job_browse_latest(max_pages: int) -> None:
"""Browse-latest — scrap newest scenes z rich-metadata tubes (shyfap + ...).
Komplementarny do performer-driven: forward-fill (new scenes) vs backward (known performers).
"""
log.info("[scheduler] browse-latest starting (max_pages=%d)", max_pages)
try:
run_browse_latest(max_pages=max_pages)
except Exception:
log.exception("[scheduler] browse-latest job failed")
def _job_movie_ingest() -> None:
"""Movies ingest — paradisehill (primary) + dooplay mirrory.
Paradisehill jako primary daje canonical movie record (title + year + studio).
Mirrory dooplay (mangoporn/streamporn/pandamovies) doklejają playback sources
z native-friendly origins (mangoporn:luluvid, :voe, etc.) — `extract_stream_from_hoster`
rozwiązuje je do bezpośredniego stream URL → mobile gra natywnie zamiast WebView.
Matching mirror→primary movie idzie przez `resolve_movie` (title+year+studio
similarity). Każdy connector osobny IngestRun + delta od ostatniego success.
Kolejność: paradisehill FIRST (żeby mirrory miały do czego się przykleić),
potem mirrory. Pojedynczy failed connector NIE zatrzymuje pozostałych —
każdy w osobnym try/except.
HARD TIMEOUT per-connector (bug-report 2026-05-30 "ingest znów się zawiesił"):
sam try/except chroni przed *wyjątkiem*, ale NIE przed *hangiem* (CPU-bound
ReDoS na patologicznej stronie / thread-stall) — wtedy jeden mirror blokuje
resztę i mangoporn (jedyny z realnym new-content) nigdy nie startuje.
Każdy connector leci w osobnym wątku z `future.result(timeout)`; po
przekroczeniu logujemy i idziemy dalej (osierocony wątek dożywa do restartu
workera — OK, bo loop się odblokowuje). Healthy run ~50s, cap 6 min = zapas.
"""
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout
PER_CONNECTOR_TIMEOUT = 360 # sekundy
for name, cls in get_movie_connectors():
log.info("[scheduler] movie ingest %s starting", name)
try:
ex = ThreadPoolExecutor(max_workers=1)
fut = ex.submit(ingest_movies_from_connector, cls(), use_delta=True)
try:
fut.result(timeout=PER_CONNECTOR_TIMEOUT)
except FutureTimeout:
log.error(
"[scheduler] movie ingest %s HUNG > %ds — skip, kolejka leci dalej",
name, PER_CONNECTOR_TIMEOUT,
)
finally:
# shutdown(wait=False): nie blokuj na join osieroconego wątku.
ex.shutdown(wait=False)
except Exception:
log.exception("[scheduler] movie ingest %s failed", name)
def _job_refresh_taxonomy_counts() -> None:
"""Przelicza denormalizowane scene_count na tags/performers/studios.
Hot-path /tags|/performers|/studios|/favorites czyta gotową kolumnę zamiast
agregować 6.3M scene_tags per-request (~4.3s → <20ms). Patrz migracja 0019 +
app/scheduler/taxonomy_counts.py.
"""
log.info("[scheduler] taxonomy counts refresh starting")
try:
from app.scheduler.taxonomy_counts import refresh_taxonomy_counts
changed = refresh_taxonomy_counts()
log.info("[scheduler] taxonomy counts refresh done: %s", changed)
except Exception:
log.exception("[scheduler] taxonomy counts refresh failed")
def _job_refresh_sxyprn_thumbs(batch: int = 1200) -> None:
"""Odświeża wygasłe sxyprn miniaturki z żywych stron /post/<id> (bug 2026-06-10).
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i rotują (token wygasa po
~tygodniach → 404), ale strona post żyje i ma świeży poster (`og:image`). Search/
listingi NIE re-surfaceują starych postów, więc jedyna droga to per-post page fetch.
Bierzemy `batch` najdawniej-aktualizowanych źródeł — cykl po całym katalogu co kilka
dni (mieści się w oknie wygaśnięcia). Patrz scripts/refresh_sxyprn_thumbs.py.
"""
log.info("[scheduler] sxyprn thumb refresh starting (batch=%d)", batch)
def _run() -> None:
from sqlalchemy import text
from app.db import session_scope
from scripts.refresh_sxyprn_thumbs import refresh_batch
with session_scope() as session:
rows = session.execute(
text(
"SELECT id, page_url FROM playback_sources "
"WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
"ORDER BY updated_at ASC LIMIT :n"
).bindparams(n=batch)
).all()
refreshed, dead, untouched = refresh_batch(list(rows))
log.info(
"[scheduler] sxyprn thumb refresh done: refreshed=%d dead=%d untouched=%d (of %d)",
refreshed, dead, untouched, len(rows),
)
_run_with_timeout(_run, label="sxyprn-thumb-refresh")
def _job_bulk_dedup_performers() -> None:
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
resolver-time scoring nie złapał.
Use case (bug-report 2026-05-20, "brak Brazzers Exxtra po 15-05"):
freshporno scrape przed fixem release_date tworzył duplicate scenes zamiast
PS-merge do canonical TPDB scen. Resolver scoring miał score >0.92 (auto)
z release_date, ale BEZ release_date wagi się przesuwały i wpadało w review/new.
Bulk_dedup performers strategy iteruje per performer, robi pair-wise scoring
dla wszystkich scen tego performera — łapie duplicate-y które ingest-time
resolver pominął (np. gdy 2 sceny tej samej title+performer ale różny release_date).
Auto-merge gdy score≥0.92, pending merge_candidate gdy 0.75-0.92.
"""
log.info("[scheduler] bulk_dedup performers starting")
try:
from app.scheduler.bulk_dedup import run_bulk_dedup
# cross_source_only=True: bez tego flag pairwise generuje N²/2 par na płodnego
# performera, materializowane w listę → worker OOM-killed co 12h (6GB RSS na
# 7.6GB boxie, 2026-06-06), ubijając przy okazji równoległe tpdb/stashdb/ingesty.
# Flag zawęża do cross-source kandydatów (TPDB↔StashDB) z pre-filtrem candidate.
# Timeout-wrap jak tpdb/stashdb — job nie ma własnego hard-timeoutu.
_run_with_timeout(
lambda: run_bulk_dedup(strategy="performers", dry_run=False, cross_source_only=True),
label="bulk-dedup-performers",
)
log.info("[scheduler] bulk_dedup performers done")
except Exception:
log.exception("[scheduler] bulk_dedup performers failed")
def _job_thumb_asset_dedup() -> None:
"""Scala same-video-różne-tytuły dupy hdporn.gg/fullmovies.xxx po asset-id miniatury
(reports 205b17d9/5a2944cb). bulk_dedup tego nie łapie (różne tytuły, brak phash);
re-ingesty pod nowymi tytułami → dupy odrastają, stąd cyklicznie."""
log.info("[scheduler] thumb-asset dedup starting")
try:
from app.scheduler.thumb_dedup import run_thumb_asset_dedup
# run_thumb_asset_dedup loguje liczby/errory wewnętrznie; _run_with_timeout
# połyka return, więc nie kapturujemy.
_run_with_timeout(
lambda: run_thumb_asset_dedup(commit=True),
label="thumb-asset-dedup",
)
log.info("[scheduler] thumb-asset dedup done")
except Exception:
log.exception("[scheduler] thumb-asset dedup failed")
def _job_title_duration_dedup() -> None:
"""Scal missing-merge dupy (ten sam performer + identyczny tytuł + długość) których
bulk_dedup nie łapie (tube re-scrape / cross-tube np. porn00 vs xnxx, reports
28fe8181/32df33b1). playback-only = to co user widzi. Re-ingesty → dupy odrastają."""
log.info("[scheduler] title-duration dedup starting")
try:
from app.scheduler.title_duration_dedup import run_title_duration_dedup
_run_with_timeout(
lambda: run_title_duration_dedup(playback_only=True, commit=True),
label="title-duration-dedup",
)
log.info("[scheduler] title-duration dedup done")
except Exception:
log.exception("[scheduler] title-duration dedup failed")
def _job_ingest_watchdog(max_age_hours: int, search_max_age_hours: int) -> None:
"""Per-origin freshness watchdog — alert do Sentry gdy aktywny tube przestał dawać
nowe sceny > próg (browse: max_age_hours, search: search_max_age_hours). Globalny
monitor (jeden Source 'tube-scraper') tego nie łapie; pojedynczy origin może zamarznąć
przy success-runie (report 14f3a655)."""
try:
from app.scheduler.ingest_watchdog import run_ingest_freshness_watchdog
res = run_ingest_freshness_watchdog(
max_age_hours=max_age_hours,
search_max_age_hours=search_max_age_hours,
)
if res["stale"]:
log.warning("[scheduler] ingest-watchdog: %d stale origin(s)", len(res["stale"]))
except Exception:
log.exception("[scheduler] ingest-watchdog failed")
def _job_hetzner_monitor() -> None:
"""Hetzner Cloud bandwidth monitor — alert do Sentry przy progach % included_traffic.
No-op gdy brak HETZNER_API_TOKEN/SERVER_ID w env (loguje że wyłączony)."""
try:
from app.scheduler.hetzner_monitor import run_hetzner_bandwidth_check
run_hetzner_bandwidth_check()
except Exception:
log.exception("[scheduler] hetzner-monitor failed")
def _job_performer_continuous(refresh_after_days: int) -> None:
"""Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST.
Per tick: full search across ~25 tubeów (per_performer_limit=None). Tick zajmuje
~50-80s. Interval ustawiony na 15s + max_instances=1 + coalesce=True znaczy że
real rate to max(15s, tick_duration) — efektywnie ~1 perf/50-80s.
"""
try:
run_continuous_one_at_a_time(
refresh_after_days=refresh_after_days,
per_performer_limit=None, # full coverage all tubes
)
except Exception:
log.exception("[scheduler] performer-continuous failed")
def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
"""Buduje scheduler na podstawie cfg dictu.
cfg keys:
tpdb_hours: int | None (None = wyłączony)
stashdb_hours: int | None
performer_driven_hours: int | None
performer_driven_top_n: int
performer_continuous_seconds: int | None
performer_continuous_refresh_days: int
"""
sched = BlockingScheduler(timezone="UTC")
if cfg.get("tpdb_hours"):
sched.add_job(
_job_tpdb,
IntervalTrigger(hours=cfg["tpdb_hours"], start_date=INTERVAL_ANCHOR),
id="tpdb",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: tpdb every %dh", cfg["tpdb_hours"])
if cfg.get("stashdb_hours"):
sched.add_job(
_job_stashdb,
IntervalTrigger(hours=cfg["stashdb_hours"], start_date=INTERVAL_ANCHOR),
id="stashdb",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: stashdb every %dh", cfg["stashdb_hours"])
if cfg.get("performer_driven_hours"):
top_n = cfg.get("performer_driven_top_n") or 20
sched.add_job(
lambda: _job_performer_driven(top_n),
IntervalTrigger(hours=cfg["performer_driven_hours"], start_date=INTERVAL_ANCHOR),
id="performer_driven",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info(
"scheduler: performer-driven every %dh (top_n=%d)",
cfg["performer_driven_hours"],
top_n,
)
if cfg.get("browse_latest_hours"):
max_pages = cfg.get("browse_latest_max_pages") or 5
sched.add_job(
lambda: _job_browse_latest(max_pages),
IntervalTrigger(hours=cfg["browse_latest_hours"], start_date=INTERVAL_ANCHOR),
id="browse_latest",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info(
"scheduler: browse-latest every %dh (max_pages=%d)",
cfg["browse_latest_hours"], max_pages,
)
if cfg.get("bulk_dedup_hours"):
sched.add_job(
_job_bulk_dedup_performers,
IntervalTrigger(hours=cfg["bulk_dedup_hours"], start_date=INTERVAL_ANCHOR),
id="bulk_dedup_performers",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: bulk-dedup performers every %dh", cfg["bulk_dedup_hours"])
if cfg.get("thumb_dedup_hours"):
sched.add_job(
_job_thumb_asset_dedup,
IntervalTrigger(hours=cfg["thumb_dedup_hours"], start_date=INTERVAL_ANCHOR),
id="thumb_asset_dedup",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: thumb-asset dedup every %dh", cfg["thumb_dedup_hours"])
if cfg.get("title_dedup_hours"):
sched.add_job(
_job_title_duration_dedup,
IntervalTrigger(hours=cfg["title_dedup_hours"], start_date=INTERVAL_ANCHOR),
id="title_duration_dedup",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: title-duration dedup every %dh", cfg["title_dedup_hours"])
if cfg.get("ingest_watchdog_hours"):
wd_max_age = cfg.get("ingest_watchdog_max_age_hours") or 48
wd_search_max_age = cfg.get("ingest_watchdog_search_max_age_hours") or 168
sched.add_job(
lambda: _job_ingest_watchdog(wd_max_age, wd_search_max_age),
IntervalTrigger(hours=cfg["ingest_watchdog_hours"], start_date=INTERVAL_ANCHOR),
id="ingest_watchdog",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info(
"scheduler: ingest-watchdog every %dh (browse max_age=%dh, search max_age=%dh)",
cfg["ingest_watchdog_hours"], wd_max_age, wd_search_max_age,
)
if cfg.get("hetzner_monitor_hours"):
sched.add_job(
_job_hetzner_monitor,
IntervalTrigger(hours=cfg["hetzner_monitor_hours"], start_date=INTERVAL_ANCHOR),
id="hetzner_monitor",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: hetzner-monitor every %dh", cfg["hetzner_monitor_hours"])
if cfg.get("movie_ingest_hours"):
sched.add_job(
_job_movie_ingest,
IntervalTrigger(hours=cfg["movie_ingest_hours"], start_date=INTERVAL_ANCHOR),
id="movie_ingest",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"])
if cfg.get("taxonomy_counts_hours"):
sched.add_job(
_job_refresh_taxonomy_counts,
IntervalTrigger(hours=cfg["taxonomy_counts_hours"], start_date=INTERVAL_ANCHOR),
id="taxonomy_counts",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: taxonomy-counts refresh every %dh", cfg["taxonomy_counts_hours"])
if cfg.get("performer_continuous_seconds"):
refresh_days = cfg.get("performer_continuous_refresh_days") or 30
seconds = cfg["performer_continuous_seconds"]
sched.add_job(
lambda: _job_performer_continuous(refresh_days),
IntervalTrigger(seconds=seconds),
id="performer_continuous",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info(
"scheduler: performer-continuous every %ds (refresh_after=%dd)",
seconds, refresh_days,
)
if cfg.get("deep_crawl_hours"):
pages = cfg.get("deep_crawl_pages_per_run") or 60
sched.add_job(
lambda: _job_deep_crawl(pages),
IntervalTrigger(hours=cfg["deep_crawl_hours"], start_date=INTERVAL_ANCHOR),
id="deep_crawl",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: deep-crawl every %dh (%d pages/run)", cfg["deep_crawl_hours"], pages)
# Periodic reaper — czyści zombie 'running' runy co godzinę. Domyślnie ZAWSZE on
# (cfg.get(...,1)), bo startup-only reaper nie łapie zawisów gdy worker długo żyje.
reap_hours = cfg.get("reap_stuck_hours", 1)
if reap_hours:
sched.add_job(
_job_reap_stuck,
IntervalTrigger(hours=reap_hours, start_date=INTERVAL_ANCHOR),
id="reap_stuck",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: reap-stuck every %dh", reap_hours)
# sxyprn thumbnail refresh — WYŁĄCZONE (default 0). Token trafficdeposit jest
# bucketowany godzinowo i ważny ~1h (weryfikacja 2026-06-10: stored ts=11:00 martwy
# o 12:27, aktualny ts=13:00 żyje). Przechowywanie URL-i jest bezcelowe — wygasają
# w godzinę, więc periodyczny refresh tylko wali w sxyprn na darmo. Działające
# thumbnaile sxyprn wymagają ON-DEMAND resolve przy serwowaniu (proxy fetch post
# page → bieżący og:image, cache ~45min). Job zostaje w kodzie ale domyślnie off.
sxyprn_hours = cfg.get("sxyprn_thumb_refresh_hours", 0)
if sxyprn_hours:
batch = cfg.get("sxyprn_thumb_refresh_batch", 1200)
sched.add_job(
lambda: _job_refresh_sxyprn_thumbs(batch),
IntervalTrigger(hours=sxyprn_hours, start_date=INTERVAL_ANCHOR),
id="sxyprn_thumb_refresh",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: sxyprn-thumb-refresh every %dh (batch=%d)", sxyprn_hours, batch)
return sched
DEFAULT_CONFIG: dict[str, Any] = {
"tpdb_hours": 6,
"stashdb_hours": 6,
"sxyprn_thumb_refresh_hours": 0, # off — token ~1h TTL, refresh bezcelowy (patrz register_jobs)
"sxyprn_thumb_refresh_batch": 1200,
"performer_driven_hours": 12,
"performer_driven_top_n": 20,
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100
# scen/tube/run łapie świeże sceny których performera jeszcze nie znamy (newcomerki
# → canonical ingest dorobi potem). NB: ten DEFAULT_CONFIG jest poglądowy — realnie
# run_forever() bierze interwały z Settings (config.py: sched_browse_latest_hours=6).
"browse_latest_hours": 6,
"browse_latest_max_pages": 5,
# Movies — paradisehill + dooplay mirrory. Raz dziennie wystarczy (sites rosną
# wolniej niż tube'y). Najwazniejsze: mirrory dorzucają native-friendly playback
# sources do paradisehill movies → mobile gra natywnie zamiast WebView.
"movie_ingest_hours": 24,
# Continuous worker: tick co 15s, ale max_instances=1 + coalesce sprawia że
# efektywny rate = max(15s, tick_duration). Tick z full coverage (25 tubes) ~50-80s,
# więc realnie ~1 perf/60s. Przy 14.7k performerów = ~10 dni full sweep + refresh
# każdego co 30 dni.
"performer_continuous_seconds": 15,
"performer_continuous_refresh_days": 30,
# Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers|
# /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia.
"taxonomy_counts_hours": 3,
# Periodic reaper zombie 'running' runów — co 1h (próg 'running'>2h w funkcji).
"reap_stuck_hours": 1,
# Deep-crawl pełnych katalogów browse-tube'ów (Faza 2a). Co 1h, 60 stron/run,
# round-robin po tube'ach. Backfill katalogów (porndoe ~62k itd.) przez dni.
"deep_crawl_hours": 1,
"deep_crawl_pages_per_run": 60,
# Ingest freshness watchdog — per-origin alert do Sentry, co 6h. Browse próg 48h,
# search (performer-driven) próg 7d (nierówna kadencja).
"ingest_watchdog_hours": 6,
"ingest_watchdog_max_age_hours": 48,
"ingest_watchdog_search_max_age_hours": 168,
# Hetzner Cloud bandwidth monitor — co 6h, alert Sentry przy progach % included.
"hetzner_monitor_hours": 6,
# Title+duration dedup — missing-merge tube-dupy, co 12h, playback-only.
"title_dedup_hours": 12,
}