"""APScheduler job definitions dla worker'a (M5). Domyślny harmonogram: - tpdb — co 6h, delta od ostatniego successful run - stashdb — co 6h, delta - performer-driven — co 12h, top-N performerów z bazy (auto-discovers nowe sceny przez ALL_DIRECT_SCRAPERS — 25 tube'ów per-tube HTTP scraping) - performer-continuous — tick co N sekund, 1 performer per tick (ORDER BY last_searched_at) Konfigurację (interwały, włącz/wyłącz) można nadpisać przez env (`GOON_SCHED_*`), patrz `app/scheduler/config.py`. Uwaga: APScheduler in-process (BlockingScheduler) — wystarczy dla self-hosted single worker. Dla multi-worker trzebaby Redis/SQLAlchemy job store + distributed lock. """ from __future__ import annotations import logging from typing import Any from apscheduler.schedulers.blocking import BlockingScheduler from apscheduler.triggers.interval import IntervalTrigger from app.connectors import get_movie_connectors from app.connectors.stashdb import StashDBConnector from app.connectors.tpdb import TPDBConnector from app.ingest import ingest_from_connector, ingest_movies_from_connector from app.scheduler.browse_latest import run_browse_latest from app.scheduler.performer_driven import run_continuous_one_at_a_time, run_performer_driven log = logging.getLogger(__name__) def _job_tpdb() -> None: log.info("[scheduler] tpdb delta starting") try: ingest_from_connector(TPDBConnector(), use_delta=True) except Exception: log.exception("[scheduler] tpdb job failed") def _job_stashdb() -> None: log.info("[scheduler] stashdb delta starting") try: ingest_from_connector(StashDBConnector(), use_delta=True) except Exception: log.exception("[scheduler] stashdb job failed") def _job_performer_driven(top_n: int) -> None: log.info("[scheduler] performer-driven top-%d starting", top_n) try: run_performer_driven( top_n=top_n, per_performer_limit=50, ) except Exception: log.exception("[scheduler] performer-driven job failed") def _job_browse_latest(max_pages: int) -> None: """Browse-latest — scrap newest scenes z rich-metadata tubes (shyfap + ...). Komplementarny do performer-driven: forward-fill (new scenes) vs backward (known performers). """ log.info("[scheduler] browse-latest starting (max_pages=%d)", max_pages) try: run_browse_latest(max_pages=max_pages) except Exception: log.exception("[scheduler] browse-latest job failed") def _job_movie_ingest() -> None: """Movies ingest — paradisehill (primary) + dooplay mirrory. Paradisehill jako primary daje canonical movie record (title + year + studio). Mirrory dooplay (mangoporn/streamporn/pandamovies) doklejają playback sources z native-friendly origins (mangoporn:luluvid, :voe, etc.) — `extract_stream_from_hoster` rozwiązuje je do bezpośredniego stream URL → mobile gra natywnie zamiast WebView. Matching mirror→primary movie idzie przez `resolve_movie` (title+year+studio similarity). Każdy connector osobny IngestRun + delta od ostatniego success. Kolejność: paradisehill FIRST (żeby mirrory miały do czego się przykleić), potem mirrory. Pojedynczy failed connector NIE zatrzymuje pozostałych — każdy w osobnym try/except. """ for name, cls in get_movie_connectors(): log.info("[scheduler] movie ingest %s starting", name) try: ingest_movies_from_connector(cls(), use_delta=True) except Exception: log.exception("[scheduler] movie ingest %s failed", name) def _job_performer_continuous(refresh_after_days: int) -> None: """Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST. Per tick: full search across ~25 tubeów (per_performer_limit=None). Tick zajmuje ~50-80s. Interval ustawiony na 15s + max_instances=1 + coalesce=True znaczy że real rate to max(15s, tick_duration) — efektywnie ~1 perf/50-80s. """ try: run_continuous_one_at_a_time( refresh_after_days=refresh_after_days, per_performer_limit=None, # full coverage all tubes ) except Exception: log.exception("[scheduler] performer-continuous failed") def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler: """Buduje scheduler na podstawie cfg dictu. cfg keys: tpdb_hours: int | None (None = wyłączony) stashdb_hours: int | None performer_driven_hours: int | None performer_driven_top_n: int performer_continuous_seconds: int | None performer_continuous_refresh_days: int """ sched = BlockingScheduler(timezone="UTC") if cfg.get("tpdb_hours"): sched.add_job( _job_tpdb, IntervalTrigger(hours=cfg["tpdb_hours"]), id="tpdb", replace_existing=True, max_instances=1, coalesce=True, ) log.info("scheduler: tpdb every %dh", cfg["tpdb_hours"]) if cfg.get("stashdb_hours"): sched.add_job( _job_stashdb, IntervalTrigger(hours=cfg["stashdb_hours"]), id="stashdb", replace_existing=True, max_instances=1, coalesce=True, ) log.info("scheduler: stashdb every %dh", cfg["stashdb_hours"]) if cfg.get("performer_driven_hours"): top_n = cfg.get("performer_driven_top_n") or 20 sched.add_job( lambda: _job_performer_driven(top_n), IntervalTrigger(hours=cfg["performer_driven_hours"]), id="performer_driven", replace_existing=True, max_instances=1, coalesce=True, ) log.info( "scheduler: performer-driven every %dh (top_n=%d)", cfg["performer_driven_hours"], top_n, ) if cfg.get("browse_latest_hours"): max_pages = cfg.get("browse_latest_max_pages") or 5 sched.add_job( lambda: _job_browse_latest(max_pages), IntervalTrigger(hours=cfg["browse_latest_hours"]), id="browse_latest", replace_existing=True, max_instances=1, coalesce=True, ) log.info( "scheduler: browse-latest every %dh (max_pages=%d)", cfg["browse_latest_hours"], max_pages, ) if cfg.get("movie_ingest_hours"): sched.add_job( _job_movie_ingest, IntervalTrigger(hours=cfg["movie_ingest_hours"]), id="movie_ingest", replace_existing=True, max_instances=1, coalesce=True, ) log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"]) if cfg.get("performer_continuous_seconds"): refresh_days = cfg.get("performer_continuous_refresh_days") or 30 seconds = cfg["performer_continuous_seconds"] sched.add_job( lambda: _job_performer_continuous(refresh_days), IntervalTrigger(seconds=seconds), id="performer_continuous", replace_existing=True, max_instances=1, coalesce=True, ) log.info( "scheduler: performer-continuous every %ds (refresh_after=%dd)", seconds, refresh_days, ) return sched DEFAULT_CONFIG: dict[str, Any] = { "tpdb_hours": 6, "stashdb_hours": 6, "performer_driven_hours": 12, "performer_driven_top_n": 20, # Browse-latest — newest scenes z rich-metadata tubes (shyfap, ...). Raz dziennie # × ~100 scen/tube/run = drobny budżet, łapie świeże sceny których performera jeszcze # nie znamy (newcomerki → canonical ingest dorobi potem). "browse_latest_hours": 24, "browse_latest_max_pages": 5, # Movies — paradisehill + dooplay mirrory. Raz dziennie wystarczy (sites rosną # wolniej niż tube'y). Najwazniejsze: mirrory dorzucają native-friendly playback # sources do paradisehill movies → mobile gra natywnie zamiast WebView. "movie_ingest_hours": 24, # Continuous worker: tick co 15s, ale max_instances=1 + coalesce sprawia że # efektywny rate = max(15s, tick_duration). Tick z full coverage (25 tubes) ~50-80s, # więc realnie ~1 perf/60s. Przy 14.7k performerów = ~10 dni full sweep + refresh # każdego co 30 dni. "performer_continuous_seconds": 15, "performer_continuous_refresh_days": 30, }