Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
227 lines
8.3 KiB
Python
227 lines
8.3 KiB
Python
"""APScheduler job definitions dla worker'a (M5).
|
||
|
||
Default schedule:
- tpdb — every 6h, delta since the last successful run
- stashdb — every 6h, delta
- performer-driven — every 12h, top-N performers from the database (auto-discovers new
  scenes through ALL_DIRECT_SCRAPERS — per-tube HTTP scraping of 25 tubes)
- browse-latest — every 24h, newest scenes from the rich-metadata tubes
- movie-ingest — every 24h, one ingest run per movie connector
- performer-continuous — a tick every N seconds, 1 performer per tick (ORDER BY last_searched_at)

The configuration (intervals, enable/disable) can be overridden through env (`GOON_SCHED_*`),
see `app/scheduler/config.py`.

Note: APScheduler runs in-process (BlockingScheduler) — sufficient for a self-hosted single
worker. A multi-worker setup would need a Redis/SQLAlchemy job store plus a distributed lock.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from typing import Any
|
||
|
||
from apscheduler.schedulers.blocking import BlockingScheduler
|
||
from apscheduler.triggers.interval import IntervalTrigger
|
||
|
||
from app.connectors import get_movie_connectors
|
||
from app.connectors.stashdb import StashDBConnector
|
||
from app.connectors.tpdb import TPDBConnector
|
||
from app.ingest import ingest_from_connector, ingest_movies_from_connector
|
||
from app.scheduler.browse_latest import run_browse_latest
|
||
from app.scheduler.performer_driven import run_continuous_one_at_a_time, run_performer_driven
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
|
||
def _job_tpdb() -> None:
    """Run one TPDB delta ingest, logging any exception instead of raising it."""
    log.info("[scheduler] tpdb delta starting")
    try:
        # The connector is created inside the guard so that a failing
        # constructor is logged like any other job error.
        connector = TPDBConnector()
        ingest_from_connector(connector, use_delta=True)
    except Exception:
        log.exception("[scheduler] tpdb job failed")
|
||
|
||
|
||
def _job_stashdb() -> None:
    """Run one StashDB delta ingest, logging any exception instead of raising it."""
    log.info("[scheduler] stashdb delta starting")
    try:
        # The connector is created inside the guard so that a failing
        # constructor is logged like any other job error.
        connector = StashDBConnector()
        ingest_from_connector(connector, use_delta=True)
    except Exception:
        log.exception("[scheduler] stashdb job failed")
|
||
|
||
|
||
def _job_performer_driven(top_n: int) -> None:
    """Run the performer-driven job for ``top_n`` performers.

    Each performer is searched with a fixed ``per_performer_limit`` of 50.
    Any exception is logged instead of being raised.
    """
    log.info("[scheduler] performer-driven top-%d starting", top_n)
    run_options = {"top_n": top_n, "per_performer_limit": 50}
    try:
        run_performer_driven(**run_options)
    except Exception:
        log.exception("[scheduler] performer-driven job failed")
|
||
|
||
|
||
def _job_browse_latest(max_pages: int) -> None:
    """Browse-latest: scrape the newest scenes from rich-metadata tubes (shyfap + ...).

    Complements the performer-driven job: this one forward-fills (new scenes),
    while performer-driven works backward from performers that are already known.
    Any exception is logged instead of being raised.
    """
    log.info("[scheduler] browse-latest starting (max_pages=%d)", max_pages)
    browse_options = {"max_pages": max_pages}
    try:
        run_browse_latest(**browse_options)
    except Exception:
        log.exception("[scheduler] browse-latest job failed")
|
||
|
||
|
||
def _job_movie_ingest() -> None:
    """Movie ingest: paradisehill (primary) plus the dooplay mirrors.

    Paradisehill, as the primary source, provides the canonical movie record
    (title + year + studio). The dooplay mirrors (mangoporn/streamporn/pandamovies)
    attach playback sources from native-friendly origins (mangoporn:luluvid, :voe,
    etc.); `extract_stream_from_hoster` resolves those to a direct stream URL, so
    the mobile client plays natively instead of through a WebView.

    Mirror-to-primary matching goes through `resolve_movie` (title+year+studio
    similarity). Every connector gets its own IngestRun and a delta since its last
    successful run.

    Order: paradisehill FIRST (so the mirrors have something to attach to), then
    the mirrors; the order is whatever `get_movie_connectors()` yields. A single
    failing connector does NOT stop the others, because each one runs inside its
    own try/except.
    """
    for connector_name, connector_cls in get_movie_connectors():
        log.info("[scheduler] movie ingest %s starting", connector_name)
        try:
            connector = connector_cls()
            ingest_movies_from_connector(connector, use_delta=True)
        except Exception:
            log.exception("[scheduler] movie ingest %s failed", connector_name)
|
||
|
||
|
||
def _job_performer_continuous(refresh_after_days: int) -> None:
    """Continuous worker: 1 performer per tick, ORDER BY last_searched_at NULLS FIRST.

    Each tick runs a full search across ~25 tubes (per_performer_limit=None) and
    takes roughly 50-80s. With the interval set to 15s plus max_instances=1 and
    coalesce=True, the real rate is max(15s, tick_duration), i.e. effectively about
    one performer every 50-80s. Any exception is logged instead of being raised.
    """
    search_options = {
        "refresh_after_days": refresh_after_days,
        "per_performer_limit": None,  # no cap: full coverage of all tubes
    }
    try:
        run_continuous_one_at_a_time(**search_options)
    except Exception:
        log.exception("[scheduler] performer-continuous failed")
|
||
|
||
|
||
def _add_interval_job(
    sched: BlockingScheduler,
    func: Any,
    job_id: str,
    *,
    args: tuple[Any, ...] = (),
    **interval: int,
) -> None:
    """Register ``func`` on ``sched`` as a non-overlapping interval job.

    ``interval`` is forwarded to ``IntervalTrigger`` (e.g. ``hours=6`` or
    ``seconds=15``); ``args`` are the positional arguments ``func`` is called with.
    """
    sched.add_job(
        func,
        IntervalTrigger(**interval),
        # Passing the real callable plus ``args`` (rather than a lambda) keeps a
        # meaningful job name in APScheduler's own logs.
        args=args,
        id=job_id,
        replace_existing=True,
        # Never run two instances of the same job at once, and collapse a
        # backlog of missed runs into a single run.
        max_instances=1,
        coalesce=True,
    )


def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
    """Build a UTC ``BlockingScheduler`` holding the jobs enabled in ``cfg``.

    A job is registered only when its interval key is truthy, so ``None``, ``0``
    or a missing key disables that job.

    cfg keys:
        tpdb_hours: int | None                  (None = disabled)
        stashdb_hours: int | None
        performer_driven_hours: int | None
        performer_driven_top_n: int             (falls back to 20 when falsy)
        browse_latest_hours: int | None
        browse_latest_max_pages: int            (falls back to 5 when falsy)
        movie_ingest_hours: int | None
        performer_continuous_seconds: int | None
        performer_continuous_refresh_days: int  (falls back to 30 when falsy)

    Returns:
        The configured scheduler; it is not started here.
    """
    sched = BlockingScheduler(timezone="UTC")

    tpdb_hours = cfg.get("tpdb_hours")
    if tpdb_hours:
        _add_interval_job(sched, _job_tpdb, "tpdb", hours=tpdb_hours)
        log.info("scheduler: tpdb every %dh", tpdb_hours)

    stashdb_hours = cfg.get("stashdb_hours")
    if stashdb_hours:
        _add_interval_job(sched, _job_stashdb, "stashdb", hours=stashdb_hours)
        log.info("scheduler: stashdb every %dh", stashdb_hours)

    performer_driven_hours = cfg.get("performer_driven_hours")
    if performer_driven_hours:
        top_n = cfg.get("performer_driven_top_n") or 20
        _add_interval_job(
            sched,
            _job_performer_driven,
            "performer_driven",
            args=(top_n,),
            hours=performer_driven_hours,
        )
        log.info(
            "scheduler: performer-driven every %dh (top_n=%d)",
            performer_driven_hours,
            top_n,
        )

    browse_latest_hours = cfg.get("browse_latest_hours")
    if browse_latest_hours:
        max_pages = cfg.get("browse_latest_max_pages") or 5
        _add_interval_job(
            sched,
            _job_browse_latest,
            "browse_latest",
            args=(max_pages,),
            hours=browse_latest_hours,
        )
        log.info(
            "scheduler: browse-latest every %dh (max_pages=%d)",
            browse_latest_hours,
            max_pages,
        )

    movie_ingest_hours = cfg.get("movie_ingest_hours")
    if movie_ingest_hours:
        _add_interval_job(
            sched, _job_movie_ingest, "movie_ingest", hours=movie_ingest_hours
        )
        log.info("scheduler: movie-ingest every %dh", movie_ingest_hours)

    seconds = cfg.get("performer_continuous_seconds")
    if seconds:
        refresh_days = cfg.get("performer_continuous_refresh_days") or 30
        _add_interval_job(
            sched,
            _job_performer_continuous,
            "performer_continuous",
            args=(refresh_days,),
            seconds=seconds,
        )
        log.info(
            "scheduler: performer-continuous every %ds (refresh_after=%dd)",
            seconds,
            refresh_days,
        )

    return sched
|
||
|
||
|
||
# Default values for the cfg keys read by build_scheduler.
DEFAULT_CONFIG: dict[str, Any] = {
    "tpdb_hours": 6,
    "stashdb_hours": 6,
    "performer_driven_hours": 12,
    "performer_driven_top_n": 20,
    # Browse-latest: newest scenes from rich-metadata tubes (shyfap, ...). Once a day
    # at ~100 scenes/tube/run is a small budget, and it catches fresh scenes whose
    # performer is not known yet (newcomers; the canonical ingest fills them in later).
    "browse_latest_hours": 24,
    "browse_latest_max_pages": 5,
    # Movies: paradisehill + dooplay mirrors. Once a day is enough (these sites grow
    # more slowly than the tubes). Most important: the mirrors add native-friendly
    # playback sources to paradisehill movies, so mobile plays natively instead of
    # through a WebView.
    "movie_ingest_hours": 24,
    # Continuous worker: a tick every 15s, but max_instances=1 + coalesce make the
    # effective rate max(15s, tick_duration). A full-coverage tick (25 tubes) takes
    # ~50-80s, so in practice ~1 performer/60s. With 14.7k performers that is a
    # ~10 day full sweep, plus a refresh of each performer every 30 days.
    "performer_continuous_seconds": 15,
    "performer_continuous_refresh_days": 30,
}
|