goon/app/scheduler/jobs.py
https://github.com/goon-foss/goon 642f1ab8b8 Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector
Mobile / OTA:
- Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org
- Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version)
- backend.ts: default public backend auto-connect (no manual login)

WebView fallback fix (PlayerScreen INJECTED_JS):
- Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init)
- Context-scoped: only clicks consent buttons inside cookie/gdpr containers
- Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init)

Resolver:
- Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited)
  → composite_score hard-reject / cap; wired into scene_score + bulk_dedup
- aggregator-mode candidate query: LIMIT 500 + title-match ordering

Connectors:
- porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot

landing: APK links → goon-v0.1.9.apk

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 11:20:57 +02:00

261 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""APScheduler job definitions dla worker'a (M5).
Default schedule:
- tpdb — every 6h, delta since the last successful run
- stashdb — every 6h, delta
- performer-driven — every 12h, top-N performers from the database (auto-discovers new scenes via
ALL_DIRECT_SCRAPERS — 25 tubes, per-tube HTTP scraping)
- browse-latest — every 24h, newest scenes from the rich-metadata tubes
- movie-ingest — every 24h, one delta ingest per movie connector
- performer-continuous — ticks every N seconds, 1 performer per tick (ORDER BY last_searched_at)
- bulk-dedup (performers) — only scheduled when `bulk_dedup_hours` is configured; it has no
entry in DEFAULT_CONFIG
The configuration (intervals, enable/disable) can be overridden through env (`GOON_SCHED_*`),
see `app/scheduler/config.py`.
Note: APScheduler runs in-process (BlockingScheduler) — good enough for a self-hosted single
worker. A multi-worker setup would need a Redis/SQLAlchemy job store plus a distributed lock.
"""
from __future__ import annotations
import logging
from typing import Any
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
from app.connectors import get_movie_connectors
from app.connectors.stashdb import StashDBConnector
from app.connectors.tpdb import TPDBConnector
from app.ingest import ingest_from_connector, ingest_movies_from_connector
from app.scheduler.browse_latest import run_browse_latest
from app.scheduler.performer_driven import run_continuous_one_at_a_time, run_performer_driven
log = logging.getLogger(__name__)
def _job_tpdb() -> None:
    """Run one delta ingest from the TPDB connector.

    Scheduler entry point: any exception raised while building the connector
    or ingesting is logged with its traceback and is not re-raised.
    """
    log.info("[scheduler] tpdb delta starting")
    try:
        connector = TPDBConnector()
        ingest_from_connector(connector, use_delta=True)
    except Exception:
        # Job boundary: record the failure instead of propagating it.
        log.exception("[scheduler] tpdb job failed")
def _job_stashdb() -> None:
    """Run one delta ingest from the StashDB connector.

    Scheduler entry point: any exception raised while building the connector
    or ingesting is logged with its traceback and is not re-raised.
    """
    log.info("[scheduler] stashdb delta starting")
    try:
        connector = StashDBConnector()
        ingest_from_connector(connector, use_delta=True)
    except Exception:
        # Job boundary: record the failure instead of propagating it.
        log.exception("[scheduler] stashdb job failed")
def _job_performer_driven(top_n: int) -> None:
    """Run the performer-driven pass for the ``top_n`` performers.

    Delegates to ``run_performer_driven`` with a fixed ``per_performer_limit``
    of 50. Failures are logged with a traceback and are not re-raised.

    Args:
        top_n: Number of performers to process; forwarded unchanged.
    """
    log.info("[scheduler] performer-driven top-%d starting", top_n)
    try:
        run_performer_driven(top_n=top_n, per_performer_limit=50)
    except Exception:
        # Job boundary: record the failure instead of propagating it.
        log.exception("[scheduler] performer-driven job failed")
def _job_browse_latest(max_pages: int) -> None:
    """Scrape the newest scenes from the rich-metadata tubes (shyfap + ...).

    Complements the performer-driven job: this one fills forward (new scenes),
    while performer-driven works backward from already-known performers.
    Failures are logged with a traceback and are not re-raised.

    Args:
        max_pages: Page budget forwarded unchanged to ``run_browse_latest``.
    """
    log.info("[scheduler] browse-latest starting (max_pages=%d)", max_pages)
    try:
        run_browse_latest(max_pages=max_pages)
    except Exception:
        # Job boundary: record the failure instead of propagating it.
        log.exception("[scheduler] browse-latest job failed")
def _job_movie_ingest() -> None:
    """Run a delta movie ingest for every connector from ``get_movie_connectors()``.

    Background (author's notes): paradisehill is the primary source and
    provides the canonical movie record (title + year + studio). The dooplay
    mirrors (mangoporn/streamporn/pandamovies) attach playback sources from
    native-friendly origins (mangoporn:luluvid, :voe, etc.), which
    ``extract_stream_from_hoster`` resolves to a direct stream URL so that
    mobile can play natively instead of through a WebView. Mirror-to-primary
    matching goes through ``resolve_movie`` (title + year + studio similarity).

    Connectors run in the order ``get_movie_connectors()`` yields them; the
    design expects paradisehill first so the mirrors have records to attach
    to (NOTE(review): that ordering is defined outside this function).

    Each connector runs in its own try/except, so one failing connector is
    logged and does not stop the remaining ones.
    """
    for connector_name, connector_cls in get_movie_connectors():
        log.info("[scheduler] movie ingest %s starting", connector_name)
        try:
            connector = connector_cls()
            ingest_movies_from_connector(connector, use_delta=True)
        except Exception:
            # Isolate the failure to this connector and carry on with the next.
            log.exception("[scheduler] movie ingest %s failed", connector_name)
def _job_bulk_dedup_performers() -> None:
    """Run the pair-wise, performer-overlap bulk dedup as a safety net.

    Catches duplicates that resolver-time scoring missed.

    Motivating case (bug report 2026-05-20, "no Brazzers Exxtra after 15-05"),
    per the author's notes: a freshporno scrape made before the release_date
    fix created duplicate scenes instead of merging them into the canonical
    TPDB scenes. With release_date the resolver score was above 0.92 (auto),
    but without it the weights shifted and scenes landed in review/new.

    The "performers" strategy is described as iterating per performer and
    scoring all of that performer's scenes pair-wise, auto-merging at
    score >= 0.92 and creating a pending merge_candidate for 0.75-0.92.
    NOTE(review): those thresholds live in ``run_bulk_dedup``, not here.

    The run is not a dry run (``dry_run=False``). Failures, including a failed
    import of the dedup module, are logged with a traceback and not re-raised.
    """
    log.info("[scheduler] bulk_dedup performers starting")
    try:
        # Imported lazily inside the try so an import error is logged like any
        # other failure of this job.
        from app.scheduler.bulk_dedup import run_bulk_dedup

        summary = run_bulk_dedup(strategy="performers", dry_run=False)
        log.info("[scheduler] bulk_dedup performers done: %s", summary)
    except Exception:
        log.exception("[scheduler] bulk_dedup performers failed")
def _job_performer_continuous(refresh_after_days: int) -> None:
    """Process a single performer per tick of the continuous worker.

    Per the author's notes, performers are picked ORDER BY last_searched_at
    NULLS FIRST and each tick does a full search across ~25 tubes, taking
    roughly 50-80s. With a 15s interval plus max_instances=1 and coalesce=True
    the real rate is max(15s, tick_duration), i.e. about one performer per
    50-80s.

    Failures are logged with a traceback and are not re-raised.

    Args:
        refresh_after_days: Forwarded unchanged to
            ``run_continuous_one_at_a_time``.
    """
    try:
        # per_performer_limit=None requests full coverage across all tubes.
        run_continuous_one_at_a_time(
            per_performer_limit=None,
            refresh_after_days=refresh_after_days,
        )
    except Exception:
        log.exception("[scheduler] performer-continuous failed")
def _add_interval_job(
    sched: BlockingScheduler,
    job_id: str,
    func: Any,
    trigger: IntervalTrigger,
    *args: Any,
) -> None:
    """Register ``func`` on ``sched`` with the policy shared by all jobs here.

    Every job is added with ``replace_existing=True``, ``max_instances=1`` and
    ``coalesce=True``. Positional ``args`` are handed to APScheduler through
    ``add_job(args=...)`` rather than being captured in a lambda, so the job
    keeps a reference to the real module-level function.
    """
    sched.add_job(
        func,
        trigger,
        args=args,
        id=job_id,
        replace_existing=True,
        max_instances=1,
        coalesce=True,
    )


def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
    """Build a UTC ``BlockingScheduler`` populated according to ``cfg``.

    A job is registered only when its interval key is present and truthy, so
    ``None``, ``0`` or a missing key disables that job. The scheduler is
    returned without being started.

    cfg keys:
        tpdb_hours: int | None                  (None = disabled)
        stashdb_hours: int | None
        performer_driven_hours: int | None
        performer_driven_top_n: int             (falls back to 20 when falsy)
        browse_latest_hours: int | None
        browse_latest_max_pages: int            (falls back to 5 when falsy)
        bulk_dedup_hours: int | None
        movie_ingest_hours: int | None
        performer_continuous_seconds: int | None
        performer_continuous_refresh_days: int  (falls back to 30 when falsy)

    Args:
        cfg: Mapping holding the keys listed above.

    Returns:
        The configured, not yet started, scheduler.
    """
    sched = BlockingScheduler(timezone="UTC")

    if cfg.get("tpdb_hours"):
        _add_interval_job(
            sched, "tpdb", _job_tpdb, IntervalTrigger(hours=cfg["tpdb_hours"])
        )
        log.info("scheduler: tpdb every %dh", cfg["tpdb_hours"])

    if cfg.get("stashdb_hours"):
        _add_interval_job(
            sched,
            "stashdb",
            _job_stashdb,
            IntervalTrigger(hours=cfg["stashdb_hours"]),
        )
        log.info("scheduler: stashdb every %dh", cfg["stashdb_hours"])

    if cfg.get("performer_driven_hours"):
        top_n = cfg.get("performer_driven_top_n") or 20
        _add_interval_job(
            sched,
            "performer_driven",
            _job_performer_driven,
            IntervalTrigger(hours=cfg["performer_driven_hours"]),
            top_n,
        )
        log.info(
            "scheduler: performer-driven every %dh (top_n=%d)",
            cfg["performer_driven_hours"],
            top_n,
        )

    if cfg.get("browse_latest_hours"):
        max_pages = cfg.get("browse_latest_max_pages") or 5
        _add_interval_job(
            sched,
            "browse_latest",
            _job_browse_latest,
            IntervalTrigger(hours=cfg["browse_latest_hours"]),
            max_pages,
        )
        log.info(
            "scheduler: browse-latest every %dh (max_pages=%d)",
            cfg["browse_latest_hours"], max_pages,
        )

    if cfg.get("bulk_dedup_hours"):
        _add_interval_job(
            sched,
            "bulk_dedup_performers",
            _job_bulk_dedup_performers,
            IntervalTrigger(hours=cfg["bulk_dedup_hours"]),
        )
        log.info("scheduler: bulk-dedup performers every %dh", cfg["bulk_dedup_hours"])

    if cfg.get("movie_ingest_hours"):
        _add_interval_job(
            sched,
            "movie_ingest",
            _job_movie_ingest,
            IntervalTrigger(hours=cfg["movie_ingest_hours"]),
        )
        log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"])

    if cfg.get("performer_continuous_seconds"):
        refresh_days = cfg.get("performer_continuous_refresh_days") or 30
        seconds = cfg["performer_continuous_seconds"]
        _add_interval_job(
            sched,
            "performer_continuous",
            _job_performer_continuous,
            IntervalTrigger(seconds=seconds),
            refresh_days,
        )
        log.info(
            "scheduler: performer-continuous every %ds (refresh_after=%dd)",
            seconds, refresh_days,
        )

    return sched
# Default scheduler configuration. There is no "bulk_dedup_hours" entry, so
# with these defaults alone build_scheduler() does not register the bulk-dedup
# job.
DEFAULT_CONFIG: dict[str, Any] = dict(
    tpdb_hours=6,
    stashdb_hours=6,
    performer_driven_hours=12,
    performer_driven_top_n=20,
    # Browse-latest — newest scenes from the rich-metadata tubes (shyfap, ...).
    # Once a day at ~100 scenes/tube/run is a small budget and picks up fresh
    # scenes whose performer is not known yet (newcomers → the canonical
    # ingest fills them in later).
    browse_latest_hours=24,
    browse_latest_max_pages=5,
    # Movies — paradisehill + dooplay mirrors. Once a day is enough (these
    # sites grow more slowly than the tubes). Most importantly, the mirrors
    # add native-friendly playback sources to paradisehill movies → mobile
    # plays natively instead of through a WebView.
    movie_ingest_hours=24,
    # Continuous worker: ticks every 15s, but max_instances=1 + coalesce make
    # the effective rate max(15s, tick_duration). A full-coverage tick
    # (25 tubes) takes ~50-80s, so realistically ~1 performer/60s. With 14.7k
    # performers that is a ~10 day full sweep, plus a refresh of each
    # performer every 30 days.
    performer_continuous_seconds=15,
    performer_continuous_refresh_days=30,
)