goon/app/config.py
jtrzupek 0424cb9138 feat(scheduler): per-origin ingest freshness watchdog -> Sentry
The global source monitor can't catch a single stalled tube because every tube scraper shares one Source row (tube-scraper), so an aggregate run still reports success while one origin freezes (freshporno browsing the rotating KVS homepage root, report 14f3a655). New watchdog checks max(created_at) per active browse-scraper origin (tube:<sitetag>); if a tube with history hasn't produced a new scene in > max_age_hours it fires a Sentry message with a stable per-origin fingerprint (age in extras, not the title, so it stays one grouped issue). Runs every 6h, 48h threshold, both env-tunable (GOON_SCHED_INGEST_WATCHDOG_HOURS / GOON_INGEST_WATCHDOG_MAX_AGE_HOURS). Verified: 0 stale at 48h post-fix, detects neporn at a strict 12h threshold.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-15 10:26:25 +02:00

180 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from functools import lru_cache
from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings loaded from the environment and an optional `.env` file.

    Fields declared with a `validation_alias` are populated from that
    environment variable. The `*_raw` string fields hold unparsed values;
    use the matching properties below to get the parsed form.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)

    database_url: str = Field(
        default="postgresql+psycopg://goon:goon@localhost:5432/goon",
        validation_alias="DATABASE_URL",
    )

    tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
    tpdb_base_url: str = Field(
        default="https://api.theporndb.net", validation_alias="TPDB_BASE_URL"
    )

    stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
    stashdb_graphql_url: str = Field(
        default="https://stashdb.org/graphql", validation_alias="STASHDB_GRAPHQL_URL"
    )

    log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")

    # Sentry observability — an empty DSN makes init a no-op (dev/local). The cloud
    # free tier (5k errors/month) is enough for a single-user app.
    sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
    sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
    sentry_traces_sample_rate: float = Field(
        default=0.1, validation_alias="SENTRY_TRACES_SAMPLE_RATE"
    )

    # Comma-separated list of API keys. Empty = auth disabled (dev/local only).
    api_keys_raw: str = Field(default="", validation_alias="API_KEYS")

    # Whitelist of APK signature SHA256 hashes (hex) accepted by the backend. Every
    # mobile request sends `X-App-Signature` with the signing-cert hash
    # (PackageManager.GET_SIGNING_CERTIFICATES). Empty = check disabled (dev/initial
    # rollout). List = comma-separated lowercase hex. Re-packaging the APK with a
    # different keystore changes the hash → 403.
    allowed_app_sig_hashes_raw: str = Field(default="", validation_alias="ALLOWED_APP_SIG_HASH")

    auto_merge_threshold: float = 0.92
    review_threshold: float = 0.75
    fingerprint_hamming_max: int = 5
    title_token_set_min: int = 88
    date_window_days: int = 7

    # Skip ingesting clip-store scenes (ManyVids/IWantClips/Clips4Sale/...) from
    # canonical sources — they are permanent orphans (free tubes don't host them),
    # ~56% of the TPDB/StashDB ingest. False = ingest as before. Tubes with a
    # clip-store studio are NOT skipped (they have playback).
    skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")

    # Minimum duration of a tube/scraper scene at ingest — < N s = trailer/teaser/
    # preview. 0 = disabled. An unknown duration is not filtered out. Does NOT apply
    # to canonical sources (TPDB/StashDB).
    min_ingest_duration_sec: int = Field(default=180, validation_alias="GOON_MIN_INGEST_DURATION_SEC")

    # APScheduler (M5). Any value of 0/None = job disabled.
    sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
    sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
    sched_performer_driven_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS"
    )
    sched_performer_driven_top_n: int = Field(
        default=20, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N"
    )

    # Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
    # = max(15, real_tick_duration). A real tick takes ~50-80s at full coverage.
    # Set to 0 to disable.
    sched_performer_continuous_seconds: int = Field(
        default=15, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS"
    )
    sched_performer_continuous_refresh_days: int = Field(
        default=30, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS"
    )

    # Movie ingest — paradisehill (primary) + dooplay mirrors (mangoporn/streamporn/
    # pandamovies). Each connector records its own `Source` and does a delta from the
    # last successful run. Set to 0 to disable. Defaults to 24h: movie sites grow more
    # slowly than tubes (~5-30 new per day), so sweeping more often is pointless.
    sched_movie_ingest_hours: int = Field(
        default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
    )

    # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
    # 6h cadence (changed from 24h on 2026-05-20): a user reported Brazzers Exxtra
    # missing after 05-15. The root cause was two-fold: (1) freshporno publishes scenes
    # throughout the day, so a 24h cadence only catches those up to 05:30 UTC; (2) a
    # separate meta_content/release_date bug. 6h = 4 runs/day = every freshporno scene
    # is ingested within ~6h of publication.
    sched_browse_latest_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
    )
    sched_browse_latest_max_pages: int = Field(
        default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
    )

    # Deep-crawl (Phase 2a) — full catalogs of the browse-tubes (porndoe ~62k etc.),
    # not just top-N. Round-robin over tubes, resumable cursor
    # (app/_state/deepcrawl_state.json). 0 = disabled.
    # 60 pages/run × ~31 scenes ≈ 1860 scenes/run (~22 min, hard timeout 1h).
    sched_deep_crawl_hours: int = Field(default=1, validation_alias="GOON_SCHED_DEEP_CRAWL_HOURS")
    deep_crawl_pages_per_run: int = Field(default=60, validation_alias="GOON_DEEP_CRAWL_PAGES_PER_RUN")
    deepcrawl_state_path: str = Field(default="", validation_alias="GOON_DEEPCRAWL_STATE_PATH")

    # Bulk-dedup performers safety net — auto-merges duplicates that resolver-time
    # scoring missed. 12h cadence: runs 2x per day (after the morning browse-latest run).
    sched_bulk_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
    )

    # Thumb-asset dedup — merges hdporn.gg/fullmovies.xxx dupes (same film, different
    # titles, same thumbnail asset-id + duration). bulk_dedup does not catch these
    # (no phash/title match). Re-ingests under new titles make the dupes grow back,
    # hence the recurring job. 12h. 0 = off.
    sched_thumb_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_THUMB_DEDUP_HOURS"
    )

    # Ingest freshness watchdog — alerts Sentry when an active browse-tube (origin
    # tube:<sitetag>) has produced no new scenes for > max_age_hours. Catches a single
    # frozen origin that the global monitor (one Source "tube-scraper") cannot see
    # (e.g. freshporno browsing the rotating root, report 14f3a655). 6h cadence
    # (after browse-latest), 48h threshold. Any value of 0/None = disabled.
    sched_ingest_watchdog_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_INGEST_WATCHDOG_HOURS"
    )
    ingest_watchdog_max_age_hours: int = Field(
        default=48, validation_alias="GOON_INGEST_WATCHDOG_MAX_AGE_HOURS"
    )

    # Taxonomy scene_count refresh — recomputes the denormalized scene counters on
    # tags/performers/studios (the hot path /tags|/performers|/studios|/favorites reads
    # a ready column instead of aggregating 6.3M scene_tags per request). 3h cadence —
    # counts that stale do not matter for the "popular" sort + "(N)" badge. 0 = off.
    sched_taxonomy_counts_hours: int = Field(
        default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
    )

    # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
    # in the Hetzner Cloud panel). Without a token the monitor is disabled (warning
    # in the log). Free traffic per server: CX22=20TB, CPX21=20TB etc. Overage = €1/TB.
    hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
    hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
    # Alert thresholds (% of included_traffic) — Sentry severity levels.
    hetzner_alert_info_pct: int = Field(default=50, validation_alias="HETZNER_ALERT_INFO_PCT")
    hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
    hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")

    # Bright Data ISP proxy (static ISP IPs, billed flat-rate NOT per-GB) — used for
    # HTML ingest (scraping) of tubes that block the VPS IP with a hard Cloudflare 403
    # even with browser-TLS (superporn). Streams do not go through the proxy anyway
    # (CDN tokens are IP-bound). Env format: `host:port:user:pass` (Bright Data panel).
    # Empty = none.
    brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL")

    @property
    def brightdata_proxy_url(self) -> str | None:
        """Convert `host:port:user:pass` into `http://user:pass@host:port`.

        The result is intended for curl_cffi/httpx. The user and password are
        percent-encoded so that reserved characters ("@", "/", ":", "#", ...)
        cannot corrupt the URL, and the value is split at most three times so
        a password may itself contain ":".

        Returns:
            The proxy URL, or None when the setting is unset or does not have
            four non-empty fields.
        """
        # maxsplit=3: everything after the third ":" belongs to the password.
        fields = self.brightdata_proxy_raw.strip().split(":", 3)
        if len(fields) != 4 or not all(fields):
            return None
        host, port, user, pwd = fields
        # safe="" also encodes "/" — nothing in the userinfo part may stay reserved.
        credentials = f"{quote(user, safe='')}:{quote(pwd, safe='')}"
        return f"http://{credentials}@{host}:{port}"

    @property
    def api_keys(self) -> set[str]:
        """Return the API keys parsed from `api_keys_raw`, trimmed, without blanks."""
        return {k.strip() for k in self.api_keys_raw.split(",") if k.strip()}

    @property
    def auth_enabled(self) -> bool:
        """Return True when at least one API key is configured."""
        return bool(self.api_keys)

    @property
    def allowed_app_sig_hashes(self) -> set[str]:
        """Return the allowed APK signature hashes, normalized for comparison.

        Each entry of `allowed_app_sig_hashes_raw` is trimmed, lowercased and
        stripped of ":" separators; blank entries are dropped.
        """
        return {
            h.strip().lower().replace(":", "")
            for h in self.allowed_app_sig_hashes_raw.split(",")
            if h.strip()
        }

    @property
    def app_sig_check_enabled(self) -> bool:
        """Return True when at least one allowed signature hash is configured."""
        return bool(self.allowed_app_sig_hashes)
@lru_cache
def get_settings() -> Settings:
    """Return the shared `Settings` instance.

    The function takes no arguments, so `lru_cache` builds `Settings` on the
    first call and returns that same object on every later call.
    """
    settings = Settings()
    return settings