goon/app/config.py
jtrzupek c154deab37 feat(sources): 0-5★ ranking on Sites (freshness/metadata/plays) + playback telemetry
Rates each source on three axes the user asked for:
- freshness: how recently/often new content arrives (newest age + 7d volume)
- richness: metadata coverage (thumbnail/tags/performers/description/studio/duration)
- plays: does it actually play — from real playback telemetry when available,
  else a proxy from the resolve mechanism. 0★ = offline (gates the overall stars,
  so a fresh+rich source that doesn't play still ranks bottom — the hqfap/4k69 case)

Backend:
- playback_events: fire-and-forget telemetry POST from the app per playback attempt
  (origin + success/error + time-to-first-frame), append-only, 30d retention
- source_stats: per-origin computed scores, refreshed by a scheduler job (6h);
  /sources joins it and sorts by stars
- models + local migration 0025; new GOON_SCHED_SOURCE_STATS_HOURS setting

Mobile:
- Sites rows show ★ rating; tap the stars for a breakdown (axes + metadata %, plus
  whether "plays" is measured or estimated)
- PlayerScreen reports playback success/failure per source (native path only —
  symmetric, conservative); origin threaded through Scene/Movie play callsites

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-22 10:00:59 +02:00

208 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from functools import lru_cache
from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration loaded from environment variables / the `.env` file.

    Fields with a `validation_alias` are read from that environment variable name;
    the derived `@property` accessors parse the raw comma/colon-separated strings.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)

    database_url: str = Field(
        default="postgresql+psycopg://goon:goon@localhost:5432/goon",
        validation_alias="DATABASE_URL",
    )

    tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
    tpdb_base_url: str = Field(
        default="https://api.theporndb.net", validation_alias="TPDB_BASE_URL"
    )

    stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
    stashdb_graphql_url: str = Field(
        default="https://stashdb.org/graphql", validation_alias="STASHDB_GRAPHQL_URL"
    )

    log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")

    # Sentry observability — an empty DSN makes init a no-op (dev/local). The cloud
    # free tier (5k errors/month) is enough for a single-user app.
    sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
    sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
    sentry_traces_sample_rate: float = Field(
        default=0.1, validation_alias="SENTRY_TRACES_SAMPLE_RATE"
    )

    # Comma-separated list of API keys. Empty = auth disabled (dev/local only).
    api_keys_raw: str = Field(default="", validation_alias="API_KEYS")

    # Whitelist of SHA256 (hex) APK signature hashes accepted by the backend. Every
    # mobile request sends `X-App-Signature` carrying the hash of the signing cert
    # (PackageManager.GET_SIGNING_CERTIFICATES). Empty = check disabled (dev/initial
    # rollout). List = comma-separated lowercase hex. Re-packaging the APK with a
    # different keystore changes the hash → 403.
    allowed_app_sig_hashes_raw: str = Field(
        default="", validation_alias="ALLOWED_APP_SIG_HASH"
    )

    auto_merge_threshold: float = 0.92
    review_threshold: float = 0.75
    fingerprint_hamming_max: int = 5
    title_token_set_min: int = 88
    date_window_days: int = 7

    # Skip ingesting clip-store scenes (ManyVids/IWantClips/Clips4Sale/...) from
    # canonical sources — they are permanent orphans (free tubes do not host them),
    # ~56% of the TPDB/StashDB ingest. False = ingest as before. Tube scenes with a
    # clip-store studio are NOT skipped (they have playback).
    skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")

    # Minimum duration of a tube/scraper scene at ingest — shorter than N s means
    # trailer/teaser/preview. 0 = disabled. An unknown duration is not filtered out.
    # Does NOT apply to canonical sources (TPDB/StashDB).
    min_ingest_duration_sec: int = Field(
        default=180, validation_alias="GOON_MIN_INGEST_DURATION_SEC"
    )

    # APScheduler (M5). Any 0/None = job disabled.
    sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
    sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
    sched_performer_driven_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS"
    )
    sched_performer_driven_top_n: int = Field(
        default=20, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N"
    )

    # Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
    # = max(15, real_tick_duration). Real tick ~50-80s at full coverage. Set to 0 to disable.
    sched_performer_continuous_seconds: int = Field(
        default=15, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS"
    )
    sched_performer_continuous_refresh_days: int = Field(
        default=30, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS"
    )

    # Movie ingest — paradisehill (primary) + dooplay mirrors (mangoporn/streamporn/
    # pandamovies). Each connector writes its own `Source` and does a delta from the
    # last successful run. Set to 0 to disable. Default 24h: movie sites grow slower
    # than tubes (~5-30 new per day), so sweeping more often is pointless.
    sched_movie_ingest_hours: int = Field(
        default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
    )

    # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
    # 6h cadence (changed from 24h on 2026-05-20): the user reported Brazzers Exxtra
    # missing after 15-05. The root cause was two-fold: (1) freshporno publishes scenes
    # throughout the day, so a 24h cadence only catches those up to 05:30 UTC; (2) a
    # separate meta_content/release_date bug.
    # 6h = 4 runs/day = every freshporno scene ingested within ~6h of publication.
    sched_browse_latest_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
    )
    sched_browse_latest_max_pages: int = Field(
        default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
    )

    # Deep-crawl (Phase 2a) — full catalogues of the browse tubes (porndoe ~62k etc.),
    # not just top-N. Round-robin across tubes, resumable cursor
    # (app/_state/deepcrawl_state.json). 0 = disabled.
    # 60 pages/run × ~31 scenes ≈ 1860 scenes/run (~22 min, hard timeout 1h).
    sched_deep_crawl_hours: int = Field(
        default=1, validation_alias="GOON_SCHED_DEEP_CRAWL_HOURS"
    )
    deep_crawl_pages_per_run: int = Field(
        default=60, validation_alias="GOON_DEEP_CRAWL_PAGES_PER_RUN"
    )
    deepcrawl_state_path: str = Field(
        default="", validation_alias="GOON_DEEPCRAWL_STATE_PATH"
    )

    # Bulk-dedup performers safety net — auto-merges duplicates that resolver-time
    # scoring missed. 12h cadence: runs 2x a day (after the morning browse-latest run).
    sched_bulk_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
    )

    # Thumb-asset dedup — merges hdporn.gg/fullmovies.xxx dupes (same film, different
    # titles, same thumbnail asset-id + duration). bulk_dedup does not catch these
    # (no phash/title). Re-ingests under new titles → dupes grow back, hence the
    # recurring job. 12h. 0 = off.
    sched_thumb_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_THUMB_DEDUP_HOURS"
    )

    # Title+duration dedup — merges missing-merge dupes (same performer + identical
    # normalized title + duration to the second) that bulk_dedup does not catch (tube
    # re-scrape / cross-tube e.g. porn00 vs xnxx, reports 28fe8181/32df33b1). They grow
    # back on re-ingest, hence recurring. 12h, playback-only (what the user sees). 0 = off.
    sched_title_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_TITLE_DEDUP_HOURS"
    )

    # Ingest freshness watchdog — alerts Sentry when an active tube (origin
    # tube:<sitetag>) has stopped yielding new scenes for longer than the threshold.
    # Catches a freeze of a single origin, which the global monitor (a single Source
    # "tube-scraper") does not see (e.g. freshporno browse from a rotating root,
    # report 14f3a655). 6h cadence (after browse-latest). Any 0/None = disabled.
    sched_ingest_watchdog_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_INGEST_WATCHDOG_HOURS"
    )
    # Threshold for browse scrapers (ALL_BROWSE_SCRAPERS) — crawled once a day from
    # the listing, so 48h of silence = anomaly.
    ingest_watchdog_max_age_hours: int = Field(
        default=48, validation_alias="GOON_INGEST_WATCHDOG_MAX_AGE_HOURS"
    )
    # Threshold for performer-driven search scrapers (ALL_DIRECT_SCRAPERS) — the
    # cadence is uneven (continuous queue ~30d refresh per performer, orphan-heavy
    # ingest), so 48h would give false positives. 7d (168h): healthy search tubes were
    # observed with <6h freshness (the continuous tick hits all tubes per performer),
    # frozen ones at ≥73h → ~28× margin.
    ingest_watchdog_search_max_age_hours: int = Field(
        default=168, validation_alias="GOON_INGEST_WATCHDOG_SEARCH_MAX_AGE_HOURS"
    )

    # Taxonomy scene_count refresh — recomputes the denormalized scene counters on
    # tags/performers/studios (the hot path /tags|/performers|/studios|/favorites reads
    # a ready-made column instead of aggregating 6.3M scene_tags per request). 3h
    # cadence — counts that stale do not matter for the "popular" sort + "(N)" badge.
    # 0 = off.
    sched_taxonomy_counts_hours: int = Field(
        default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
    )

    # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
    # in the Hetzner Cloud panel). Without a token the monitor is disabled (warning in
    # the log). Free traffic per server: CX22=20TB, CPX21=20TB etc. Overage = €1/TB.
    hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
    hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
    # Alert thresholds (% of included_traffic) — Sentry severity levels.
    hetzner_alert_info_pct: int = Field(
        default=50, validation_alias="HETZNER_ALERT_INFO_PCT"
    )
    hetzner_alert_warning_pct: int = Field(
        default=80, validation_alias="HETZNER_ALERT_WARNING_PCT"
    )
    hetzner_alert_error_pct: int = Field(
        default=95, validation_alias="HETZNER_ALERT_ERROR_PCT"
    )
    # Cadence of the transfer check (hours). 0/None = monitor disabled. Default 6h
    # (transfer grows slowly; more often is pointless). Only works when token+id are set.
    sched_hetzner_monitor_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_HETZNER_MONITOR_HOURS"
    )

    # Source ranking (Sites screen) — recompute source_stats (freshness/richness/health
    # per origin). 0/None = disabled. Default 6h (richness is a heavy aggregate over
    # ~2M live playback_sources; more often is pointless, the data changes slowly).
    sched_source_stats_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_SOURCE_STATS_HOURS"
    )

    # Bright Data ISP proxy (static ISP-issued IP, billed flat-rate, NOT per-GB) —
    # used for HTML ingest (scraping) of tubes that block the VPS IP with a hard
    # Cloudflare 403 even with browser-TLS (superporn). The stream is not routed through
    # the proxy anyway (CDN tokens are IP-bound). Env format: `host:port:user:pass`
    # (Bright Data panel). Empty = none.
    brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL")

    @property
    def brightdata_proxy_url(self) -> str | None:
        """Convert `host:port:user:pass` into `http://user:pass@host:port`.

        The result is meant for curl_cffi/httpx. Returns None when the raw value is
        unset or does not have four non-empty parts.
        """
        # maxsplit=3: only the first three colons are separators, so a password that
        # itself contains ":" stays intact instead of invalidating the whole value.
        parts = self.brightdata_proxy_raw.split(":", 3)
        if len(parts) != 4 or not all(parts):
            return None
        host, port, user, pwd = parts
        # Percent-encode the credentials so reserved characters (@ / : # ...) cannot
        # corrupt the userinfo section of the URL.
        return f"http://{quote(user, safe='')}:{quote(pwd, safe='')}@{host}:{port}"

    @property
    def api_keys(self) -> set[str]:
        """Return the non-empty, whitespace-trimmed entries of `api_keys_raw`."""
        return {k.strip() for k in self.api_keys_raw.split(",") if k.strip()}

    @property
    def auth_enabled(self) -> bool:
        """Return True when at least one API key is configured."""
        return bool(self.api_keys)

    @property
    def allowed_app_sig_hashes(self) -> set[str]:
        """Return the configured signature hashes, lowercased and with ":" removed."""
        return {
            h.strip().lower().replace(":", "")
            for h in self.allowed_app_sig_hashes_raw.split(",")
            if h.strip()
        }

    @property
    def app_sig_check_enabled(self) -> bool:
        """Return True when at least one allowed signature hash is configured."""
        return bool(self.allowed_app_sig_hashes)
@lru_cache
def get_settings() -> Settings:
    """Return the shared `Settings` instance.

    The function takes no arguments, so `lru_cache` memoizes its single result:
    `Settings()` is constructed on the first call and reused afterwards.
    """
    settings = Settings()
    return settings