goon/app/config.py
jtrzupek 21bc8bf1fe feat(superporn): browse scraper via Bright Data residential proxy
superporn hard-blocks the VPS IP with Cloudflare 403 on every TLS
impersonation, so HTML ingest routes through Bright Data residential
(BRIGHTDATA_PROXY_URL, parsed in config). First scraper to use a proxy:
optional _proxy on the browse base, threaded into browser_get.

JSON-LD VideoObject (title/desc/uploadDate/thumb/duration) + pornstar
and category chips; superporn double-encodes HTML entities so titles
are unescaped twice. Thumbnails fetch fine from the VPS (no proxy).

Playback stays off-proxy: the <source> mp4 token is IP-bound to the
fetcher, so resolve is phone-side via WebView (extractor superporncom
-> _vps_blocked_fallback), same as porndoe.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 18:47:45 +02:00

163 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from functools import lru_cache
from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration read from the environment and an optional `.env` file.

    Environment names are matched case-insensitively and unknown keys are
    ignored (see ``model_config``). Raw comma/colon-separated string fields
    (``*_raw``) are exposed in parsed form through the properties at the
    bottom of the class.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)

    database_url: str = Field(
        default="postgresql+psycopg://goon:goon@localhost:5432/goon",
        validation_alias="DATABASE_URL",
    )

    tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
    tpdb_base_url: str = Field(
        default="https://api.theporndb.net", validation_alias="TPDB_BASE_URL"
    )

    stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
    stashdb_graphql_url: str = Field(
        default="https://stashdb.org/graphql", validation_alias="STASHDB_GRAPHQL_URL"
    )

    log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")

    # Sentry observability — an empty DSN makes init a no-op (dev/local). The cloud
    # free tier (5k errors/month) is enough for a single-user app.
    sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
    sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
    sentry_traces_sample_rate: float = Field(
        default=0.1, validation_alias="SENTRY_TRACES_SAMPLE_RATE"
    )

    # Comma-separated list of API keys. Empty = auth disabled (dev/local only).
    api_keys_raw: str = Field(default="", validation_alias="API_KEYS")

    # Whitelist of SHA256 (hex) APK signature hashes accepted by the backend. Every
    # mobile request sends `X-App-Signature` with the hash of the signing cert
    # (PackageManager.GET_SIGNING_CERTIFICATES). Empty = check disabled (dev/initial
    # rollout). List = comma-separated lowercase hex. Re-packaging the APK with a
    # different keystore changes the hash → 403.
    allowed_app_sig_hashes_raw: str = Field(default="", validation_alias="ALLOWED_APP_SIG_HASH")

    auto_merge_threshold: float = 0.92
    review_threshold: float = 0.75
    fingerprint_hamming_max: int = 5
    title_token_set_min: int = 88
    date_window_days: int = 7

    # Skip ingesting clip-store scenes (ManyVids/IWantClips/Clips4Sale/...) from
    # canonical sources — they are permanent orphans (free tubes don't host them),
    # ~56% of the TPDB/StashDB ingest. False = ingest as before. Tube scenes with a
    # clip-store studio are NOT skipped (they have playback).
    skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")

    # Minimum duration of a tube/scraper scene at ingest — under N seconds =
    # trailer/teaser/preview. 0 = disabled. An unknown duration is not filtered out.
    # Does NOT apply to canonical sources (TPDB/StashDB).
    min_ingest_duration_sec: int = Field(
        default=180, validation_alias="GOON_MIN_INGEST_DURATION_SEC"
    )

    # APScheduler (M5). Any 0/None value = job disabled.
    sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
    sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
    sched_performer_driven_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS"
    )
    sched_performer_driven_top_n: int = Field(
        default=20, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N"
    )

    # Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
    # = max(15, real_tick_duration). Real tick ~50-80s at full coverage. Set to 0 to disable.
    sched_performer_continuous_seconds: int = Field(
        default=15, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS"
    )
    sched_performer_continuous_refresh_days: int = Field(
        default=30, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS"
    )

    # Movie ingest — paradisehill (primary) + dooplay mirrors (mangoporn/streamporn/
    # pandamovies). Each connector records its own `Source` and does a delta since the
    # last successful run. Set to 0 to disable. Default 24h: movie sites grow more
    # slowly than tubes (~5-30 new per day), so sweeping more often is pointless.
    sched_movie_ingest_hours: int = Field(
        default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
    )

    # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
    # 6h cadence (changed from 24h on 2026-05-20): a user reported missing Brazzers
    # Exxtra after 05-15. The root cause was two-fold: (1) freshporno publishes scenes
    # throughout the day, and a 24h cadence only catches those up to 05:30 UTC;
    # (2) a separate meta_content/release_date bug.
    # 6h = 4 runs/day = every freshporno scene ingested within ~6h of publication.
    sched_browse_latest_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
    )
    sched_browse_latest_max_pages: int = Field(
        default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
    )

    # Deep-crawl (Phase 2a) — full catalogues of the browse tubes (porndoe ~62k etc.),
    # not just top-N. Round-robin across tubes, resumable cursor
    # (app/_state/deepcrawl_state.json). 0 = disabled.
    # 60 pages/run × ~31 scenes ≈ 1860 scenes/run (~22 min, hard timeout 1h).
    sched_deep_crawl_hours: int = Field(default=1, validation_alias="GOON_SCHED_DEEP_CRAWL_HOURS")
    deep_crawl_pages_per_run: int = Field(
        default=60, validation_alias="GOON_DEEP_CRAWL_PAGES_PER_RUN"
    )
    deepcrawl_state_path: str = Field(default="", validation_alias="GOON_DEEPCRAWL_STATE_PATH")

    # Bulk-dedup performers safety net — auto-merges duplicates that resolver-time
    # scoring missed. 12h cadence: runs 2x daily (after the morning browse-latest run).
    sched_bulk_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
    )

    # Taxonomy scene_count refresh — recomputes the denormalised scene counters on
    # tags/performers/studios (the hot path /tags|/performers|/studios|/favorites reads
    # a ready column instead of aggregating 6.3M scene_tags per request). 3h cadence —
    # counts may be stale by that much, which does not matter for the "popular" sort
    # and the "(N)" badge. 0 = off.
    sched_taxonomy_counts_hours: int = Field(
        default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
    )

    # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens in
    # the Hetzner Cloud console). Without a token the monitor is disabled (warning in
    # the log). Free traffic per server: CX22=20TB, CPX21=20TB etc. Overage = €1/TB.
    hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
    hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
    # Alert thresholds (% of included_traffic) — Sentry severity levels.
    hetzner_alert_info_pct: int = Field(default=50, validation_alias="HETZNER_ALERT_INFO_PCT")
    hetzner_alert_warning_pct: int = Field(
        default=80, validation_alias="HETZNER_ALERT_WARNING_PCT"
    )
    hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")

    # Bright Data residential proxy — used ONLY for HTML ingest (scraping) of tubes
    # that block the VPS IP with a hard Cloudflare 403 even with browser TLS
    # (superporn). NOT for video streaming (the transfer would go through the paid
    # proxy, and the tokens are IP-bound anyway). Env format: `host:port:user:pass`
    # (Bright Data panel). Empty = no proxy.
    brightdata_proxy_raw: str = Field(default="", validation_alias="BRIGHTDATA_PROXY_URL")

    @property
    def brightdata_proxy_url(self) -> str | None:
        """Convert ``host:port:user:pass`` into ``http://user:pass@host:port``.

        The raw value is expected in the unencoded form shown above; the user
        name and password are percent-encoded here so that reserved characters
        cannot corrupt the resulting URL.

        Returns:
            The proxy URL, or ``None`` when the setting is empty or not in the
            ``host:port:user:pass`` format (including a non-numeric port).
        """
        raw = self.brightdata_proxy_raw.strip()
        # maxsplit=3: everything after the third colon is the password, which may
        # legitimately contain ":" itself.
        parts = raw.split(":", 3)
        if len(parts) != 4 or not all(parts):
            return None
        host, port, user, pwd = parts
        # A non-numeric port means the value is not in the expected format — e.g. a
        # ready-made "http://user:pass@host:port" URL also splits into four pieces.
        if not (port.isascii() and port.isdigit()):
            return None
        return f"http://{quote(user, safe='')}:{quote(pwd, safe='')}@{host}:{port}"

    @property
    def api_keys(self) -> set[str]:
        """Return the non-empty, whitespace-trimmed entries of ``api_keys_raw``."""
        return {k.strip() for k in self.api_keys_raw.split(",") if k.strip()}

    @property
    def auth_enabled(self) -> bool:
        """Return True when at least one API key is configured."""
        return bool(self.api_keys)

    @property
    def allowed_app_sig_hashes(self) -> set[str]:
        """Return the configured signature hashes, lower-cased with ``:`` separators removed."""
        return {
            h.strip().lower().replace(":", "")
            for h in self.allowed_app_sig_hashes_raw.split(",")
            if h.strip()
        }

    @property
    def app_sig_check_enabled(self) -> bool:
        """Return True when at least one allowed signature hash is configured."""
        return bool(self.allowed_app_sig_hashes)
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    ``Settings`` object is constructed on the first call and the same object
    is returned on later calls.
    """
    settings = Settings()
    return settings