goon/app/config.py
jtrzupek 7e46e5ac48 feat(scheduler): deep-crawl full tube catalogs (Phase 2a — ingest-all)
We ingested only ~3% of each browse tube's catalog (porndoe >62k scenes; we had 1959)
because tubes were hit only by performer-search + top-N browse. Pilot (porndoe pages
64-110): 1119 new scenes, 100% playable + 100% tagged, 0% canonical overlap (purely
additive — content not in TPDB/StashDB).

- app/scheduler/deep_crawl.py: round-robin over ALL_BROWSE_SCRAPERS, per-tube page cursor
  in app/_state/deepcrawl_state.json (no DB migration), deep-paginate from the cursor,
  idempotent (resolver skips known by raw_hash), mark 'exhausted' at catalog end then
  reset cursors for an incremental re-sweep.
- _job_deep_crawl: hourly, 60 pages/run (~1860 scenes, ~22 min), wrapped in the 1h
  hard-timeout; registered in build_scheduler (jobs=10).
- config: sched_deep_crawl_hours=1, deep_crawl_pages_per_run=60, deepcrawl_state_path.
- scripts/pilot_porndoe_deepcrawl.py: one-off pilot used to validate the approach.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 09:26:44 +02:00

143 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration read from environment variables / ``.env``.

    Unknown keys are ignored and variable names are matched case-insensitively
    (see ``model_config``). Fields declared with a ``validation_alias`` are
    populated from that environment variable name.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)

    database_url: str = Field(
        default="postgresql+psycopg://goon:goon@localhost:5432/goon",
        validation_alias="DATABASE_URL",
    )
    tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
    tpdb_base_url: str = Field(
        default="https://api.theporndb.net", validation_alias="TPDB_BASE_URL"
    )
    stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
    stashdb_graphql_url: str = Field(
        default="https://stashdb.org/graphql", validation_alias="STASHDB_GRAPHQL_URL"
    )
    log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")

    # Sentry observability — an empty DSN makes init a no-op (dev/local). The cloud
    # free tier (5k errors/month) is enough for a single-user app.
    sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
    sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
    sentry_traces_sample_rate: float = Field(
        default=0.1, validation_alias="SENTRY_TRACES_SAMPLE_RATE"
    )

    api_keys_raw: str = Field(default="", validation_alias="API_KEYS")
    """Comma-separated list of API keys. Empty = auth disabled (dev/local only)."""

    allowed_app_sig_hashes_raw: str = Field(default="", validation_alias="ALLOWED_APP_SIG_HASH")
    """Whitelist of SHA256 (hex) APK signature hashes accepted by the backend. Every mobile
    request sends `X-App-Signature` with the signing-cert hash
    (PackageManager.GET_SIGNING_CERTIFICATES). Empty = check disabled (dev/initial rollout).
    List = comma-separated lowercase hex. Re-packaging the APK with a different keystore
    changes the hash → 403."""

    auto_merge_threshold: float = 0.92
    review_threshold: float = 0.75
    fingerprint_hamming_max: int = 5
    title_token_set_min: int = 88
    date_window_days: int = 7

    # Skip ingesting clip-store scenes (ManyVids/IWantClips/Clips4Sale/...) from canonical
    # sources — they are permanent orphans (free tubes do not host them), ~56% of the
    # TPDB/StashDB ingest. False = ingest as before. Tubes with a clip-store studio are
    # NOT skipped (they have playback).
    skip_clip_store: bool = Field(default=True, validation_alias="GOON_SKIP_CLIP_STORE")

    # APScheduler (M5). Any 0/None = job disabled.
    sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
    sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
    sched_performer_driven_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS"
    )
    sched_performer_driven_top_n: int = Field(
        default=20, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N"
    )

    # Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
    # = max(15, real_tick_duration). Real tick is ~50-80s at full coverage. Set to 0 to disable.
    sched_performer_continuous_seconds: int = Field(
        default=15, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS"
    )
    sched_performer_continuous_refresh_days: int = Field(
        default=30, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS"
    )

    # Movie ingest — paradisehill (primary) + dooplay mirrors (mangoporn/streamporn/
    # pandamovies). Each connector records its own `Source` and does a delta from the
    # last successful run. Set to 0 to disable. Default 24h: movie sites grow more slowly
    # than tubes (~5-30 new per day), so sweeping more often is pointless.
    sched_movie_ingest_hours: int = Field(
        default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
    )

    # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
    # 6h cadence (changed from 24h on 2026-05-20): a user reported missing Brazzers Exxtra
    # after 05-15. Root cause was two-fold: (1) freshporno publishes scenes throughout the
    # day, and a 24h cadence only catches those up to 05:30 UTC; (2) a separate
    # meta_content/release_date bug.
    # 6h = 4 runs/day = every freshporno scene ingested within ~6h of publication.
    sched_browse_latest_hours: int = Field(
        default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
    )
    sched_browse_latest_max_pages: int = Field(
        default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
    )

    # Deep-crawl (Phase 2a) — full browse-tube catalogs (porndoe ~62k etc.), not just
    # top-N. Round-robin over tubes, resumable cursor (app/_state/deepcrawl_state.json).
    # 0 = disabled. 60 pages/run × ~31 scenes ≈ 1860 scenes/run (~22 min, 1h hard timeout).
    sched_deep_crawl_hours: int = Field(
        default=1, validation_alias="GOON_SCHED_DEEP_CRAWL_HOURS"
    )
    deep_crawl_pages_per_run: int = Field(
        default=60, validation_alias="GOON_DEEP_CRAWL_PAGES_PER_RUN"
    )
    deepcrawl_state_path: str = Field(default="", validation_alias="GOON_DEEPCRAWL_STATE_PATH")

    # Bulk-dedup performers safety net — auto-merges duplicates that resolver-time
    # scoring missed. 12h cadence: runs 2x a day (after the morning browse-latest run).
    sched_bulk_dedup_hours: int = Field(
        default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
    )

    # Taxonomy scene_count refresh — recomputes the denormalized scene counters on
    # tags/performers/studios (the hot path /tags|/performers|/studios|/favorites reads a
    # ready column instead of aggregating 6.3M scene_tags per request). 3h cadence —
    # counts may be stale by up to that long, which does not matter for the "popular"
    # sort + "(N)" badge. 0 = off.
    sched_taxonomy_counts_hours: int = Field(
        default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
    )

    # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
    # in the Hetzner Cloud panel). Without a token the monitor is disabled (warning in
    # the log). Free traffic per server: CX22=20TB, CPX21=20TB etc. Overage = €1/TB.
    hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
    hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
    # Alert thresholds (% of included_traffic) — Sentry severity levels.
    hetzner_alert_info_pct: int = Field(default=50, validation_alias="HETZNER_ALERT_INFO_PCT")
    hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
    hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")

    @property
    def api_keys(self) -> set[str]:
        """Return the distinct, whitespace-trimmed, non-empty keys from ``api_keys_raw``."""
        return {k.strip() for k in self.api_keys_raw.split(",") if k.strip()}

    @property
    def auth_enabled(self) -> bool:
        """Return True when at least one API key is configured."""
        return bool(self.api_keys)

    @property
    def allowed_app_sig_hashes(self) -> set[str]:
        """Return the normalized signature hashes from ``allowed_app_sig_hashes_raw``.

        Each comma-separated entry has ``:`` separators removed, surrounding
        whitespace trimmed and is lowercased. Entries that are empty *after*
        normalization are dropped.
        """
        normalized = (
            entry.replace(":", "").strip().lower()
            for entry in self.allowed_app_sig_hashes_raw.split(",")
        )
        # Filter after normalizing: an entry such as ":" must not put "" into the
        # whitelist, which would enable the check with an empty string as an
        # accepted signature.
        return {sig for sig in normalized if sig}

    @property
    def app_sig_check_enabled(self) -> bool:
        """Return True when at least one allowed signature hash is configured."""
        return bool(self.allowed_app_sig_hashes)
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The function is wrapped in ``lru_cache`` and takes no arguments, so the
    first call builds ``Settings()`` and every later call returns that same
    cached object.
    """
    settings = Settings()
    return settings