"""ThePornDB REST connector. API: https://api.theporndb.net (auth: Bearer token) Lista scen: GET /scenes?per_page=200&page=N&date={YYYY-MM-DD} (delta filter) Format: {data: [...], meta: {current_page, last_page, per_page, total}} Sceny TPDB zwracają już rozwiniętych performerów (`performers[]`), studio (`site`) i tagi (`tags[]`). W związku z tym pojedyncze GET /scenes wystarcza do MVP — nie musimy uderzać oddzielnie po performera. Format performera w scenie: - performer.id — ID przypisania performer↔scene (NIE używać do dedup) - performer.name — imię w tej konkretnej scenie (może być alias, np. „Mia M.") - performer.parent.id — kanoniczne UUID performerki w TPDB → external_id - performer.parent.name / .extra.gender / .extra.birthday — kanoniczne metadane Format studia: scene.site = {id, name, slug, parent: {...}, network: {...}} """ from __future__ import annotations import logging from collections.abc import Iterator from datetime import date, datetime from typing import Any import httpx from tenacity import ( retry, retry_if_exception, retry_if_exception_type, stop_after_attempt, wait_exponential, ) def _is_retryable_http_error(exc: BaseException) -> bool: """Retry transport errors + 5xx + 429; NIE retry 4xx (404/422 = permanent). 401/403 NIE są retryowalne tutaj — TPDB token expiry musiałby być obsłużony jako auth refresh (TODO gdyby zaczęły się pojawiać). Aktualnie expire'a się raz na rok, więc nie warto kombinować. """ if isinstance(exc, httpx.TransportError): return True if isinstance(exc, httpx.HTTPStatusError): sc = exc.response.status_code return sc == 429 or sc >= 500 return False from app.config import get_settings from app.connectors.base import ( BaseConnector, RawPerformer, RawScene, RawStudio, RawTag, ) from app.models.source import SourceKind log = logging.getLogger(__name__) class TPDBConnector(BaseConnector): kind = SourceKind.tpdb name = "tpdb" def __init__( self, *, token: str | None = None, base_url: str | None = None, per_page: int = 100, timeout: float = 30.0, ) -> None: settings = get_settings() self.token = token or settings.tpdb_api_token if not self.token: raise RuntimeError("TPDB_API_TOKEN is not set") self.base_url = (base_url or settings.tpdb_base_url).rstrip("/") self.per_page = per_page self.timeout = timeout def _client(self) -> httpx.Client: return httpx.Client( base_url=self.base_url, headers={ "Authorization": f"Bearer {self.token}", "Accept": "application/json", "User-Agent": "goon/0.1", }, timeout=self.timeout, ) @retry( retry=retry_if_exception(_is_retryable_http_error), wait=wait_exponential(multiplier=1, min=2, max=30), stop=stop_after_attempt(5), reraise=True, ) def _get(self, client: httpx.Client, path: str, params: dict[str, Any]) -> dict[str, Any]: resp = client.get(path, params=params) if resp.status_code == 429: # let tenacity retry — but raise something it knows raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp) resp.raise_for_status() return resp.json() def fetch_scenes( self, *, since: datetime | None = None, limit: int | None = None, ) -> Iterator[RawScene]: params: dict[str, Any] = {"per_page": self.per_page} if since is not None: params["date"] = since.date().isoformat() yield from self._paginate_scenes(params, limit=limit) def fetch_scenes_for_performer( self, performer_external_id: str, *, limit: int | None = None, ) -> Iterator[RawScene]: """Pobiera wszystkie sceny TPDB dla performera o podanym kanonicznym ID. TPDB API: GET /performers//scenes — dedykowany endpoint. (Inne warianty są broken: /scenes?performers[]= zwraca zawsze total=0, /scenes?performer_id= → 422.) 404 = performer usunięty z TPDB (np. b959ccbb 2026-05-16 Sentry GOON-N). Wcześniej leciało raise → exception bąbelek do scheduler.performer_driven → cały run failed. Teraz warn + yield empty — caller widzi 0 scen i kontynuuje z następnym performer. """ try: yield from self._paginate_scenes( {"per_page": self.per_page}, limit=limit, path=f"/performers/{performer_external_id}/scenes", ) except httpx.HTTPStatusError as e: if e.response.status_code == 404: log.warning( "tpdb performer %s removed (404) — skipping", performer_external_id, ) return raise def fetch_scenes_for_site( self, site_external_id: str, *, limit: int | None = None, ) -> Iterator[RawScene]: """Pobiera wszystkie sceny TPDB dla site/studio o podanym ID. TPDB API: GET /sites//scenes — dedykowany endpoint analogiczny do /performers//scenes. Bez paginacji limit zwraca total scenes z meta.total (Brazzers=272, Naughty America=631 w czasie pisania). 404 = site usunięty z TPDB — analogicznie do fetch_scenes_for_performer. """ try: yield from self._paginate_scenes( {"per_page": self.per_page}, limit=limit, path=f"/sites/{site_external_id}/scenes", ) except httpx.HTTPStatusError as e: if e.response.status_code == 404: log.warning( "tpdb site %s removed (404) — skipping", site_external_id, ) return raise def find_performer_id_by_name(self, name: str) -> str | None: """GET /performers?q= → pierwszy match. None gdy brak.""" with self._client() as client: try: payload = self._get(client, "/performers", {"q": name, "per_page": 5}) except httpx.HTTPStatusError as e: log.warning("tpdb /performers q=%s failed: %s", name, e) return None data = payload.get("data") or [] if not data: return None for item in data: # exact (case-insensitive) match preferowany; fallback do pierwszego if (item.get("name") or "").strip().lower() == name.strip().lower(): return str(item.get("id")) if item.get("id") else None first = data[0] return str(first.get("id")) if first.get("id") else None def _paginate_scenes( self, params: dict[str, Any], *, limit: int | None, path: str = "/scenes", ) -> Iterator[RawScene]: emitted = 0 page = 1 with self._client() as client: while True: params["page"] = page payload = self._get(client, path, params) data = payload.get("data") or [] if not data: return for raw in data: scene = _parse_scene(raw) if scene is None: continue yield scene emitted += 1 if limit is not None and emitted >= limit: return meta = payload.get("meta") or {} last_page = meta.get("last_page") or page if page >= last_page: return page += 1 def _parse_date(value: Any) -> date | None: if not value: return None if isinstance(value, date): return value text = str(value).strip() if not text: return None # TPDB dates: "YYYY-MM-DD" lub ISO datetime try: return date.fromisoformat(text[:10]) except ValueError: return None def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None: if not raw: return None parent = raw.get("parent") or {} network = raw.get("network") or {} return RawStudio( external_id=str(raw["id"]) if raw.get("id") is not None else None, name=raw.get("name") or "Unknown", slug=raw.get("short_name") or raw.get("slug"), parent_external_id=str(parent["id"]) if parent.get("id") is not None else None, parent_name=parent.get("name"), network=network.get("name") if isinstance(network, dict) else None, homepage_url=raw.get("url") or raw.get("home"), ) def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None: parent = raw.get("parent") or {} extra = parent.get("extras") or parent.get("extra") or {} canonical_id = parent.get("id") or raw.get("id") canonical_name = parent.get("name") or raw.get("name") if not canonical_name: return None aliases_field = parent.get("aliases") or extra.get("aliases") or [] if isinstance(aliases_field, str): aliases = [a.strip() for a in aliases_field.split(",") if a.strip()] else: aliases = [a for a in aliases_field if isinstance(a, str)] return RawPerformer( external_id=str(canonical_id) if canonical_id is not None else None, name=canonical_name, aliases=aliases, gender=(extra.get("gender") or parent.get("gender") or "").lower() or None, birth_date=_parse_date(extra.get("birthday")), country=extra.get("birthplace") or extra.get("country"), as_alias_in_scene=raw.get("name") if raw.get("name") != canonical_name else None, ) def _parse_tag(raw: dict[str, Any]) -> RawTag | None: name = raw.get("name") if not name: return None return RawTag( external_id=str(raw["id"]) if raw.get("id") is not None else None, name=name, slug=raw.get("slug"), ) def _parse_scene(raw: dict[str, Any]) -> RawScene | None: external_id = raw.get("id") title = raw.get("title") if not external_id or not title: log.warning("tpdb scene without id/title — skipping (keys=%s)", list(raw)[:8]) return None performers: list[RawPerformer] = [] for p in raw.get("performers") or []: parsed = _parse_performer(p) if parsed is not None: performers.append(parsed) tags: list[RawTag] = [] for t in raw.get("tags") or []: parsed_t = _parse_tag(t) if parsed_t is not None: tags.append(parsed_t) return RawScene( external_id=str(external_id), title=title, description=raw.get("description"), release_date=_parse_date(raw.get("date")), duration_sec=int(raw["duration"]) if raw.get("duration") else None, code=raw.get("external_id"), director=raw.get("director"), url=raw.get("url"), studio=_parse_studio(raw.get("site")), performers=performers, tags=tags, fingerprints=[], # TPDB nie publikuje pHashy w głównym endpoint raw=raw, )