Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
329 lines
11 KiB
Python
329 lines
11 KiB
Python
"""ThePornDB REST connector.
|
|
|
|
API: https://api.theporndb.net (auth: Bearer token)
|
|
Lista scen: GET /scenes?per_page=200&page=N&date={YYYY-MM-DD} (delta filter)
|
|
Format: {data: [...], meta: {current_page, last_page, per_page, total}}
|
|
|
|
Sceny TPDB zwracają już rozwiniętych performerów (`performers[]`), studio (`site`) i tagi (`tags[]`).
|
|
W związku z tym pojedyncze GET /scenes wystarcza do MVP — nie musimy uderzać oddzielnie po performera.
|
|
|
|
Format performera w scenie:
|
|
- performer.id — ID przypisania performer↔scene (NIE używać do dedup)
|
|
- performer.name — imię w tej konkretnej scenie (może być alias, np. „Mia M.")
|
|
- performer.parent.id — kanoniczne UUID performerki w TPDB → external_id
|
|
- performer.parent.name / .extra.gender / .extra.birthday — kanoniczne metadane
|
|
|
|
Format studia: scene.site = {id, name, slug, parent: {...}, network: {...}}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from collections.abc import Iterator
|
|
from datetime import date, datetime
|
|
from typing import Any
|
|
|
|
import httpx
|
|
from tenacity import (
|
|
retry,
|
|
retry_if_exception,
|
|
retry_if_exception_type,
|
|
stop_after_attempt,
|
|
wait_exponential,
|
|
)
|
|
|
|
|
|
def _is_retryable_http_error(exc: BaseException) -> bool:
    """Tell tenacity whether a failed request is worth another attempt.

    Retryable: transport-level errors, HTTP 5xx and HTTP 429.
    Not retryable: every other 4xx (404/422 are permanent).

    401/403 are deliberately NOT retried here — a TPDB token expiry would
    have to be handled as an auth refresh (TODO if these start showing up).
    Currently the token expires about once a year, so it is not worth the
    extra machinery.

    Args:
        exc: The exception raised by the wrapped call.

    Returns:
        True when the call should be retried, False otherwise.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        status = exc.response.status_code
        return status >= 500 or status == 429
    # Anything that is not an HTTP status failure is retried only when it is
    # a transport-level problem.
    return isinstance(exc, httpx.TransportError)
|
|
|
|
from app.config import get_settings
|
|
from app.connectors.base import (
|
|
BaseConnector,
|
|
RawPerformer,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.models.source import SourceKind
|
|
|
|
# Module-level logger, named after this module; used by the connector and the parse helpers below.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class TPDBConnector(BaseConnector):
    """Connector that reads scene metadata from the ThePornDB REST API.

    All public ``fetch_*`` methods are generators: they open one HTTP client,
    walk the paginated endpoint page by page and yield parsed ``RawScene``
    objects as they go.
    """

    kind = SourceKind.tpdb
    name = "tpdb"

    def __init__(
        self,
        *,
        token: str | None = None,
        base_url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Configure the connector.

        Args:
            token: Bearer token; falls back to ``settings.tpdb_api_token``.
            base_url: API root; falls back to ``settings.tpdb_base_url``.
                Trailing slashes are stripped.
            per_page: Page size sent as the ``per_page`` query parameter.
            timeout: Timeout handed to the HTTP client.

        Raises:
            RuntimeError: If no token is supplied and none is configured.
        """
        settings = get_settings()
        self.token = token or settings.tpdb_api_token
        if not self.token:
            raise RuntimeError("TPDB_API_TOKEN is not set")
        self.base_url = (base_url or settings.tpdb_base_url).rstrip("/")
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a fresh HTTP client carrying the auth and JSON headers."""
        return httpx.Client(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.token}",
                "Accept": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        retry=retry_if_exception(_is_retryable_http_error),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _get(self, client: httpx.Client, path: str, params: dict[str, Any]) -> dict[str, Any]:
        """GET ``path`` and return the decoded JSON body.

        The call is attempted up to 5 times with exponential back-off whenever
        ``_is_retryable_http_error`` accepts the raised exception; the last
        exception is re-raised unchanged (``reraise=True``).

        Raises:
            httpx.HTTPStatusError: On 429 or any status rejected by
                ``raise_for_status()``.
        """
        resp = client.get(path, params=params)
        if resp.status_code == 429:
            # let tenacity retry — but raise something it knows
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        return resp.json()

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes from ``GET /scenes``.

        Args:
            since: When given, its calendar date (ISO format) is sent as the
                ``date`` query parameter.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        params: dict[str, Any] = {"per_page": self.per_page}
        if since is not None:
            params["date"] = since.date().isoformat()

        yield from self._paginate_scenes(params, limit=limit)

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Fetch all TPDB scenes for the performer with the given canonical ID.

        TPDB API: GET /performers/<id>/scenes — dedicated endpoint.
        (Other variants are broken: /scenes?performers[]=<uuid> always returns
        total=0, /scenes?performer_id=<uuid> → 422.)

        404 = performer removed from TPDB (e.g. b959ccbb 2026-05-16 Sentry GOON-N).
        Previously this raised → the exception bubbled up to
        scheduler.performer_driven → the whole run failed. Now it warns and
        yields nothing — the caller sees 0 scenes and moves on to the next
        performer.

        Raises:
            httpx.HTTPStatusError: For any HTTP error other than 404.
        """
        yield from self._paginate_skipping_404(
            f"/performers/{performer_external_id}/scenes",
            limit=limit,
            not_found_msg="tpdb performer %s removed (404) — skipping",
            external_id=performer_external_id,
        )

    def fetch_scenes_for_site(
        self,
        site_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Fetch all TPDB scenes for the site/studio with the given ID.

        TPDB API: GET /sites/<id>/scenes — dedicated endpoint analogous to
        /performers/<id>/scenes. With no ``limit``, pagination walks the full
        set counted by meta.total (Brazzers=272, Naughty America=631 at the
        time of writing).

        404 = site removed from TPDB — handled the same way as in
        fetch_scenes_for_performer: warn and yield nothing.

        Raises:
            httpx.HTTPStatusError: For any HTTP error other than 404.
        """
        yield from self._paginate_skipping_404(
            f"/sites/{site_external_id}/scenes",
            limit=limit,
            not_found_msg="tpdb site %s removed (404) — skipping",
            external_id=site_external_id,
        )

    def _paginate_skipping_404(
        self,
        path: str,
        *,
        limit: int | None,
        not_found_msg: str,
        external_id: str,
    ) -> Iterator[RawScene]:
        """Paginate ``path``; on HTTP 404 log ``not_found_msg`` and stop quietly.

        ``not_found_msg`` is a logging format string with a single ``%s`` slot
        that receives ``external_id``. Every other HTTP status error is
        re-raised to the caller.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=path,
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(not_found_msg, external_id)
                return
            raise

    def find_performer_id_by_name(self, name: str) -> str | None:
        """GET /performers?q=<name> → ID of the best match, or None.

        An exact (case-insensitive, whitespace-trimmed) name match is
        preferred; otherwise the first result is used. Returns None when the
        request fails with an HTTP status error, when there are no results, or
        when the chosen result has no usable ``id``.
        """
        with self._client() as client:
            try:
                payload = self._get(client, "/performers", {"q": name, "per_page": 5})
            except httpx.HTTPStatusError as e:
                log.warning("tpdb /performers q=%s failed: %s", name, e)
                return None

        data = payload.get("data") or []
        if not data:
            return None

        # Normalise the search term once instead of on every iteration.
        wanted = name.strip().lower()
        for item in data:
            # exact (case-insensitive) match preferred; fall back to the first hit
            if (item.get("name") or "").strip().lower() == wanted:
                return str(item.get("id")) if item.get("id") else None

        first = data[0]
        return str(first.get("id")) if first.get("id") else None

    def _paginate_scenes(
        self,
        params: dict[str, Any],
        *,
        limit: int | None,
        path: str = "/scenes",
    ) -> Iterator[RawScene]:
        """Walk a paginated scene endpoint and yield parsed scenes.

        Args:
            params: Base query parameters. The dict is not modified; the
                ``page`` number is added to a per-request copy.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
                A limit of zero or less yields nothing and sends no request.
            path: Endpoint path relative to the base URL.

        Pagination stops on an empty ``data`` list, once ``meta.last_page`` is
        reached (a missing ``last_page`` is treated as "this is the last
        page"), or as soon as ``limit`` scenes have been yielded. Records that
        ``_parse_scene`` rejects are skipped and do not count toward ``limit``.
        """
        if limit is not None and limit <= 0:
            # Without this guard the limit check below, which only runs after
            # a yield, would let one scene through.
            return

        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                # Copy so the caller's dict never picks up a "page" key.
                payload = self._get(client, path, {**params, "page": page})
                data = payload.get("data") or []
                if not data:
                    return

                for raw in data:
                    scene = _parse_scene(raw)
                    if scene is None:
                        continue
                    yield scene
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return

                meta = payload.get("meta") or {}
                last_page = meta.get("last_page") or page
                if page >= last_page:
                    return
                page += 1
|
|
|
|
|
|
def _parse_date(value: Any) -> date | None:
    """Coerce a TPDB date value to a ``date``, or None when unusable.

    Args:
        value: A ``date``/``datetime`` instance, or anything whose string form
            starts with ``YYYY-MM-DD`` (plain date or ISO datetime).

    Returns:
        The calendar date, or None for falsy, blank or unparseable input.
    """
    if not value:
        return None
    # datetime is a subclass of date, so it must be checked first; reduce it
    # to a plain date so this path agrees with the string path below, which
    # also drops the time part.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    text = str(value).strip()
    if not text:
        return None
    # TPDB dates: "YYYY-MM-DD" or an ISO datetime — only the first 10 chars matter
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        return None
|
|
|
|
|
|
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
    """Turn a scene's ``site`` object into a ``RawStudio``.

    Args:
        raw: The ``site`` dict from a TPDB scene, or None/empty.

    Returns:
        A ``RawStudio``, or None when ``raw`` is falsy. A missing name
        becomes ``"Unknown"``; ``short_name`` is preferred over ``slug`` and
        ``url`` over ``home``.
    """
    if not raw:
        return None

    studio_id = raw.get("id")
    parent_info = raw.get("parent") or {}
    parent_id = parent_info.get("id")
    network_info = raw.get("network") or {}
    # Only a dict-shaped network carries a name we can read.
    network_name = network_info.get("name") if isinstance(network_info, dict) else None

    return RawStudio(
        external_id=None if studio_id is None else str(studio_id),
        name=raw.get("name") or "Unknown",
        slug=raw.get("short_name") or raw.get("slug"),
        parent_external_id=None if parent_id is None else str(parent_id),
        parent_name=parent_info.get("name"),
        network=network_name,
        homepage_url=raw.get("url") or raw.get("home"),
    )
|
|
|
|
|
|
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
    """Turn one entry of a scene's ``performers[]`` into a ``RawPerformer``.

    Canonical data is read from ``raw["parent"]`` first, with the scene-level
    entry as a fallback. Extended attributes come from ``parent["extras"]``
    or, failing that, ``parent["extra"]``.

    Args:
        raw: One performer dict from a TPDB scene.

    Returns:
        A ``RawPerformer``, or None when no name can be determined.
    """
    canonical = raw.get("parent") or {}
    details = canonical.get("extras") or canonical.get("extra") or {}

    # NOTE(review): the fallback to raw["id"] uses the per-scene assignment
    # ID when no parent is present — confirm this is acceptable downstream.
    performer_id = canonical.get("id") or raw.get("id")
    performer_name = canonical.get("name") or raw.get("name")
    if not performer_name:
        return None

    # Aliases arrive either as a comma-separated string or as a list.
    alias_source = canonical.get("aliases") or details.get("aliases") or []
    if isinstance(alias_source, str):
        alias_list = [part.strip() for part in alias_source.split(",") if part.strip()]
    else:
        alias_list = [entry for entry in alias_source if isinstance(entry, str)]

    gender_text = (details.get("gender") or canonical.get("gender") or "").lower()
    scene_name = raw.get("name")

    return RawPerformer(
        external_id=None if performer_id is None else str(performer_id),
        name=performer_name,
        aliases=alias_list,
        gender=gender_text or None,
        birth_date=_parse_date(details.get("birthday")),
        country=details.get("birthplace") or details.get("country"),
        # Record the scene-level name only when it differs from the canonical one.
        as_alias_in_scene=None if scene_name == performer_name else scene_name,
    )
|
|
|
|
|
|
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
    """Turn one entry of a scene's ``tags[]`` into a ``RawTag``.

    Args:
        raw: One tag dict from a TPDB scene.

    Returns:
        A ``RawTag``, or None when the tag has no name.
    """
    tag_name = raw.get("name")
    if tag_name:
        tag_id = raw.get("id")
        return RawTag(
            external_id=None if tag_id is None else str(tag_id),
            name=tag_name,
            slug=raw.get("slug"),
        )
    return None
|
|
|
|
|
|
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Turn one TPDB scene record into a ``RawScene``.

    Args:
        raw: One element of the ``data`` list returned by a scenes endpoint.

    Returns:
        A ``RawScene``, or None (with a warning logged) when the record has
        no ``id`` or no ``title``. Performers and tags that fail to parse are
        dropped. A duration that cannot be converted to an integer is logged
        and stored as None instead of aborting the whole fetch.
    """
    external_id = raw.get("id")
    title = raw.get("title")
    if not external_id or not title:
        log.warning("tpdb scene without id/title — skipping (keys=%s)", list(raw)[:8])
        return None

    performers: list[RawPerformer] = [
        performer
        for performer in map(_parse_performer, raw.get("performers") or [])
        if performer is not None
    ]
    tags: list[RawTag] = [
        tag for tag in map(_parse_tag, raw.get("tags") or []) if tag is not None
    ]

    # One malformed duration must not take down the surrounding generator,
    # so conversion failures degrade to "unknown duration".
    duration_raw = raw.get("duration")
    duration_sec: int | None = None
    if duration_raw:
        try:
            duration_sec = int(duration_raw)
        except (TypeError, ValueError, OverflowError):
            log.warning(
                "tpdb scene %s has unparseable duration %r — ignoring",
                external_id,
                duration_raw,
            )

    return RawScene(
        external_id=str(external_id),
        title=title,
        description=raw.get("description"),
        release_date=_parse_date(raw.get("date")),
        duration_sec=duration_sec,
        code=raw.get("external_id"),
        director=raw.get("director"),
        url=raw.get("url"),
        studio=_parse_studio(raw.get("site")),
        performers=performers,
        tags=tags,
        fingerprints=[],  # TPDB does not publish pHashes in the main endpoint
        raw=raw,
    )
|