goon/app/connectors/tpdb.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

329 lines
11 KiB
Python

"""ThePornDB REST connector.
API: https://api.theporndb.net (auth: Bearer token)
Scene listing: GET /scenes?per_page=200&page=N&date={YYYY-MM-DD} (delta filter)
Response format: {data: [...], meta: {current_page, last_page, per_page, total}}
TPDB scenes already come with expanded performers (`performers[]`), studio (`site`) and tags (`tags[]`).
A single GET /scenes is therefore enough for the MVP — there is no need to query each performer separately.
Performer format within a scene:
- performer.id — ID of the performer↔scene assignment (do NOT use for dedup)
- performer.name — name used in this particular scene (may be an alias, e.g. "Mia M.")
- performer.parent.id — canonical UUID of the performer in TPDB → external_id
- performer.parent.name / .extra.gender / .extra.birthday — canonical metadata
Studio format: scene.site = {id, name, slug, parent: {...}, network: {...}}
"""
from __future__ import annotations
import logging
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
import httpx
from tenacity import (
retry,
retry_if_exception,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
def _is_retryable_http_error(exc: BaseException) -> bool:
    """Decide whether *exc* justifies another attempt.

    Transport-level failures, HTTP 429 and HTTP 5xx are retried. Every other
    4xx response (e.g. 404/422) is treated as permanent.

    401/403 are deliberately NOT retried here: an expired TPDB token would
    need an auth refresh instead (TODO if those ever start showing up). The
    token currently expires about once a year, so it is not worth the effort.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        status = exc.response.status_code
        return status >= 500 or status == 429
    # Anything that is not a status error is retryable only if the transport failed.
    return isinstance(exc, httpx.TransportError)
from app.config import get_settings
from app.connectors.base import (
BaseConnector,
RawPerformer,
RawScene,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
log = logging.getLogger(__name__)
class TPDBConnector(BaseConnector):
    """Synchronous connector for the ThePornDB REST API.

    Listing endpoints are walked page by page and every item is converted to
    a ``RawScene`` by ``_parse_scene``. Each request carries the bearer token
    given to the constructor or read from the application settings.
    """

    kind = SourceKind.tpdb
    name = "tpdb"

    def __init__(
        self,
        *,
        token: str | None = None,
        base_url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Store connection parameters, falling back to application settings.

        Args:
            token: API bearer token; defaults to ``settings.tpdb_api_token``.
            base_url: API root; defaults to ``settings.tpdb_base_url``. A
                trailing slash is stripped.
            per_page: Page size sent as ``per_page`` with scene listings.
            timeout: Timeout handed to ``httpx.Client``.

        Raises:
            RuntimeError: If no token is passed and none is configured.
        """
        settings = get_settings()
        self.token = token or settings.tpdb_api_token
        if not self.token:
            raise RuntimeError("TPDB_API_TOKEN is not set")
        self.base_url = (base_url or settings.tpdb_base_url).rstrip("/")
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a new ``httpx.Client`` preconfigured with auth headers.

        The caller owns the client and is expected to close it (the methods
        of this class use it as a context manager).
        """
        return httpx.Client(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.token}",
                "Accept": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        retry=retry_if_exception(_is_retryable_http_error),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _get(self, client: httpx.Client, path: str, params: dict[str, Any]) -> dict[str, Any]:
        """GET *path* and return the decoded JSON body.

        Errors accepted by ``_is_retryable_http_error`` are retried with
        exponential backoff, for at most 5 attempts; the last error is
        re-raised unchanged.

        Raises:
            httpx.HTTPStatusError: On a non-success response.
        """
        resp = client.get(path, params=params)
        if resp.status_code == 429:
            # let tenacity retry — but raise something it knows
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        return resp.json()

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Iterate over scenes from ``GET /scenes``.

        Args:
            since: When given, its calendar date is sent as the ``date``
                query parameter.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        params: dict[str, Any] = {"per_page": self.per_page}
        if since is not None:
            params["date"] = since.date().isoformat()
        yield from self._paginate_scenes(params, limit=limit)

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Fetch all TPDB scenes for the performer with the given canonical ID.

        TPDB API: GET /performers/<id>/scenes — a dedicated endpoint.
        (The other variants are broken: /scenes?performers[]=<uuid> always
        returns total=0, /scenes?performer_id=<uuid> → 422.)

        404 = performer removed from TPDB (e.g. b959ccbb 2026-05-16 Sentry
        GOON-N). This used to raise, the exception bubbled up to
        scheduler.performer_driven and the whole run failed. Now a warning is
        logged and the iterator simply ends — the caller sees 0 scenes and
        moves on to the next performer. Any other HTTP error is re-raised.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/performers/{performer_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb performer %s removed (404) — skipping",
                    performer_external_id,
                )
                return
            raise

    def fetch_scenes_for_site(
        self,
        site_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Fetch all TPDB scenes for the site/studio with the given ID.

        TPDB API: GET /sites/<id>/scenes — a dedicated endpoint analogous to
        /performers/<id>/scenes. With no ``limit`` every scene counted in
        meta.total is returned (Brazzers=272, Naughty America=631 at the time
        of writing).

        404 = site removed from TPDB — handled the same way as in
        ``fetch_scenes_for_performer``: warn and end the iteration. Any other
        HTTP error is re-raised.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/sites/{site_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb site %s removed (404) — skipping",
                    site_external_id,
                )
                return
            raise

    def find_performer_id_by_name(self, name: str) -> str | None:
        """GET /performers?q=<name> → ID of the best match, or ``None``.

        An exact (case-insensitive, whitespace-trimmed) name match wins;
        otherwise the first result is used. HTTP status errors are logged
        and turned into ``None``.
        """
        with self._client() as client:
            try:
                payload = self._get(client, "/performers", {"q": name, "per_page": 5})
            except httpx.HTTPStatusError as e:
                log.warning("tpdb /performers q=%s failed: %s", name, e)
                return None
        data = payload.get("data") or []
        if not data:
            return None
        # Normalize the query once instead of once per candidate.
        wanted = name.strip().lower()
        for item in data:
            # exact (case-insensitive) match preferred; fall back to the first hit
            if (item.get("name") or "").strip().lower() == wanted:
                return str(item.get("id")) if item.get("id") else None
        first = data[0]
        return str(first.get("id")) if first.get("id") else None

    def _paginate_scenes(
        self,
        params: dict[str, Any],
        *,
        limit: int | None,
        path: str = "/scenes",
    ) -> Iterator[RawScene]:
        """Walk a paginated scene listing and yield parsed scenes.

        Pagination stops on an empty ``data`` page, when ``meta.last_page``
        is reached, or once *limit* scenes have been yielded. Items that
        ``_parse_scene`` rejects are skipped and do not count toward *limit*.

        Args:
            params: Base query parameters; not modified.
            limit: Maximum number of scenes to yield; ``None`` means no cap,
                zero or a negative value yields nothing.
            path: Endpoint path relative to the base URL.
        """
        # The limit check below runs only after a yield, so without this guard
        # limit=0 would still hit the API and emit one scene.
        if limit is not None and limit <= 0:
            return
        # Work on a copy so the caller's dict does not gain a "page" key.
        query = dict(params)
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                query["page"] = page
                payload = self._get(client, path, query)
                data = payload.get("data") or []
                if not data:
                    return
                for raw in data:
                    scene = _parse_scene(raw)
                    if scene is None:
                        continue
                    yield scene
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return
                meta = payload.get("meta") or {}
                # A missing last_page is treated as "this was the last page".
                last_page = meta.get("last_page") or page
                if page >= last_page:
                    return
                page += 1
def _parse_date(value: Any) -> date | None:
    """Coerce a TPDB date value to a ``datetime.date``.

    Args:
        value: ``None``/empty, a ``date``, a ``datetime``, or anything whose
            string form starts with ``YYYY-MM-DD`` (a plain date or an ISO
            datetime).

    Returns:
        The parsed date, or ``None`` when the value is empty or unparseable.
    """
    if not value:
        return None
    # datetime subclasses date, so it must be checked first; otherwise a
    # datetime would leak through despite the declared return type.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    text = str(value).strip()
    # TPDB dates: "YYYY-MM-DD" or an ISO datetime — only the date part matters.
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        # Covers both garbage and whitespace-only input.
        return None
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
    """Convert a TPDB ``site`` object into a ``RawStudio``.

    Returns ``None`` for a missing or empty payload. A studio without a name
    is labelled ``"Unknown"``; IDs are stringified when present.
    """
    if not raw:
        return None

    parent_site = raw.get("parent") or {}
    network_site = raw.get("network") or {}

    own_id = raw.get("id")
    parent_id = parent_site.get("id")
    # The network name is only read when the network entry is a dict.
    network_name = network_site.get("name") if isinstance(network_site, dict) else None

    return RawStudio(
        external_id=None if own_id is None else str(raw["id"]),
        name=raw.get("name") or "Unknown",
        slug=raw.get("short_name") or raw.get("slug"),
        parent_external_id=None if parent_id is None else str(parent_site["id"]),
        parent_name=parent_site.get("name"),
        network=network_name,
        homepage_url=raw.get("url") or raw.get("home"),
    )
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
    """Convert a scene-level performer entry into a ``RawPerformer``.

    Values from the nested ``parent`` record take precedence over the
    scene-level ones for ID and name. Returns ``None`` when no name can be
    determined. ``as_alias_in_scene`` is set only when the scene-level name
    differs from the chosen name.
    """
    canonical = raw.get("parent") or {}
    details = canonical.get("extras") or canonical.get("extra") or {}

    performer_name = canonical.get("name") or raw.get("name")
    if not performer_name:
        return None
    performer_id = canonical.get("id") or raw.get("id")

    # Aliases arrive either as a comma-separated string or as a list.
    alias_source = canonical.get("aliases") or details.get("aliases") or []
    if isinstance(alias_source, str):
        alias_list = [piece.strip() for piece in alias_source.split(",") if piece.strip()]
    else:
        alias_list = [entry for entry in alias_source if isinstance(entry, str)]

    gender_value = (details.get("gender") or canonical.get("gender") or "").lower() or None
    scene_name = raw.get("name")

    return RawPerformer(
        external_id=None if performer_id is None else str(performer_id),
        name=performer_name,
        aliases=alias_list,
        gender=gender_value,
        birth_date=_parse_date(details.get("birthday")),
        country=details.get("birthplace") or details.get("country"),
        as_alias_in_scene=None if scene_name == performer_name else scene_name,
    )
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
    """Convert a TPDB tag object into a ``RawTag``; ``None`` if it has no name."""
    label = raw.get("name")
    if label:
        tag_id = raw.get("id")
        return RawTag(
            external_id=None if tag_id is None else str(raw["id"]),
            name=label,
            slug=raw.get("slug"),
        )
    return None
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert one TPDB scene payload into a ``RawScene``.

    A scene lacking an ID or a title is logged and skipped (``None``).
    Performers and tags that their parsers reject are dropped. The original
    payload is kept on the result as ``raw``.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not (scene_id and scene_title):
        log.warning("tpdb scene without id/title — skipping (keys=%s)", list(raw)[:8])
        return None

    performer_list: list[RawPerformer] = [
        performer
        for performer in map(_parse_performer, raw.get("performers") or [])
        if performer is not None
    ]
    tag_list: list[RawTag] = [
        tag
        for tag in map(_parse_tag, raw.get("tags") or [])
        if tag is not None
    ]

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("description"),
        release_date=_parse_date(raw.get("date")),
        duration_sec=int(raw["duration"]) if raw.get("duration") else None,
        code=raw.get("external_id"),
        director=raw.get("director"),
        url=raw.get("url"),
        studio=_parse_studio(raw.get("site")),
        performers=performer_list,
        tags=tag_list,
        fingerprints=[],  # TPDB does not publish pHashes in the main endpoint
        raw=raw,
    )