Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
405 lines
12 KiB
Python
405 lines
12 KiB
Python
"""StashDB GraphQL connector.
|
|
|
|
Endpoint: https://stashdb.org/graphql (auth: header `ApiKey: <key>`)
|
|
|
|
We query with `queryScenes(input: {sort, direction, page, per_page})`. StashDB does not expose a
typical date-since filter in SceneQueryInput, so the delta is computed client-side: we sort by
UPDATED_AT DESC and stop as soon as `updated < since`.
|
|
|
|
Key schema fields (per https://github.com/stashapp/stash-box/blob/master/graphql/schema/schema.graphql):
|
|
Scene { id title details date duration director code urls{url site{name}}
|
|
studio{id name parent{id name}}
|
|
performers{ as performer{ id name aliases gender birthdate{date} country } }
|
|
tags{ id name }
|
|
fingerprints{ hash algorithm duration } }
|
|
|
|
Cross-reference to TPDB: `urls[].site.name` usually contains "ThePornDB" plus a URL with a UUID
(format: https://theporndb.net/scenes/<uuid>). We extract that UUID as the tpdb cross-ref;
ingest_orchestrator can later use it for path 2 (cross-source UUID).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from collections.abc import Iterator
|
|
from datetime import UTC, date, datetime
|
|
from typing import Any
|
|
|
|
import httpx
|
|
from tenacity import (
|
|
retry,
|
|
retry_if_exception_type,
|
|
stop_after_attempt,
|
|
wait_exponential,
|
|
)
|
|
|
|
from app.config import get_settings
|
|
from app.connectors.base import (
|
|
BaseConnector,
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.models.source import SourceKind
|
|
|
|
# Module-level logger named after this module.
log = logging.getLogger(__name__)


# GraphQL document used for every scene listing. The single `$input` variable
# is a SceneQueryInput carrying paging, sorting and optional filters.
SCENES_QUERY = """
query QScenes($input: SceneQueryInput!) {
  queryScenes(input: $input) {
    count
    scenes {
      id
      title
      details
      release_date
      date
      duration
      director
      code
      updated
      urls { url site { name } }
      studio {
        id name
        parent { id name }
      }
      performers {
        as
        performer {
          id
          name
          aliases
          gender
          birthdate { date }
          country
        }
      }
      tags { id name }
      fingerprints { hash algorithm duration }
    }
  }
}
"""

# UUID v4-ish pattern (relaxed): any 8-4-4-4-12 hex group, case-insensitive.
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)
|
|
|
|
|
|
class _RetryableStatusError(httpx.HTTPStatusError):
    """HTTP response status worth retrying: 429 (rate limited) or any 5xx."""


class StashDBConnector(BaseConnector):
    """Connector that pulls scene metadata from the StashDB GraphQL API.

    Scenes are requested page by page through ``queryScenes`` and each raw
    GraphQL object is converted with ``_parse_scene``.
    """

    kind = SourceKind.stashdb
    name = "stashdb"

    def __init__(
        self,
        *,
        api_key: str | None = None,
        url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Configure the connector, falling back to application settings.

        Args:
            api_key: StashDB API key; defaults to ``settings.stashdb_api_key``.
            url: GraphQL endpoint; defaults to ``settings.stashdb_graphql_url``.
            per_page: Page size requested from the API.
            timeout: Timeout handed to ``httpx.Client``.

        Raises:
            RuntimeError: If no API key is passed and none is configured.
        """
        settings = get_settings()
        self.api_key = api_key or settings.stashdb_api_key
        if not self.api_key:
            raise RuntimeError("STASHDB_API_KEY is not set")
        self.url = url or settings.stashdb_graphql_url
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a new ``httpx.Client`` carrying the API key and JSON headers.

        The caller owns the client and is expected to close it (the methods
        of this class use it as a context manager).
        """
        return httpx.Client(
            headers={
                "ApiKey": self.api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        # Only transient failures are retried; permanent 4xx responses
        # (bad key, malformed query) fail immediately instead of backing off.
        retry=retry_if_exception_type((httpx.TransportError, _RetryableStatusError)),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _post(self, client: httpx.Client, payload: dict[str, Any]) -> dict[str, Any]:
        """POST a GraphQL payload and return the ``data`` object of the response.

        Args:
            client: Open HTTP client to send the request with.
            payload: JSON body containing ``query`` and ``variables``.

        Returns:
            The ``data`` mapping of the GraphQL response.

        Raises:
            httpx.HTTPStatusError: On a non-success status; 429 and 5xx are
                retried up to five attempts before being re-raised.
            httpx.TransportError: On network failure, after the retries.
            RuntimeError: If the response reports GraphQL errors or carries
                no ``data`` object.
        """
        resp = client.post(self.url, json=payload)
        status = resp.status_code
        if status == 429 or status >= 500:
            message = "rate limited" if status == 429 else f"server error {status}"
            raise _RetryableStatusError(message, request=resp.request, response=resp)
        resp.raise_for_status()
        body = resp.json()
        if not isinstance(body, dict):
            raise RuntimeError("stashdb graphql response is not a JSON object")
        if errors := body.get("errors"):
            raise RuntimeError(f"stashdb graphql errors: {errors}")
        data = body.get("data")
        if not isinstance(data, dict):
            raise RuntimeError("stashdb graphql response has no data object")
        return data

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes ordered by ``UPDATED_AT`` descending.

        Args:
            since: When given, iteration stops at the first scene whose
                ``updated`` timestamp is older than this moment.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        yield from self._paginate(
            extra_input={"sort": "UPDATED_AT", "direction": "DESC"},
            since=since,
            limit=limit,
        )

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look up a performer id via ``queryPerformers(input: {name: <name>})``.

        The StashDB ``name`` filter is a case-insensitive substring match.
        The id of an exact (case-insensitive) name match is returned when
        there is one, otherwise the id of the first result.

        Returns:
            The performer id, or ``None`` when the name is blank, the request
            fails, or nothing matches.
        """
        target = name.strip().lower()
        if not target:
            # A blank substring filter matches everyone; the "first result"
            # fallback would then return an arbitrary performer.
            return None
        query = (
            "query QPerformers($input: PerformerQueryInput!) {"
            "  queryPerformers(input: $input) { performers { id name } }"
            "}"
        )
        variables = {"input": {"name": name, "per_page": 5}}
        with self._client() as client:
            try:
                data = self._post(client, {"query": query, "variables": variables})
            except Exception as e:
                # Deliberate best-effort lookup: log the failure, report "not found".
                log.warning("stashdb queryPerformers name=%s failed: %s", name, e)
                return None
        performers = (data.get("queryPerformers") or {}).get("performers") or []
        if not performers:
            return None
        for p in performers:
            if (p.get("name") or "").strip().lower() == target:
                return p.get("id")
        return performers[0].get("id")

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the performer with the given canonical UUID.

        ``SceneQueryInput.performers`` is a ``MultiIDCriterionInput``
        ``{ value, modifier }``; with a single UUID the ``INCLUDES`` modifier
        simply selects that performer's scenes. Results are sorted by
        ``DATE`` descending.
        """
        yield from self._paginate(
            extra_input={
                "performers": {
                    "value": [performer_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,  # performer-scoped pulls take the whole history
            limit=limit,
        )

    def fetch_scenes_for_studio(
        self,
        studio_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the studio with the given canonical UUID.

        Same as ``fetch_scenes_for_performer`` but filters on ``studios``
        (also a ``MultiIDCriterionInput { value, modifier }``).
        """
        yield from self._paginate(
            extra_input={
                "studios": {
                    "value": [studio_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,
            limit=limit,
        )

    def _paginate(
        self,
        *,
        extra_input: dict[str, Any],
        since: datetime | None,
        limit: int | None,
    ) -> Iterator[RawScene]:
        """Walk ``queryScenes`` pages and yield parsed scenes.

        Args:
            extra_input: Sort and filter fields merged into the query input.
            since: Stop at the first scene updated before this moment. This
                is only meaningful when ``extra_input`` sorts by
                ``UPDATED_AT`` descending.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        if limit is not None and limit <= 0:
            # Nothing requested: avoid both the HTTP call and a stray first item.
            return
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                variables = {
                    "input": {
                        "page": page,
                        "per_page": self.per_page,
                        **extra_input,
                    }
                }
                data = self._post(client, {"query": SCENES_QUERY, "variables": variables})
                payload = data.get("queryScenes") or {}
                scenes = payload.get("scenes") or []
                if not scenes:
                    return

                for raw in scenes:
                    if since is not None and _updated_before(raw, since):
                        return
                    parsed = _parse_scene(raw)
                    if parsed is None:
                        continue
                    yield parsed
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return

                # A short page means the listing is exhausted.
                if len(scenes) < self.per_page:
                    return
                page += 1
|
|
|
|
|
|
def _updated_before(raw: dict[str, Any], since: datetime) -> bool:
|
|
upd = raw.get("updated")
|
|
if not upd:
|
|
return False
|
|
try:
|
|
ts = datetime.fromisoformat(upd.replace("Z", "+00:00"))
|
|
except ValueError:
|
|
return False
|
|
if ts.tzinfo is None:
|
|
ts = ts.replace(tzinfo=UTC)
|
|
return ts < since
|
|
|
|
|
|
def _parse_date(value: Any) -> date | None:
|
|
if not value:
|
|
return None
|
|
if isinstance(value, date):
|
|
return value
|
|
text = str(value).strip()
|
|
if not text:
|
|
return None
|
|
try:
|
|
return date.fromisoformat(text[:10])
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
|
|
if not raw:
|
|
return None
|
|
parent = raw.get("parent") or {}
|
|
return RawStudio(
|
|
external_id=raw.get("id"),
|
|
name=raw.get("name") or "Unknown",
|
|
slug=None,
|
|
parent_external_id=parent.get("id"),
|
|
parent_name=parent.get("name"),
|
|
)
|
|
|
|
|
|
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
|
|
perf = raw.get("performer") or {}
|
|
name = perf.get("name")
|
|
if not name:
|
|
return None
|
|
aliases = perf.get("aliases") or []
|
|
if isinstance(aliases, str):
|
|
aliases = [a.strip() for a in aliases.split(",") if a.strip()]
|
|
bd_obj = perf.get("birthdate") or {}
|
|
bd = bd_obj.get("date") if isinstance(bd_obj, dict) else None
|
|
return RawPerformer(
|
|
external_id=perf.get("id"),
|
|
name=name,
|
|
aliases=[a for a in aliases if isinstance(a, str)],
|
|
gender=(perf.get("gender") or "").lower() or None,
|
|
birth_date=_parse_date(bd),
|
|
country=perf.get("country"),
|
|
as_alias_in_scene=raw.get("as") if raw.get("as") and raw.get("as") != name else None,
|
|
)
|
|
|
|
|
|
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
|
|
name = raw.get("name")
|
|
if not name:
|
|
return None
|
|
return RawTag(external_id=raw.get("id"), name=name, slug=None)
|
|
|
|
|
|
def _parse_fingerprint(raw: dict[str, Any]) -> RawFingerprint | None:
|
|
h = raw.get("hash")
|
|
algo = (raw.get("algorithm") or "").lower()
|
|
if not h or algo not in {"phash", "oshash", "md5"}:
|
|
return None
|
|
return RawFingerprint(kind=algo, value=h)
|
|
|
|
|
|
def _extract_cross_refs(urls: list[dict[str, Any]] | None) -> dict[str, str]:
|
|
"""Z `scene.urls` wyciąga znane cross-source ID-ki, np. tpdb_id.
|
|
|
|
Returns: dict[source_name, external_id]. Source name ma być stabilne
|
|
(lower, np. 'tpdb' / 'theporndb').
|
|
"""
|
|
out: dict[str, str] = {}
|
|
for u in urls or []:
|
|
url = u.get("url") or ""
|
|
site_name = ((u.get("site") or {}).get("name") or "").strip().lower()
|
|
if not url:
|
|
continue
|
|
# ThePornDB: .../scenes/<uuid>
|
|
if "theporndb" in site_name or "porndb" in url.lower():
|
|
m = _UUID_RE.search(url)
|
|
if m:
|
|
out["tpdb"] = m.group(0)
|
|
return out
|
|
|
|
|
|
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert one raw StashDB scene object into a ``RawScene``.

    Scenes lacking an id or a title are logged and skipped (``None``).
    Performers, tags and fingerprints that their parsers reject are dropped.
    The release date prefers ``release_date`` and falls back to ``date``.
    The original mapping is kept on the result as ``raw``.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not (scene_id and scene_title):
        log.warning("stashdb scene without id/title — skipping")
        return None

    performer_list = [
        performer
        for item in raw.get("performers") or []
        if (performer := _parse_performer(item)) is not None
    ]
    tag_list = [
        tag
        for item in raw.get("tags") or []
        if (tag := _parse_tag(item)) is not None
    ]
    fingerprint_list = [
        fingerprint
        for item in raw.get("fingerprints") or []
        if (fingerprint := _parse_fingerprint(item)) is not None
    ]

    refs = _extract_cross_refs(raw.get("urls"))
    released = _parse_date(raw.get("release_date") or raw.get("date"))

    if raw.get("duration"):
        length_sec = int(raw["duration"])
    else:
        length_sec = None

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("details"),
        release_date=released,
        duration_sec=length_sec,
        code=raw.get("code"),
        director=raw.get("director"),
        url=None,
        studio=_parse_studio(raw.get("studio")),
        performers=performer_list,
        tags=tag_list,
        fingerprints=fingerprint_list,
        cross_source_refs=refs,
        raw=raw,
    )
|