goon/app/connectors/stashdb.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

405 lines
12 KiB
Python

"""StashDB GraphQL connector.
Endpoint: https://stashdb.org/graphql (auth: header `ApiKey: <key>`)
We query `queryScenes(input: {sort, direction, page, per_page})`. StashDB does not expose a
conventional date-since filter in SceneQueryInput, so the delta is computed client-side: we sort
by UPDATED_AT DESC and stop as soon as `updated < since`.
Key schema fields (per https://github.com/stashapp/stash-box/blob/master/graphql/schema/schema.graphql):
Scene { id title details date duration director code urls{url site{name}}
studio{id name parent{id name}}
performers{ as performer{ id name aliases gender birthdate{date} country } }
tags{ id name }
fingerprints{ hash algorithm duration } }
Cross-reference to TPDB: `urls[].site.name` usually contains "ThePornDB" together with a URL
that carries a UUID (format: https://theporndb.net/scenes/<uuid>). We extract that UUID as the
tpdb cross-ref; ingest_orchestrator can later use it for path 2 (cross-source UUID).
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime
from typing import Any
import httpx
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from app.config import get_settings
from app.connectors.base import (
BaseConnector,
RawFingerprint,
RawPerformer,
RawScene,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# GraphQL document posted by StashDBConnector._paginate for every page of scenes.
# It takes one variable, `$input: SceneQueryInput!`, and requests the total `count`
# plus, for each scene, the fields in the selection set below. `updated` is the field
# read by _updated_before for the client-side "since" cut-off; the remaining scene
# fields are read by the _parse_* helpers and _extract_cross_refs in this module.
SCENES_QUERY = """
query QScenes($input: SceneQueryInput!) {
queryScenes(input: $input) {
count
scenes {
id
title
details
release_date
date
duration
director
code
updated
urls { url site { name } }
studio {
id name
parent { id name }
}
performers {
as
performer {
id
name
aliases
gender
birthdate { date }
country
}
}
tags { id name }
fingerprints { hash algorithm duration }
}
}
}
"""
# Relaxed UUID pattern: five hyphen-separated hex groups (8-4-4-4-12), matched
# case-insensitively. It does not validate the version or variant digits, so any
# UUID-shaped token matches. _extract_cross_refs uses it with `.search()` to pull a
# UUID out of a URL string.
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)
class StashDBConnector(BaseConnector):
    """Connector that reads scene metadata from a StashDB GraphQL endpoint.

    The public ``fetch_*`` methods are generators: they page through the
    ``queryScenes`` query and yield one ``RawScene`` for every scene that
    ``_parse_scene`` accepts.
    """

    kind = SourceKind.stashdb
    name = "stashdb"

    def __init__(
        self,
        *,
        api_key: str | None = None,
        url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Configure the connector.

        Args:
            api_key: StashDB API key; falls back to ``settings.stashdb_api_key``.
            url: GraphQL endpoint; falls back to ``settings.stashdb_graphql_url``.
            per_page: Page size sent with every ``queryScenes`` request.
            timeout: Timeout (seconds) handed to the ``httpx.Client``.

        Raises:
            RuntimeError: If no API key is given and none is configured.
        """
        settings = get_settings()
        self.api_key = api_key or settings.stashdb_api_key
        if not self.api_key:
            raise RuntimeError("STASHDB_API_KEY is not set")
        self.url = url or settings.stashdb_graphql_url
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a new ``httpx.Client`` carrying the auth and JSON headers."""
        return httpx.Client(
            headers={
                "ApiKey": self.api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    # NOTE(review): the retry predicate covers every httpx.HTTPStatusError, so
    # non-retryable 4xx responses raised by raise_for_status() (e.g. 401) are
    # also retried up to 5 attempts. Left unchanged to preserve behaviour.
    @retry(
        retry=retry_if_exception_type((httpx.TransportError, httpx.HTTPStatusError)),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _post(self, client: httpx.Client, payload: dict[str, Any]) -> dict[str, Any]:
        """POST a GraphQL payload and return the ``data`` object of the response.

        Args:
            client: Open HTTP client to send the request with.
            payload: JSON body, i.e. ``{"query": ..., "variables": ...}``.

        Returns:
            The ``data`` member of the decoded JSON response.

        Raises:
            httpx.HTTPStatusError: On HTTP 429 or any other error status
                (after the retries configured on the decorator are exhausted).
            httpx.TransportError: On transport failures, after retries.
            RuntimeError: If the response reports GraphQL ``errors`` or carries
                no ``data`` object.
        """
        resp = client.post(self.url, json=payload)
        if resp.status_code == 429:
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        body = resp.json()
        if errors := body.get("errors"):
            raise RuntimeError(f"stashdb graphql errors: {errors}")
        data = body.get("data")
        if data is None:
            # A missing/null `data` used to surface as KeyError here or as an
            # AttributeError in the caller; fail with one explicit error instead.
            raise RuntimeError("stashdb graphql response has no data")
        return data

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes ordered by ``UPDATED_AT`` descending.

        Args:
            since: When given, iteration stops at the first scene whose
                ``updated`` timestamp is older than this moment.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        yield from self._paginate(
            extra_input={"sort": "UPDATED_AT", "direction": "DESC"},
            since=since,
            limit=limit,
        )

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look up a performer id via ``queryPerformers(input: {name: <name>})``.

        Per the original author's note, the StashDB ``name`` filter is a
        case-insensitive substring match. The id of the performer whose name
        equals *name* exactly (ignoring case and surrounding whitespace) is
        returned if there is one; otherwise the id of the first result.

        Returns:
            The performer id, or ``None`` when the request fails (a warning is
            logged) or no performer is returned.
        """
        query = (
            "query QPerformers($input: PerformerQueryInput!) {"
            "  queryPerformers(input: $input) { performers { id name } }"
            "}"
        )
        variables = {"input": {"name": name, "per_page": 5}}
        with self._client() as client:
            try:
                data = self._post(client, {"query": query, "variables": variables})
            except Exception as e:
                # Deliberate best-effort lookup: any failure means "not found".
                log.warning("stashdb queryPerformers name=%s failed: %s", name, e)
                return None
        performers = (data.get("queryPerformers") or {}).get("performers") or []
        if not performers:
            return None
        target = name.strip().lower()
        for p in performers:
            if (p.get("name") or "").strip().lower() == target:
                return p.get("id")
        return performers[0].get("id")

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the performer with the given canonical UUID.

        The filter is sent as ``performers: {value: [<uuid>], modifier: "INCLUDES"}``
        (SceneQueryInput.performers is a MultiIDCriterionInput). With a single
        UUID this selects that performer's scenes. Results are sorted by
        ``DATE`` descending and no ``since`` cut-off is applied, so the whole
        history is returned (up to *limit*).
        """
        yield from self._paginate(
            extra_input={
                "performers": {
                    "value": [performer_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,  # performer-scoped pulls take the whole history
            limit=limit,
        )

    def fetch_scenes_for_studio(
        self,
        studio_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the studio with the given canonical UUID.

        Same shape as ``fetch_scenes_for_performer`` but filters on ``studios``
        (SceneQueryInput.studios is a MultiIDCriterionInput) instead of
        ``performers``.
        """
        yield from self._paginate(
            extra_input={
                "studios": {
                    "value": [studio_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,
            limit=limit,
        )

    def _paginate(
        self,
        *,
        extra_input: dict[str, Any],
        since: datetime | None,
        limit: int | None,
    ) -> Iterator[RawScene]:
        """Page through ``queryScenes`` and yield parsed scenes.

        Args:
            extra_input: Extra ``SceneQueryInput`` members (sort, filters) merged
                over ``page`` / ``per_page``.
            since: Optional cut-off; iteration ends at the first scene for which
                ``_updated_before`` is true. This is only a valid stop condition
                when results arrive newest-update-first.
            limit: Optional cap on the number of yielded scenes. A limit of zero
                or less yields nothing and sends no request.

        Iteration also ends on an empty page or on a page shorter than
        ``per_page``. Scenes rejected by ``_parse_scene`` are skipped and do not
        count towards *limit*.
        """
        if limit is not None and limit <= 0:
            # Previously the limit was only checked after the first yield, so a
            # zero limit still hit the network and produced one scene.
            return
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                variables = {
                    "input": {
                        "page": page,
                        "per_page": self.per_page,
                        **extra_input,
                    }
                }
                data = self._post(client, {"query": SCENES_QUERY, "variables": variables})
                payload = data.get("queryScenes") or {}
                scenes = payload.get("scenes") or []
                if not scenes:
                    return
                for raw in scenes:
                    if since is not None and _updated_before(raw, since):
                        return
                    parsed = _parse_scene(raw)
                    if parsed is None:
                        continue
                    yield parsed
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return
                if len(scenes) < self.per_page:
                    # Short page: nothing more to fetch.
                    return
                page += 1
def _updated_before(raw: dict[str, Any], since: datetime) -> bool:
upd = raw.get("updated")
if not upd:
return False
try:
ts = datetime.fromisoformat(upd.replace("Z", "+00:00"))
except ValueError:
return False
if ts.tzinfo is None:
ts = ts.replace(tzinfo=UTC)
return ts < since
def _parse_date(value: Any) -> date | None:
if not value:
return None
if isinstance(value, date):
return value
text = str(value).strip()
if not text:
return None
try:
return date.fromisoformat(text[:10])
except ValueError:
return None
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
if not raw:
return None
parent = raw.get("parent") or {}
return RawStudio(
external_id=raw.get("id"),
name=raw.get("name") or "Unknown",
slug=None,
parent_external_id=parent.get("id"),
parent_name=parent.get("name"),
)
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
perf = raw.get("performer") or {}
name = perf.get("name")
if not name:
return None
aliases = perf.get("aliases") or []
if isinstance(aliases, str):
aliases = [a.strip() for a in aliases.split(",") if a.strip()]
bd_obj = perf.get("birthdate") or {}
bd = bd_obj.get("date") if isinstance(bd_obj, dict) else None
return RawPerformer(
external_id=perf.get("id"),
name=name,
aliases=[a for a in aliases if isinstance(a, str)],
gender=(perf.get("gender") or "").lower() or None,
birth_date=_parse_date(bd),
country=perf.get("country"),
as_alias_in_scene=raw.get("as") if raw.get("as") and raw.get("as") != name else None,
)
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
name = raw.get("name")
if not name:
return None
return RawTag(external_id=raw.get("id"), name=name, slug=None)
def _parse_fingerprint(raw: dict[str, Any]) -> RawFingerprint | None:
h = raw.get("hash")
algo = (raw.get("algorithm") or "").lower()
if not h or algo not in {"phash", "oshash", "md5"}:
return None
return RawFingerprint(kind=algo, value=h)
def _extract_cross_refs(urls: list[dict[str, Any]] | None) -> dict[str, str]:
"""Z `scene.urls` wyciąga znane cross-source ID-ki, np. tpdb_id.
Returns: dict[source_name, external_id]. Source name ma być stabilne
(lower, np. 'tpdb' / 'theporndb').
"""
out: dict[str, str] = {}
for u in urls or []:
url = u.get("url") or ""
site_name = ((u.get("site") or {}).get("name") or "").strip().lower()
if not url:
continue
# ThePornDB: .../scenes/<uuid>
if "theporndb" in site_name or "porndb" in url.lower():
m = _UUID_RE.search(url)
if m:
out["tpdb"] = m.group(0)
return out
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert one scene object from the API into a ``RawScene``.

    Scenes without an id or a title are skipped (a warning is logged) and
    ``None`` is returned. Performers, tags and fingerprints that their
    respective parsers reject are dropped. The release date is taken from
    ``release_date``, falling back to ``date``.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not (scene_id and scene_title):
        log.warning("stashdb scene without id/title — skipping")
        return None

    performer_list = [
        performer
        for item in raw.get("performers") or []
        if (performer := _parse_performer(item)) is not None
    ]
    tag_list = [
        tag
        for item in raw.get("tags") or []
        if (tag := _parse_tag(item)) is not None
    ]
    fingerprint_list = [
        fingerprint
        for item in raw.get("fingerprints") or []
        if (fingerprint := _parse_fingerprint(item)) is not None
    ]

    refs = _extract_cross_refs(raw.get("urls"))
    released = _parse_date(raw.get("release_date") or raw.get("date"))

    length = raw.get("duration")
    duration_sec = int(length) if length else None

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("details"),
        release_date=released,
        duration_sec=duration_sec,
        code=raw.get("code"),
        director=raw.get("director"),
        url=None,
        studio=_parse_studio(raw.get("studio")),
        performers=performer_list,
        tags=tag_list,
        fingerprints=fingerprint_list,
        cross_source_refs=refs,
        raw=raw,
    )