"""StashDB GraphQL connector.

Endpoint: https://stashdb.org/graphql (auth: header ``ApiKey: <key>``).

Scenes are fetched with ``queryScenes(input: {sort, direction, page, per_page})``.
StashDB does not expose a typical "date since" filter in ``SceneQueryInput``, so
the delta is computed client-side: results are sorted by ``UPDATED_AT DESC`` and
iteration stops as soon as a scene with ``updated < since`` is seen.

Key schema fields (per
https://github.com/stashapp/stash-box/blob/master/graphql/schema/schema.graphql)::

    Scene {
      id title details date duration director code
      urls { url site { name } }
      studio { id name parent { id name } }
      performers {
        as
        performer { id name aliases gender birthdate { date } country }
      }
      tags { id name }
      fingerprints { hash algorithm duration }
    }

Cross-reference to TPDB: ``urls[].site.name`` usually contains "ThePornDB" plus
a URL carrying a UUID (format: ``https://theporndb.net/scenes/<uuid>``). That
UUID is extracted as the TPDB cross-reference; ingest_orchestrator can later use
it for path 2 (cross-source UUID).
"""

from __future__ import annotations

import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime
from typing import Any

import httpx
from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from app.config import get_settings
from app.connectors.base import (
    BaseConnector,
    RawFingerprint,
    RawPerformer,
    RawScene,
    RawStudio,
    RawTag,
)
from app.models.source import SourceKind

log = logging.getLogger(__name__)

SCENES_QUERY = """
query QScenes($input: SceneQueryInput!) {
  queryScenes(input: $input) {
    count
    scenes {
      id
      title
      details
      release_date
      date
      duration
      director
      code
      updated
      urls { url site { name } }
      studio { id name parent { id name } }
      performers {
        as
        performer {
          id
          name
          aliases
          gender
          birthdate { date }
          country
        }
      }
      tags { id name }
      fingerprints { hash algorithm duration }
    }
  }
}
"""

# UUID v4-ish pattern (relaxed): 8-4-4-4-12 hex groups, case-insensitive.
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)

# Fingerprint algorithms that are passed through to RawFingerprint.
_KNOWN_FINGERPRINT_ALGOS = frozenset({"phash", "oshash", "md5"})


def _is_retryable_status(exc: BaseException) -> bool:
    """Return True for HTTP status errors that are worth retrying.

    Only request timeouts (408), rate limiting (429) and server-side failures
    (5xx) are considered transient. Other 4xx responses (bad request, invalid
    API key, ...) will not succeed on a retry, so they fail immediately.
    """
    if not isinstance(exc, httpx.HTTPStatusError):
        return False
    status = exc.response.status_code
    return status in (408, 429) or status >= 500


class StashDBConnector(BaseConnector):
    """Connector that pulls scenes (and performer lookups) from StashDB."""

    kind = SourceKind.stashdb
    name = "stashdb"

    def __init__(
        self,
        *,
        api_key: str | None = None,
        url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Configure the connector.

        Args:
            api_key: StashDB API key; falls back to ``settings.stashdb_api_key``.
            url: GraphQL endpoint; falls back to ``settings.stashdb_graphql_url``.
            per_page: Page size requested from ``queryScenes``.
            timeout: HTTP timeout passed to ``httpx.Client``.

        Raises:
            RuntimeError: If no API key is given and none is configured.
        """
        settings = get_settings()
        self.api_key = api_key or settings.stashdb_api_key
        if not self.api_key:
            raise RuntimeError("STASHDB_API_KEY is not set")
        self.url = url or settings.stashdb_graphql_url
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build an ``httpx.Client`` preconfigured with auth and JSON headers."""
        return httpx.Client(
            headers={
                "ApiKey": self.api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        # Retry network-level failures and transient HTTP statuses only;
        # permanent client errors (e.g. 401 for a bad key) are raised at once.
        retry=(
            retry_if_exception_type(httpx.TransportError)
            | retry_if_exception(_is_retryable_status)
        ),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _post(self, client: httpx.Client, payload: dict[str, Any]) -> dict[str, Any]:
        """POST a GraphQL payload and return the response's ``data`` object.

        Raises:
            httpx.HTTPStatusError: On a non-2xx response (after retries for
                transient statuses).
            httpx.TransportError: On a network failure (after retries).
            RuntimeError: If the response carries GraphQL ``errors`` or has no
                ``data``.
        """
        resp = client.post(self.url, json=payload)
        if resp.status_code == 429:
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        body = resp.json()
        if errors := body.get("errors"):
            raise RuntimeError(f"stashdb graphql errors: {errors}")
        data = body.get("data")
        if data is None:
            # Fail with a clear message instead of a KeyError here or an
            # AttributeError later in the caller.
            raise RuntimeError("stashdb graphql response has no data")
        return data

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes ordered by most recently updated first.

        Args:
            since: If given, iteration stops at the first scene whose
                ``updated`` timestamp is older than this value.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        yield from self._paginate(
            extra_input={"sort": "UPDATED_AT", "direction": "DESC"},
            since=since,
            limit=limit,
        )

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look up a performer id via ``queryPerformers(input: {name: <name>})``.

        The StashDB GraphQL ``name`` filter is a case-insensitive substring
        match. The id of a performer whose name matches exactly
        (case-insensitively) is returned if there is one, otherwise the id of
        the first result. Returns ``None`` when the name is blank, the request
        fails, or nothing is found.
        """
        # A blank name would match arbitrary performers; do not guess.
        if not name or not name.strip():
            return None

        query = (
            "query QPerformers($input: PerformerQueryInput!) {"
            " queryPerformers(input: $input) { performers { id name } }"
            "}"
        )
        variables = {"input": {"name": name, "per_page": 5}}
        with self._client() as client:
            try:
                data = self._post(client, {"query": query, "variables": variables})
            except Exception as e:
                # Best-effort lookup: log and report "not found" on any failure.
                log.warning("stashdb queryPerformers name=%s failed: %s", name, e)
                return None

        performers = (data.get("queryPerformers") or {}).get("performers") or []
        if not performers:
            return None

        target = name.strip().lower()
        for p in performers:
            if (p.get("name") or "").strip().lower() == target:
                return p.get("id")
        return performers[0].get("id")

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the performer with the given UUID.

        ``SceneQueryInput.performers`` is a
        ``MultiIDCriterionInput { value, modifier }``. With a single UUID the
        ``INCLUDES`` modifier simply selects that performer's scenes.
        """
        yield from self._paginate(
            extra_input={
                "performers": {
                    "value": [performer_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,  # a performer-scoped pull takes the whole history
            limit=limit,
        )

    def fetch_scenes_for_studio(
        self,
        studio_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the studio with the given UUID.

        Analogous to ``fetch_scenes_for_performer`` but filters on ``studios``
        instead of ``performers``. ``SceneQueryInput.studios`` is a
        ``MultiIDCriterionInput { value, modifier }``.
        """
        yield from self._paginate(
            extra_input={
                "studios": {
                    "value": [studio_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,
            limit=limit,
        )

    def _paginate(
        self,
        *,
        extra_input: dict[str, Any],
        since: datetime | None,
        limit: int | None,
    ) -> Iterator[RawScene]:
        """Walk ``queryScenes`` pages and yield parsed scenes.

        Args:
            extra_input: Extra ``SceneQueryInput`` fields (sort, filters) merged
                over ``page`` / ``per_page``.
            since: Stop at the first scene updated before this moment. Only
                meaningful when results are sorted by ``UPDATED_AT DESC``.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
        """
        if limit is not None and limit <= 0:
            # Nothing requested: avoid a pointless request and an extra item.
            return

        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                variables = {
                    "input": {
                        "page": page,
                        "per_page": self.per_page,
                        **extra_input,
                    }
                }
                data = self._post(client, {"query": SCENES_QUERY, "variables": variables})
                payload = data.get("queryScenes") or {}
                scenes = payload.get("scenes") or []
                if not scenes:
                    return

                for raw in scenes:
                    if since is not None and _updated_before(raw, since):
                        return
                    parsed = _parse_scene(raw)
                    if parsed is None:
                        continue
                    yield parsed
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return

                # A short page is taken to be the last one.
                if len(scenes) < self.per_page:
                    return
                page += 1


def _updated_before(raw: dict[str, Any], since: datetime) -> bool:
    """Return True if the scene's ``updated`` timestamp is older than ``since``.

    A missing, non-string or unparsable ``updated`` value yields False, so such
    scenes never stop the pagination. Naive timestamps on either side are
    interpreted as UTC so the comparison cannot raise ``TypeError``.
    """
    upd = raw.get("updated")
    if not upd or not isinstance(upd, str):
        return False
    try:
        ts = datetime.fromisoformat(upd.replace("Z", "+00:00"))
    except ValueError:
        return False
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=UTC)
    if since.tzinfo is None:
        since = since.replace(tzinfo=UTC)
    return ts < since


def _parse_date(value: Any) -> date | None:
    """Coerce a value to a ``date``; return None when empty or unparsable.

    Strings are parsed from their first 10 characters (``YYYY-MM-DD``).
    """
    if not value:
        return None
    if isinstance(value, datetime):
        # datetime is a date subclass; narrow it to honour the return type.
        return value.date()
    if isinstance(value, date):
        return value
    text = str(value).strip()
    if not text:
        return None
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        return None


def _parse_duration(value: Any) -> int | None:
    """Coerce a duration to whole seconds; return None when empty or invalid."""
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
    """Build a ``RawStudio`` (with optional parent info) from a studio object."""
    if not raw:
        return None
    parent = raw.get("parent") or {}
    return RawStudio(
        external_id=raw.get("id"),
        name=raw.get("name") or "Unknown",
        slug=None,
        parent_external_id=parent.get("id"),
        parent_name=parent.get("name"),
    )


def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
    """Build a ``RawPerformer`` from a performer-appearance object.

    ``raw`` is expected to hold ``performer`` and optionally ``as`` (the alias
    credited in the scene). Returns None when the performer has no name.
    """
    perf = raw.get("performer") or {}
    name = perf.get("name")
    if not name:
        return None

    aliases = perf.get("aliases") or []
    if isinstance(aliases, str):
        # Tolerate a comma-separated string in place of a list.
        aliases = [a.strip() for a in aliases.split(",") if a.strip()]

    bd_obj = perf.get("birthdate") or {}
    bd = bd_obj.get("date") if isinstance(bd_obj, dict) else None

    credited_as = raw.get("as")
    return RawPerformer(
        external_id=perf.get("id"),
        name=name,
        aliases=[a for a in aliases if isinstance(a, str)],
        gender=(perf.get("gender") or "").lower() or None,
        birth_date=_parse_date(bd),
        country=perf.get("country"),
        # Only keep the alias when it differs from the canonical name.
        as_alias_in_scene=credited_as if credited_as and credited_as != name else None,
    )


def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
    """Build a ``RawTag``; return None when the tag has no name."""
    name = raw.get("name")
    if not name:
        return None
    return RawTag(external_id=raw.get("id"), name=name, slug=None)


def _parse_fingerprint(raw: dict[str, Any]) -> RawFingerprint | None:
    """Build a ``RawFingerprint``; return None for empty or unknown algorithms."""
    h = raw.get("hash")
    algo = (raw.get("algorithm") or "").lower()
    if not h or algo not in _KNOWN_FINGERPRINT_ALGOS:
        return None
    return RawFingerprint(kind=algo, value=h)


def _extract_cross_refs(urls: list[dict[str, Any]] | None) -> dict[str, str]:
    """Extract known cross-source ids (e.g. the TPDB id) from ``scene.urls``.

    Returns:
        ``dict[source_name, external_id]``. The source name is meant to be a
        stable lowercase key; this function currently emits only ``'tpdb'``.
    """
    out: dict[str, str] = {}
    for u in urls or []:
        url = u.get("url") or ""
        site_name = ((u.get("site") or {}).get("name") or "").strip().lower()
        if not url:
            continue
        # ThePornDB: .../scenes/<uuid>
        if "theporndb" in site_name or "porndb" in url.lower():
            m = _UUID_RE.search(url)
            if m:
                out["tpdb"] = m.group(0)
    return out


def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert a raw ``queryScenes`` scene object into a ``RawScene``.

    Returns None (and logs a warning) when the scene lacks an id or a title.
    Unparsable nested performers, tags and fingerprints are dropped; a
    malformed duration becomes None instead of aborting the whole scene.
    """
    external_id = raw.get("id")
    title = raw.get("title")
    if not external_id or not title:
        log.warning("stashdb scene without id/title — skipping")
        return None

    performers = [
        parsed
        for p in raw.get("performers") or []
        if (parsed := _parse_performer(p)) is not None
    ]
    tags = [
        parsed_t
        for t in raw.get("tags") or []
        if (parsed_t := _parse_tag(t)) is not None
    ]
    fingerprints = [
        parsed_fp
        for fp in raw.get("fingerprints") or []
        if (parsed_fp := _parse_fingerprint(fp)) is not None
    ]

    cross_refs = _extract_cross_refs(raw.get("urls"))
    # Prefer release_date; fall back to date.
    rel = _parse_date(raw.get("release_date") or raw.get("date"))

    return RawScene(
        external_id=str(external_id),
        title=title,
        description=raw.get("details"),
        release_date=rel,
        duration_sec=_parse_duration(raw.get("duration")),
        code=raw.get("code"),
        director=raw.get("director"),
        url=None,
        studio=_parse_studio(raw.get("studio")),
        performers=performers,
        tags=tags,
        fingerprints=fingerprints,
        cross_source_refs=cross_refs,
        raw=raw,
    )