goon/app/connectors/base.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

187 lines
5.9 KiB
Python

"""Source-connector contract plus neutral DTOs for raw records.

A connector is responsible for pagination, retries, authorization and delta handling.
It returns a stream of RawScene objects (with performers/studio/tags optionally
pre-expanded in inline fields). All DB and normalization mechanics live higher up,
in the ingest pipeline.
"""
from __future__ import annotations
import abc
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
from app.models.source import SourceKind
class RawTag(BaseModel):
    """Raw tag record as emitted by a connector.

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Optional identifiers; only `name` is required.
    external_id: str | None = None
    name: str
    slug: str | None = None
class RawStudio(BaseModel):
    """Raw studio record as emitted by a connector.

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``). Only `name` is required.
    """

    model_config = ConfigDict(extra="allow")

    external_id: str | None = None
    name: str
    slug: str | None = None

    # Optional parent-studio reference, by ID and/or by name.
    parent_external_id: str | None = None
    parent_name: str | None = None

    network: str | None = None
    homepage_url: str | None = None
class RawPerformer(BaseModel):
    """Raw performer record as emitted by a connector.

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``). Only `name` is required.
    """

    model_config = ConfigDict(extra="allow")

    external_id: str | None = None
    name: str
    aliases: list[str] = Field(default_factory=list)  # fresh list per instance
    gender: str | None = None
    birth_date: date | None = None
    country: str | None = None
    # Name used in this particular scene (e.g. "Mia M.").
    as_alias_in_scene: str | None = None
class RawFingerprint(BaseModel):
    """Raw file fingerprint: a `kind` label plus its `value`."""

    # One of: phash | oshash | md5 (not enforced by the type, it is a plain str).
    kind: str
    value: str
class RawPlaybackSource(BaseModel):
    """Playback link for a scene on a specific tube site / aggregator.

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``). `origin` and `page_url` are required.
    """

    model_config = ConfigDict(extra="allow")

    # Short source name, e.g. 'tube:hqpornercom', 'mangoporn:doodstream'.
    origin: str
    # URL of the tube page that hosts the player (deep link).
    page_url: str

    embed_url: str | None = None
    stream_url: str | None = None
    quality: str | None = None
    duration_sec: int | None = None
    thumbnail_url: str | None = None
    animated_thumbnail_url: str | None = None
class RawScene(BaseModel):
    """Raw scene record as emitted by a connector.

    Related entities (studio, performers, tags, fingerprints, playback
    sources) are carried inline. Fields not declared here are accepted and
    kept on the model (``extra="allow"``). `external_id` and `title` are
    required.
    """

    model_config = ConfigDict(extra="allow")

    external_id: str
    title: str
    description: str | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    code: str | None = None
    director: str | None = None
    url: str | None = None

    # Inline related entities; every list defaults to a fresh empty list.
    studio: RawStudio | None = None
    performers: list[RawPerformer] = Field(default_factory=list)
    tags: list[RawTag] = Field(default_factory=list)
    fingerprints: list[RawFingerprint] = Field(default_factory=list)
    playback_sources: list[RawPlaybackSource] = Field(default_factory=list)

    # Mapping source_name -> external_id as declared by this source. Used for
    # path 2 in the resolver (cross-source UUID match). The key matches
    # `Source.name` in the DB (e.g. 'tpdb', 'stashdb').
    cross_source_refs: dict[str, str] = Field(default_factory=dict)

    # Original API payload; goes into external_records.raw.
    raw: dict[str, Any] = Field(default_factory=dict)
class BaseConnector(abc.ABC):
    """Base class every scene source inherits from.

    `kind` maps 1:1 to SourceKind in the DB.
    """

    # Declared here only as annotations; no values are assigned in this class.
    kind: SourceKind
    name: str

    @abc.abstractmethod
    def fetch_scenes(
        self, *, since: datetime | None = None, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes one at a time.

        Args:
            since: Optional delta filter; when omitted the connector falls
                back to a full fetch.
            limit: Optional integer bound. NOTE(review): presumably caps the
                number of yielded scenes; confirm against implementations.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
# ---------------------------------------------------------------------------
# Movies — a separate entity from scenes, but the same connector pattern
# ---------------------------------------------------------------------------
class RawMovieChapter(BaseModel):
    """A single chapter of a movie (movies are sometimes split into "Part 1/2/3" etc.).

    Chapter identifiers are not canonicalized across sources: they are local to
    the movie and indexed by `chapter_index`. A chapter may link to a separate
    scene (if that scene is known from TPDB/StashDB); that is handled by the
    normalizer higher up.

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    chapter_index: int
    title: str | None = None
    # Optional chapter boundaries.
    start_sec: int | None = None
    end_sec: int | None = None
class RawMovie(BaseModel):
    """Raw movie from a connector; the RawScene counterpart for movies.

    Performers / studio / tags reuse RawPerformer / RawStudio / RawTag (the same
    types in both pipelines). Playback sources are a list of playback mirrors
    (paradisehill primary, possibly other tube sites).

    Fields not declared here are accepted and kept on the model
    (``extra="allow"``). `external_id` and `title` are required.
    """

    model_config = ConfigDict(extra="allow")

    external_id: str
    title: str
    description: str | None = None
    release_year: int | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    director: str | None = None
    country: str | None = None
    rating: float | None = None
    poster_url: str | None = None
    backdrop_url: str | None = None
    url: str | None = None

    # Inline related entities; every list/dict defaults to a fresh empty one.
    studio: RawStudio | None = None
    performers: list[RawPerformer] = Field(default_factory=list)
    tags: list[RawTag] = Field(default_factory=list)
    chapters: list[RawMovieChapter] = Field(default_factory=list)
    playback_sources: list[RawPlaybackSource] = Field(default_factory=list)
    cross_source_refs: dict[str, str] = Field(default_factory=dict)
    raw: dict[str, Any] = Field(default_factory=dict)
class BaseMovieConnector(abc.ABC):
    """Connector for a movie source (paradisehill, psyplay, wp_movies).

    Symmetric to BaseConnector, but yields RawMovie. Each source knows its own
    pagination and ID format; the converter higher up (resolver) takes care of
    deduplication across sources.
    """

    # Declared here only as annotations; no values are assigned in this class.
    kind: SourceKind
    name: str

    @abc.abstractmethod
    def fetch_movies(
        self, *, since: datetime | None = None, limit: int | None = None
    ) -> Iterator[RawMovie]:
        """Yield movies one at a time.

        Args:
            since: Optional; when omitted the connector falls back to a full
                crawl.
            limit: Optional integer bound. NOTE(review): presumably caps the
                number of yielded movies; confirm against implementations.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError