goon/app/api/scenes.py

"""GET /scenes — lista i szczegóły scen z bazy kanonicznej."""
from __future__ import annotations

import logging
import re
import uuid
from typing import Annotated

from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel
from sqlalchemy import distinct, exists, false, func, literal_column, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session

from app.auth import require_api_key

from app.api.schemas import (
    ExternalRefOut,
    PerformerOut,
    PlaybackSourceOut,
    SceneListOut,
    SceneOut,
    StudioOut,
    TagOut,
)
from app.db import get_session
from app.models.favorite_scene import FavoriteScene
from app.models.performer import Performer
from app.models.play_progress import ScenePlayProgress
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, SceneExternalRef, ScenePerformer, SceneTag
from app.models.source import Source, SourceKind
from app.models.studio import Studio
from app.models.tag import Tag

log = logging.getLogger(__name__)

# Every route registered on this router is mounted under /scenes and goes through the
# `require_api_key` dependency.
router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(require_api_key)])


# Values accepted by the `sort` query parameter of `list_scenes`.
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}

# TTL cache for the count of scenes that have a live playback source (used as the total
# of the default, unfiltered list). Per the original author's measurement, the full scan
# over ~1.69M scenes plus EXISTS costs ~950 ms; the number changes slowly and is only
# approximate (pagination header), so a 10-minute cache in the API process memory is an
# acceptable trade-off.
# "ts" holds a time.monotonic() stamp of the last refresh, "val" the cached count.
_DEFAULT_COUNT_CACHE: dict = {"ts": 0.0, "val": 0}
_DEFAULT_COUNT_TTL = 600.0

def _default_scene_count(session: Session) -> int:
    """Return the number of scenes that have at least one live playback source.

    The value is memoised in ``_DEFAULT_COUNT_CACHE`` and served from there while it
    is younger than ``_DEFAULT_COUNT_TTL`` seconds on the monotonic clock. A cached
    value of 0 is never treated as a hit, so a zero count is recomputed on every call.
    """
    from time import monotonic

    cache = _DEFAULT_COUNT_CACHE
    stamp = monotonic()
    cached = cache["val"]
    if cached and (stamp - cache["ts"]) < _DEFAULT_COUNT_TTL:
        return cached

    # "Live" means a playback_source row for the scene whose dead_at is NULL.
    live_playback = exists(
        select(1).where(
            PlaybackSource.scene_id == Scene.id,
            PlaybackSource.dead_at.is_(None),
        )
    )
    scenes_with_playback = select(Scene.id).where(live_playback).subquery()
    total = session.execute(
        select(func.count()).select_from(scenes_with_playback)
    ).scalar_one()

    cache["ts"] = stamp
    cache["val"] = total
    return total


# The blacklists (performer/studio/tag) are usually EMPTY (self-hosted, single-user).
# Even so, three NOT EXISTS clauses were appended to EVERY filtered scene list and were
# evaluated per row — with a filter such as a large tag or has_playback the planner walks
# ~176k scenes, so these always-empty clauses cost ~3.4 s (mega-tag "anal": 6.7 s → 3.3 s
# once skipped; figures from the original author's measurements).
# We cache the emptiness check (TTL 5 min); once someone adds a blacklist entry, the
# clauses come back within 5 minutes. See reference_scenes_list_perf / task #22.
# "checked" tells a real cached result apart from the initial placeholder state.
_BLACKLIST_EMPTY_CACHE: dict = {"ts": 0.0, "val": False, "checked": False}
_BLACKLIST_EMPTY_TTL = 300.0


def _blacklists_empty(session: Session) -> bool:
    """Return True when ALL three blacklists are empty.

    Callers use this to skip the NOT EXISTS blacklist clauses. The answer is cached in
    ``_BLACKLIST_EMPTY_CACHE`` for ``_BLACKLIST_EMPTY_TTL`` seconds (monotonic clock);
    the ``checked`` flag marks whether the cache holds a real result yet.
    """
    from time import monotonic

    from app.models.blacklist import (
        BlacklistedPerformer,
        BlacklistedStudio,
        BlacklistedTag,
    )

    cache = _BLACKLIST_EMPTY_CACHE
    stamp = monotonic()
    still_fresh = cache["checked"] and (stamp - cache["ts"]) < _BLACKLIST_EMPTY_TTL
    if still_fresh:
        return cache["val"]

    # One round-trip: does any of the three blacklist tables contain a row?
    any_entry = (
        exists(select(1).select_from(BlacklistedPerformer))
        | exists(select(1).select_from(BlacklistedStudio))
        | exists(select(1).select_from(BlacklistedTag))
    )
    is_empty = not session.execute(select(any_entry)).scalar_one()

    cache.update(ts=stamp, val=is_empty, checked=True)
    return is_empty


def _split_csv(raw: str | None) -> list[str]:
    if not raw:
        return []
    return [s.strip() for s in raw.split(",") if s.strip()]


@router.get("", response_model=SceneListOut)
def list_scenes(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Wyszukiwanie po title_normalized (trgm)"),
    studio_slug: str | None = Query(default=None, description="DEPRECATED — użyj studio_slugs"),
    studio_slugs: str | None = Query(
        default=None, description="Comma-separated studio slugs (OR)"
    ),
    tags: str | None = Query(
        default=None,
        description="Comma-separated tag slugs (AND — scena musi mieć wszystkie wybrane tagi)",
    ),
    performer_ids: str | None = Query(
        default=None,
        description="Comma-separated performer UUIDs (AND — scena musi mieć wszystkich wybranych performerów)",
    ),
    has_playback: bool | None = Query(
        default=None, description="True: tylko sceny z ≥1 playback_source"
    ),
    has_animated_thumbnail: bool | None = Query(
        default=None,
        description="True: tylko sceny z ≥1 playback_source z animated_thumbnail_url (hold-to-preview)",
    ),
    min_duration_sec: int | None = Query(default=None, ge=0),
    max_duration_sec: int | None = Query(default=None, ge=0),
    released_within_days: int | None = Query(
        default=None, ge=1,
        description="Tylko sceny released w ostatnich N dniach",
    ),
    min_quality_p: int | None = Query(
        default=None, ge=1,
        description=(
            "Minimum quality (pixele wysokości — 2160 = 4K, 1080 = FullHD). Filtruje "
            "po PlaybackSource.quality (string typu '720p' / '1080p Full HD')."
        ),
    ),
    origin: str | None = Query(
        default=None,
        description=(
            "Filtruj po playback origin (np. 'tube:hqpornercom'). Substring match — "
            "'hqporner' złapie tube:hqpornercom. Diagnostyka per-hoster."
        ),
    ),
    include_stubs: bool = Query(
        default=False,
        description=(
            "False (default): ukrywa sceny-szkielety bez release_date, < 10min, "
            "z jedynym playback z hqporner (~7-min Brazzers trailer clipy zalewają katalog)."
        ),
    ),
    sort: str = Query(default="created_at", description="created_at|release_date|title|studio"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> SceneListOut:
    # Paginated scene listing. Pipeline: validate `sort` → stack the optional WHERE
    # filters onto `base` → pick a count strategy → order → fetch per_page + 1 rows →
    # build the light-mode payload via _build_scenes_out_batch.
    # Raises HTTPException(400) for an unknown `sort` or an unparsable performer UUID.
    # (Kept as a comment rather than a docstring: FastAPI publishes endpoint docstrings
    # as the OpenAPI description.)
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")

    base = select(Scene)

    if q:
        # Case-insensitive substring match on the normalized title.
        # NOTE(review): `%` / `_` inside q are passed through unescaped and therefore act
        # as LIKE wildcards — confirm this is acceptable.
        base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))

    # The deprecated single `studio_slug` is folded into the OR-list of `studio_slugs`.
    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug:
        studio_slug_list.append(studio_slug)
    if studio_slug_list:
        base = base.where(
            Scene.studio_id.in_(
                select(Studio.id).where(Studio.slug.in_(studio_slug_list))
            )
        )

    tag_slug_list = _split_csv(tags)
    # AND between tags: a scene must have ALL selected tags. Each slug gets its own
    # exists() — ticking further filters narrows the results, matching user intuition.
    #
    # PERF (2026-06-07): slug→tag_id is resolved in the application and the filter uses
    # a LITERAL tag_id (NOT a JOIN on Tag.slug). With a literal the planner knows the
    # tag's cardinality from statistics (MCV) → for popular tags (blowjob ~273k scenes)
    # it picks an index walk over ix_scenes_created_at_desc instead of materializing all
    # scene_tags. The slug JOIN hid tag_id from the planner → it assumed the average
    # (8.4M/11541≈726) → bad plan (4-12 s). With the literal: ~20 ms (figures from the
    # original author's measurements). See also the light mode of the batch builder.
    if tag_slug_list:
        id_by_slug = dict(
            session.execute(
                select(Tag.slug, Tag.id).where(Tag.slug.in_(tag_slug_list))
            ).all()
        )
        for slug in tag_slug_list:
            tag_id = id_by_slug.get(slug)
            if tag_id is None:
                base = base.where(false())  # unknown slug → no results
                break
            base = base.where(
                exists(
                    select(1)
                    .select_from(SceneTag)
                    .where(SceneTag.scene_id == Scene.id, SceneTag.tag_id == tag_id)
                )
            )

    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # AND between performers (same approach as for tags).
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1)
                    .select_from(ScenePerformer)
                    .where(
                        ScenePerformer.scene_id == Scene.id,
                        ScenePerformer.performer_id == pid,
                    )
                )
            )

    # Tri-state filter: None leaves the query untouched.
    if has_playback is True:
        # Only scenes with at least one LIVE playback_source (dead_at IS NULL).
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    elif has_playback is False:
        # Only scenes with no live playback_source at all.
        base = base.where(
            ~exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )

    if origin:
        # Substring match on origin — 'hqporner' matches 'tube:hqpornercom'.
        # NOTE(review): as with `q`, LIKE wildcards in `origin` are not escaped.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.origin.ilike(f"%{origin}%"),
                )
            )
        )

    # Blacklists — global exclusions. A scene is dropped if it has ANY blacklisted
    # performer, belongs to a blacklisted studio, or has ANY blacklisted tag.
    # Skipped when all three blacklists are empty (the typical single-user state) — these
    # NOT EXISTS clauses are evaluated per row over ~176k scenes for a mega-tag and cost
    # ~3.4 s for nothing (original author's measurement).
    if not _blacklists_empty(session):
        from app.models.blacklist import (
            BlacklistedPerformer,
            BlacklistedStudio,
            BlacklistedTag,
        )
        base = base.where(
            ~exists(
                select(1)
                .select_from(ScenePerformer)
                .join(BlacklistedPerformer, BlacklistedPerformer.performer_id == ScenePerformer.performer_id)
                .where(ScenePerformer.scene_id == Scene.id)
            )
        )
        base = base.where(
            ~Scene.studio_id.in_(select(BlacklistedStudio.studio_id))
        )
        base = base.where(
            ~exists(
                select(1)
                .select_from(SceneTag)
                .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
                .where(SceneTag.scene_id == Scene.id)
            )
        )

    if has_animated_thumbnail:
        # Only scenes with a live playback source carrying an animated thumbnail URL.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.animated_thumbnail_url.isnot(None),
                )
            )
        )

    if min_duration_sec is not None:
        base = base.where(Scene.duration_sec >= min_duration_sec)
    if max_duration_sec is not None:
        base = base.where(Scene.duration_sec <= max_duration_sec)

    if released_within_days is not None:
        from datetime import date, timedelta
        # Cutoff is computed from the server's local date (date.today()).
        cutoff = date.today() - timedelta(days=released_within_days)
        base = base.where(Scene.release_date >= cutoff)

    if min_quality_p is not None:
        # PlaybackSource.quality is a free-form string — we look for the first run of
        # digits ('1080p', '1080p Full HD', '2160p'). Heuristic: it is enough for the
        # scene to have ONE live playback whose quality number is >= min. '4K'/'UHD' are
        # aliased to 2160.
        from sqlalchemy import Integer, cast, or_
        numeric_q = cast(
            func.coalesce(func.substring(PlaybackSource.quality, r"\d+"), "0"),
            Integer,
        )
        conds = [numeric_q >= min_quality_p]
        if min_quality_p <= 2160:
            conds.append(PlaybackSource.quality.ilike("%4k%"))
            conds.append(PlaybackSource.quality.ilike("%uhd%"))
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.quality.isnot(None),
                    or_(*conds),
                )
            )
        )

    if not include_stubs:
        # Stub-scene heuristic: a tube-only scene WITHOUT release_date AND WITHOUT a
        # canonical (TPDB/StashDB) ref AND WITHOUT any ScenePerformer link. Per the
        # original note, ScenePerformer links are added by the continuous worker
        # (search-by-name → forces a link), so a per-performer search result is NEVER a
        # stub. This only filters anonymous tube-only scenes from the newUrl/categories
        # ingest that were never matched to a performer.
        # NOTE(review): the `include_stubs` Query description above describes a different
        # heuristic (no release_date, < 10 min, hqporner-only playback) than the one
        # implemented here — confirm which one is intended.
        canonical_exists = exists(
            select(1)
            .select_from(SceneExternalRef)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id == Scene.id)
            .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
        )
        has_performer = exists(
            select(1).where(ScenePerformer.scene_id == Scene.id)
        )
        # NOT a stub when: has a canonical ref OR has a release_date OR has a performer.
        base = base.where(
            Scene.release_date.is_not(None) | canonical_exists | has_performer
        )

    # True only when no user filter is active (stub hiding stays on).
    _is_pure_default = (
        not include_stubs and not q and not studio_slug_list and not tag_slug_list
        and not perf_id_strings and origin is None and has_playback is None
        and not has_animated_thumbnail and min_duration_sec is None
        and max_duration_sec is None and released_within_days is None
        and min_quality_p is None
    )
    # Count strategy:
    #  - PURE default: cached full catalogue counter (TTL 10 min).
    #  - FILTERED: we do NOT count exactly. A bounded count over the EXISTS filters was
    #    the dominant cost (~4 s for has_playback / min_duration / a large tag) and the
    #    plan was UNSTABLE (literal LIMIT + count-over-PK helped in some cases, but the
    #    planner still sometimes scans the whole set instead of stopping early). Mobile
    #    paginates by `has_more` (per_page+1 fetch), NOT by `total` — `total` is only the
    #    "N+" counter in the UI. It is derived from has_more AFTER the fetch (see below):
    #    a lower bound + an "there is more" flag. This removes the whole count cost from
    #    every filtered list.
    # NOTE(review): _default_scene_count counts scenes with a live playback source,
    # whereas the pure-default query built here applies the stub (and blacklist) filters
    # but no has_playback filter — the cached total is only an approximation of this list.
    total_capped = False
    total: int | None = _default_scene_count(session) if _is_pure_default else None

    # Sort: always tie-break on created_at desc for pagination determinism.
    if sort == "release_date":
        ordered = base.order_by(
            Scene.release_date.desc().nullslast(), Scene.created_at.desc()
        )
    elif sort == "title":
        ordered = base.order_by(Scene.title_normalized.asc(), Scene.created_at.desc())
    elif sort == "studio":
        # Scenes without a studio go last; within a studio — newest first.
        ordered = (
            base.outerjoin(Studio, Studio.id == Scene.studio_id)
            .order_by(
                Studio.name_normalized.asc().nullslast(),
                Scene.release_date.desc().nullslast(),
                Scene.created_at.desc(),
            )
        )
    else:  # created_at
        ordered = base.order_by(
            Scene.created_at.desc(), Scene.release_date.desc().nullslast()
        )

    # Fetch per_page+1 — the presence of the (per_page+1)-th row means there is a next
    # page. This is the source of truth for pagination (mobile getNextPageParam),
    # independent of the bounded `total`. The extra row is cut off before serialization.
    # LIMIT/OFFSET are literals (NOT bound params) — see above: a parameterized LIMIT
    # breaks early termination and with EXISTS filters the planner does gather-all+sort
    # (seconds) instead of a limit-aware index walk over `ix_scenes_created_at_desc`.
    # page/per_page are validated ints (Query ge=1, le=200), so literal_column is safe.
    _off = (page - 1) * per_page
    rows = (
        session.execute(
            ordered.offset(literal_column(str(_off))).limit(literal_column(str(per_page + 1)))
        )
        .scalars()
        .all()
    )
    has_more = len(rows) > per_page
    rows = rows[:per_page]

    # Filtered lists: total = lower bound from the rows seen so far, and
    # total_capped=has_more gives the UI its "N+" (a next page exists). No separate
    # count query.
    if total is None:
        total = (page - 1) * per_page + len(rows)
        total_capped = has_more

    items = _build_scenes_out_batch(session, list(rows), light=True)

    return SceneListOut(
        items=items,
        total=total,
        page=page,
        per_page=per_page,
        has_more=has_more,
        total_capped=total_capped,
    )


@router.get("/{scene_id}", response_model=SceneOut)
def get_scene(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneOut:
    # Primary-key lookup; a missing row is reported as HTTP 404, otherwise the full
    # (non-light) detail payload is assembled.
    found = session.get(Scene, scene_id)
    if found is not None:
        return _build_scene_out(session, found)
    raise HTTPException(status_code=404, detail="scene not found")


def _needs_proxy(url: str) -> bool:
    """Wszystkie thumbnaile z playback_sources są proxowane przez backend.
    Większość CDN-ów porn-tube'ów wymaga Refera (hqporner, mypornerleak/58img,
    inne sxyprn/eporner CDN-y) — expo-image nie wysyła Referera.
    Self-hosted lub backend-internal URL-e (zaczynające się od `/`) skipujemy."""
    return url.startswith("http") and not url.startswith("/proxy/")


def _wrap_image_proxy(url: str, referer: str) -> str:
    """Rewrite a thumbnail URL to ``/proxy/img/{token}/img.{ext}``.

    The token is produced by ``make_token`` from the URL and the referer, so the client
    never needs to know the Referer itself. A long TTL (30 days) is used because, per
    the original note, thumbnails are stable and short TTLs would only churn the cache.
    The original file extension is kept in the path (query string ignored, ``jpg`` when
    there is none) so the HTTP Content-Type can be recognised.
    """
    import os.path as _osp

    from app.api.stream_proxy import make_token

    thirty_days_sec = 30 * 24 * 3600
    signed = make_token(url, referer, ttl_sec=thirty_days_sec)

    path_part = url.partition("?")[0]
    _, dotted_ext = _osp.splitext(path_part)
    extension = dotted_ext.lstrip(".") or "jpg"
    return f"/proxy/img/{signed}/img.{extension}"


def _build_scenes_out_batch(
    session: Session, scenes: list[Scene], *, light: bool = False
) -> list[SceneOut]:
    """Batch-fetch wszystkich relacji dla N scen w 7 zapytaniach (zamiast 7×N).

    Eliminuje N+1 z `_build_scene_out` w listach scen — `/scenes?per_page=24` szło
    z ~9.6s do <500ms. Pojedyncza scena (`/scenes/{id}`) nadal używa `_build_scene_out`
    bo overhead na batch nie ma sensu dla N=1.

    `light=True` (listy/grid): pomija `tags` i `external_refs` (kafelek SceneTile ich
    NIE używa, a SceneDetail re-fetchuje pełną scenę osobno) i ślimaczy `playback_sources`
    do 1 wpisu z samą miniaturką (kafelek czyta tylko thumbnail_url/animated_thumbnail_url).
    Mniej DB + mniej payloadu + szybszy parse na kliencie (perf 2026-06-07).
    """
    from collections import defaultdict
    if not scenes:
        return []

    scene_ids = [s.id for s in scenes]
    studio_ids = list({s.studio_id for s in scenes if s.studio_id is not None})

    # 1) Studios
    studios_by_id: dict = {}
    if studio_ids:
        for st in session.execute(
            select(Studio).where(Studio.id.in_(studio_ids))
        ).scalars():
            studios_by_id[st.id] = st

    # 2) Performers
    perf_rows = session.execute(
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id.in_(scene_ids))
        .order_by(ScenePerformer.position.asc().nullslast())
    ).all()
    performers_by_scene: dict = defaultdict(list)
    for sp, p in perf_rows:
        performers_by_scene[sp.scene_id].append(
            PerformerOut(
                id=p.id,
                canonical_name=p.canonical_name,
                slug=p.slug,
                gender=p.gender.value if p.gender else None,
                as_alias=sp.as_alias,
            )
        )

    # 3) Tags + 4) External refs — kafelek listy ich nie używa; w light mode pomijamy
    # (SceneDetail re-fetchuje pełną scenę przez /scenes/{id}).
    tags_by_scene: dict = defaultdict(list)
    refs_by_scene: dict = defaultdict(list)
    if not light:
        tag_rows = session.execute(
            select(SceneTag.scene_id, Tag)
            .join(Tag, Tag.id == SceneTag.tag_id)
            .where(SceneTag.scene_id.in_(scene_ids))
        ).all()
        for sid, t in tag_rows:
            tags_by_scene[sid].append(TagOut.model_validate(t))

        ref_rows = session.execute(
            select(SceneExternalRef, Source)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id.in_(scene_ids))
        ).all()
        for ref, src in ref_rows:
            refs_by_scene[ref.scene_id].append(
                ExternalRefOut(
                    source=src.name,
                    external_id=ref.external_id,
                    url=ref.url,
                    last_seen=ref.last_seen,
                )
            )

    # 5) Playback sources. Light mode: tylko miniaturka (jedna na scenę) — kafelek
    # czyta wyłącznie playback_sources[].thumbnail_url / animated_thumbnail_url.
    pb_by_scene: dict = defaultdict(list)
    if light:
        pb_light = session.execute(
            select(
                PlaybackSource.scene_id,
                PlaybackSource.thumbnail_url,
                PlaybackSource.animated_thumbnail_url,
                PlaybackSource.page_url,
            )
            .where(
                PlaybackSource.scene_id.in_(scene_ids),
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        ).all()
        # Pierwsza miniaturka + pierwszy animated per scena (1 slim wpis).
        thumb_by_scene: dict = {}
        anim_by_scene: dict = {}
        for sid, thumb, anim, page_url in pb_light:
            if sid not in thumb_by_scene and thumb:
                thumb_by_scene[sid] = (thumb, page_url)
            if sid not in anim_by_scene and anim:
                anim_by_scene[sid] = (anim, page_url)
        for sid in scene_ids:
            t = thumb_by_scene.get(sid)
            a = anim_by_scene.get(sid)
            if not t and not a:
                continue
            t_url = t[0] if t else None
            a_url = a[0] if a else None
            ref = (t or a)[1]
            if t_url and _needs_proxy(t_url):
                t_url = _wrap_image_proxy(t_url, ref)
            if a_url and _needs_proxy(a_url):
                a_url = _wrap_image_proxy(a_url, ref)
            # id/origin/page_url wymagane przez schemat ale nieużywane przez kafelek
            # (SceneDetail re-fetchuje pełne źródła) — dummy sentinel.
            pb_by_scene[sid].append(
                PlaybackSourceOut(
                    id=uuid.UUID(int=0), origin="", page_url="",
                    thumbnail_url=t_url, animated_thumbnail_url=a_url,
                )
            )
    else:
        pb_rows = session.execute(
            select(PlaybackSource)
            .where(
                PlaybackSource.scene_id.in_(scene_ids),
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        ).scalars().all()
        for p in pb_rows:
            out = PlaybackSourceOut.model_validate(p)
            if out.thumbnail_url and _needs_proxy(out.thumbnail_url):
                out.thumbnail_url = _wrap_image_proxy(out.thumbnail_url, p.page_url)
            if out.animated_thumbnail_url and _needs_proxy(out.animated_thumbnail_url):
                out.animated_thumbnail_url = _wrap_image_proxy(out.animated_thumbnail_url, p.page_url)
            pb_by_scene[p.scene_id].append(out)

    # 6) Progress
    progress_by_scene: dict = {}
    for prog in session.execute(
        select(ScenePlayProgress).where(ScenePlayProgress.scene_id.in_(scene_ids))
    ).scalars():
        progress_by_scene[prog.scene_id] = prog

    # 7) Favorites
    fav_scene_ids: set = set(
        session.execute(
            select(FavoriteScene.scene_id).where(
                FavoriteScene.scene_id.in_(scene_ids)
            )
        ).scalars()
    )

    out: list[SceneOut] = []
    for scene in scenes:
        studio_out = None
        if scene.studio_id is not None and scene.studio_id in studios_by_id:
            studio_out = StudioOut.model_validate(studios_by_id[scene.studio_id])
        progress = progress_by_scene.get(scene.id)
        out.append(
            SceneOut(
                id=scene.id,
                title=scene.title,
                slug=scene.slug,
                release_date=scene.release_date,
                duration_sec=scene.duration_sec,
                description=scene.description,
                code=scene.code,
                director=scene.director,
                studio=studio_out,
                performers=performers_by_scene.get(scene.id, []),
                tags=tags_by_scene.get(scene.id, []),
                external_refs=refs_by_scene.get(scene.id, []),
                playback_sources=pb_by_scene.get(scene.id, []),
                created_at=scene.created_at,
                last_played_at=progress.last_played_at if progress else None,
                finished=progress.finished if progress else False,
                position_sec=progress.position_sec if progress else 0,
                is_favorite=scene.id in fav_scene_ids,
            )
        )
    return out


def _build_scene_out(session: Session, scene: Scene) -> SceneOut:
    """Build the full ``SceneOut`` payload for a single scene (detail view).

    Loads the studio, performers (by position, NULLs last), tags, external refs, live
    playback sources, play progress and the favourite flag. Playback sources are
    collapsed to one entry per origin and then ordered so that origins flagged by
    ``is_vps_blocked_fallback`` come after the rest.
    """
    scene_pk = scene.id

    # Studio is optional; a studio_id without a matching row simply yields no studio.
    studio_row = session.get(Studio, scene.studio_id) if scene.studio_id is not None else None
    studio_out: StudioOut | None = (
        StudioOut.model_validate(studio_row) if studio_row is not None else None
    )

    performers_out: list[PerformerOut] = [
        PerformerOut(
            id=person.id,
            canonical_name=person.canonical_name,
            slug=person.slug,
            gender=person.gender.value if person.gender else None,
            as_alias=link.as_alias,
        )
        for link, person in session.execute(
            select(ScenePerformer, Performer)
            .join(Performer, Performer.id == ScenePerformer.performer_id)
            .where(ScenePerformer.scene_id == scene_pk)
            .order_by(ScenePerformer.position.asc().nullslast())
        ).all()
    ]

    tags_stmt = (
        select(Tag)
        .join(SceneTag, SceneTag.tag_id == Tag.id)
        .where(SceneTag.scene_id == scene_pk)
    )
    tags_out = [
        TagOut.model_validate(tag) for tag in session.execute(tags_stmt).scalars().all()
    ]

    refs_out = []
    for ext_ref, source_row in session.execute(
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id == scene_pk)
    ).all():
        refs_out.append(
            ExternalRefOut(
                source=source_row.name,
                external_id=ext_ref.external_id,
                url=ext_ref.url,
                last_seen=ext_ref.last_seen,
            )
        )

    live_stmt = (
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id == scene_pk,
            PlaybackSource.dead_at.is_(None),  # hide dead links
        )
        .order_by(PlaybackSource.origin.asc())
    )
    live_sources = session.execute(live_stmt).scalars().all()

    # Collapse sources that share an origin (hoster). A merged scene often aggregates
    # several uploads from ONE tube (re-encodes / 4K versions: bug report aa79a995
    # "2 links, both to porntrex" = the same scene in std + 4K) — in the UI these are
    # indistinguishable links to the same hoster (resolved by the same extractor). Keep
    # the single best one per origin: prefer a duration matching the scene (a real
    # match) → any duration → the first one (stable, the query is origin-asc). Dead
    # sources are already filtered out (dead_at).
    def _pick_rank(src: PlaybackSource) -> tuple[int, int]:
        close_to_scene = bool(
            scene.duration_sec
            and src.duration_sec
            and abs(src.duration_sec - scene.duration_sec) <= 5
        )
        return (0 if close_to_scene else 1, 0 if src.duration_sec else 1)

    chosen_by_origin: dict[str, PlaybackSource] = {}
    for candidate in live_sources:
        bucket = candidate.origin or ""
        incumbent = chosen_by_origin.get(bucket)
        if incumbent is not None and _pick_rank(candidate) >= _pick_rank(incumbent):
            continue  # the current pick is at least as good — keep it
        chosen_by_origin[bucket] = candidate

    playback_out: list[PlaybackSourceOut] = []
    for picked in chosen_by_origin.values():
        item = PlaybackSourceOut.model_validate(picked)
        # Route thumbnail URLs through the backend image proxy when the CDN needs a
        # Referer (hqporner — fastporndelivery returns 403 without the Referer header,
        # and expo-image does not send one by default). The token has a 30-day TTL
        # because thumbnails are stable.
        if item.thumbnail_url and _needs_proxy(item.thumbnail_url):
            item.thumbnail_url = _wrap_image_proxy(item.thumbnail_url, picked.page_url)
        if item.animated_thumbnail_url and _needs_proxy(item.animated_thumbnail_url):
            item.animated_thumbnail_url = _wrap_image_proxy(
                item.animated_thumbnail_url, picked.page_url
            )
        playback_out.append(item)

    # Rank natively resolvable sources BEFORE the WebView fallback ones (IP-bound /
    # ad-heavy: fpoxxx, pornxpph, pornhub...). The query was alphabetical by origin, so
    # e.g. the fpoxxx WebView showed up before a working freshporno (bug report
    # 2026-06-07). Stable sort: native (0) → fallback (1), tie-break on origin.
    from app.extractors import is_vps_blocked_fallback

    def _fallback_rank(origin: str | None) -> int:
        if not origin:
            return 1
        _, colon, after_colon = origin.partition(":")
        sitetag = after_colon if colon else origin
        return 1 if is_vps_blocked_fallback(sitetag) else 0

    playback_out.sort(key=lambda entry: (_fallback_rank(entry.origin), entry.origin or ""))

    progress = session.get(ScenePlayProgress, scene_pk)
    is_fav = session.get(FavoriteScene, scene_pk) is not None

    if progress:
        last_played_at = progress.last_played_at
        finished = progress.finished
        position_sec = progress.position_sec
    else:
        last_played_at, finished, position_sec = None, False, 0

    return SceneOut(
        id=scene.id,
        title=scene.title,
        slug=scene.slug,
        release_date=scene.release_date,
        duration_sec=scene.duration_sec,
        description=scene.description,
        code=scene.code,
        director=scene.director,
        studio=studio_out,
        performers=performers_out,
        tags=tags_out,
        external_refs=refs_out,
        playback_sources=playback_out,
        created_at=scene.created_at,
        last_played_at=last_played_at,
        finished=finished,
        position_sec=position_sec,
        is_favorite=is_fav,
    )


@router.delete("/{scene_id}/tags/{tag_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_tag_from_scene(
    scene_id: uuid.UUID,
    tag_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene↔tag link (e.g. the user decided the tag is wrong for this scene).

    Idempotent: a missing link counts as success. The Tag itself is not deleted — other
    scenes may use it; it stays in the tag dictionary.
    """
    link_stmt = select(SceneTag).where(
        SceneTag.scene_id == scene_id, SceneTag.tag_id == tag_id
    )
    link = session.execute(link_stmt).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()


@router.delete(
    "/{scene_id}/performers/{performer_id}",
    status_code=status.HTTP_204_NO_CONTENT,
)
def remove_performer_from_scene(
    scene_id: uuid.UUID,
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Detach a performer from a scene (dedup false-match left the wrong person).

    Idempotent: a missing link counts as success. The Performer itself is kept.
    Useful e.g. when a fuzzy alias match on "Bella" pulled Anna Bella's scenes
    under Bad Bella, or Miss Teela on xnxx got attached to scenes she is not in
    (reports from 2026-05-10).
    """
    from app.models.scene import ScenePerformer

    link_query = select(ScenePerformer).where(
        ScenePerformer.scene_id == scene_id,
        ScenePerformer.performer_id == performer_id,
    )
    link = session.execute(link_query).scalar_one_or_none()
    if link is not None:
        # Something to remove: delete the link row and persist immediately.
        session.delete(link)
        session.commit()


class EnrichTagsOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/enrich-tags."""
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Number of scene↔tag links actually inserted by this call.
    added: int
    # Site tag of the tube source that was fetched; None when no usable source exists.
    tube_used: str | None
    # Tag names extracted from the tube page (all of them, not only the newly linked ones).
    tags: list[str]


@router.post("/{scene_id}/enrich-tags", response_model=EnrichTagsOut)
def enrich_tags_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichTagsOut:
    """Scrape tags from one of the scene's tube pages and attach the missing ones.

    Fetches the page HTML of a live (not dead) tube playback source of this
    scene, extracts its tags (categories/tags) and adds the missing scene↔tag
    links.

    The mobile client calls this when opening SceneDetail if the scene has no
    tags AND has a tube source with a supported extractor (porntrex/youporn/
    xvideos/xnxx/redtube/xhamster/eporner).

    Idempotent: repeating the call with the same tags inserts nothing, because
    already existing (scene_id, tag_id) pairs are skipped. The source is picked
    by a priority list (mainstream tubes are more reliable than aggregators),
    falling back to any tube source whose site tag is in ``EXTRACTORS``.

    Returns:
        EnrichTagsOut where ``added`` is the number of links inserted,
        ``tube_used`` is the site tag of the fetched source (None when no
        usable source exists) and ``tags`` is every tag name extracted from the
        page. A failed fetch or an empty extraction yields ``added=0`` and
        ``tags=[]``.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS, extract_tags
    from app.models.playback_source import PlaybackSource
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Priority: mainstream tubes (rich metadata) > niche (fewer tags or garbage).
    PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
                "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
    sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    ).scalars().all()

    # First source seen for every origin, so each priority entry resolves to the
    # same row the original "first match in query order" scan would have picked.
    first_by_origin: dict[str | None, PlaybackSource] = {}
    for src in sources:
        first_by_origin.setdefault(src.origin, src)

    # NOTE(review): the priority pick does not check EXTRACTORS membership, so a
    # prioritized site without an extractor is still fetched — confirm intended.
    chosen: PlaybackSource | None = next(
        (
            first_by_origin[f"tube:{site}"]
            for site in PRIORITY
            if f"tube:{site}" in first_by_origin
        ),
        None,
    )
    if chosen is None:
        # Fallback: any tube source that has an extractor. The query above does
        # not filter on origin, so tolerate rows whose origin is NULL instead of
        # crashing on ``None.startswith``.
        for src in sources:
            origin = src.origin or ""
            if not origin.startswith("tube:"):
                continue
            if origin.split(":", 1)[1] in EXTRACTORS:
                chosen = src
                break

    if chosen is None:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=None, tags=[])

    sitetag = chosen.origin.split(":", 1)[1]
    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except (TubePageError, Exception) as e:
        # Best-effort endpoint: any fetch problem is logged and reported as
        # "nothing added" rather than surfaced as an error to the client.
        log.warning("enrich-tags fetch failed for %s: %s", chosen.page_url, e)
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    tag_names = extract_tags(sitetag, r.text)
    if not tag_names:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    # Upsert: for every tag find/create the Tag, then add the SceneTag idempotently.
    # PostgreSQL INSERT ... ON CONFLICT DO NOTHING is used instead of ORM
    # session.add() because `resolve_tag` calls session.flush() inside the loop,
    # emitting pending SceneTag INSERTs from earlier iterations — when two
    # concurrent enrich-tags requests collide on the same (scene_id, tag_id), the
    # second flush gets a UniqueViolation (GOON-H, 4 events in 10h despite the
    # earlier seen_tag_ids fix). ON CONFLICT skips silently.
    from sqlalchemy.dialects.postgresql import insert as pg_insert
    added = 0
    seen_tag_ids: set = set()
    for name in tag_names:
        norm = NormalizedTag(name=name, slug=slugify(name), external_id=None)
        tag = resolve_tag(session, norm=norm)
        if tag is None or tag.id in seen_tag_ids:
            # Unresolvable tag, or a second name that resolved to the same Tag.
            continue
        seen_tag_ids.add(tag.id)
        stmt = (
            pg_insert(SceneTag.__table__)
            .values(scene_id=scene_id, tag_id=tag.id, source_id=None)
            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
        )
        result = session.execute(stmt)
        # rowcount == 1 when the row was really inserted, 0 when ON CONFLICT skipped it.
        if result.rowcount and result.rowcount > 0:
            added += 1
    session.commit()
    return EnrichTagsOut(scene_id=scene_id, added=added, tube_used=sitetag, tags=tag_names)


class EnrichDurationOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/enrich-duration."""
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Scene duration in seconds (already stored or freshly extracted); None when nothing was found.
    duration_sec: int | None
    # Site tag of the source whose page yielded the duration; None when no fetch produced one.
    tube_used: str | None


@router.post("/{scene_id}/enrich-duration", response_model=EnrichDurationOut)
def enrich_duration_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichDurationOut:
    """Fill in a scene's missing duration from one of its tube pages.

    All known tubes expose the duration on the detail page (og:video:duration
    or LD-JSON ISO 8601). The mobile client calls this when opening SceneDetail
    while scene.duration_sec is null AND the scene has a tube source. For dedup
    the duration is the strongest single signal — without it, scenes with a
    weak title-only score are capped at 0.85 (review queue).

    Idempotent: when the duration is already set it is returned unchanged and
    nothing is fetched. Otherwise live tube sources are tried one by one; the
    first page yielding a positive duration wins and is stored on the scene
    (and on that source if it had none).

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.duration_extract import extract_duration_sec
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    if scene.duration_sec is not None:
        # Already known — nothing to fetch.
        return EnrichDurationOut(
            scene_id=scene_id,
            duration_sec=scene.duration_sec,
            tube_used=None,
        )

    live_tube_query = select(PlaybackSource).where(
        PlaybackSource.scene_id == scene_id,
        PlaybackSource.dead_at.is_(None),
        PlaybackSource.origin.like("tube:%"),
    )
    candidates = session.execute(live_tube_query).scalars().all()

    for candidate in candidates:
        try:
            response = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            response.raise_for_status()
        except (TubePageError, Exception) as exc:
            # A broken source is not fatal; move on to the next one.
            log.debug("enrich-duration fetch failed for %s: %s", candidate.page_url, exc)
            continue

        seconds = extract_duration_sec(response.text)
        if seconds is not None and seconds > 0:
            scene.duration_sec = seconds
            # Also store it on the playback source for parity (handy if
            # per-source duration mismatch detection gets added later).
            if candidate.duration_sec is None:
                candidate.duration_sec = seconds
            session.commit()

            _, colon, site = candidate.origin.partition(":")
            return EnrichDurationOut(
                scene_id=scene_id,
                duration_sec=seconds,
                tube_used=site if colon else None,
            )

    return EnrichDurationOut(scene_id=scene_id, duration_sec=None, tube_used=None)


class EnrichStudioOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/enrich-studio."""
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Studio now linked to the scene; None when no studio could be determined.
    studio_id: uuid.UUID | None
    # Name of that studio; None when there is no studio (or its row could not be loaded).
    studio_name: str | None
    # "pornhatcom" once a pornhat source was selected for fetching; None otherwise.
    tube_used: str | None


@router.post("/{scene_id}/enrich-studio", response_model=EnrichStudioOut)
def enrich_studio_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichStudioOut:
    """Extract the studio (DVD/series) from the scene's pornhat page.

    Pornhat exposes the studio as
    `class="info-video js-ajax-dvd" data-setup='{"title": "Adult Time", ...}'`.
    Other tubes will be supported once their pattern is known — for now only
    pornhat (the cleanest studio metadata among free tubes).

    If the scene already has a studio, it is returned unchanged and nothing is
    fetched. Otherwise the studio is looked up by slug, then by name, and
    created when neither exists; the scene is then linked to it.

    Returns:
        EnrichStudioOut with the linked studio, or with ``studio_id`` and
        ``studio_name`` set to None when there is no live pornhat source, the
        fetch fails, or the page carries no usable studio data.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    import json as _json

    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.models.playback_source import PlaybackSource
    from app.models.studio import Studio
    from app.normalize.text import slugify

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    if scene.studio_id is not None:
        existing = session.get(Studio, scene.studio_id)
        return EnrichStudioOut(
            scene_id=scene_id,
            studio_id=scene.studio_id,
            studio_name=existing.name if existing else None,
            tube_used=None,
        )

    chosen = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin == "tube:pornhatcom",
        )
    ).scalars().first()
    if chosen is None:
        return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used=None)

    # Shared reply for every "pornhat was tried but gave no studio" outcome.
    no_studio = EnrichStudioOut(
        scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom"
    )

    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except (TubePageError, Exception) as e:
        # Best-effort endpoint: fetch problems are logged, not raised.
        log.warning("enrich-studio fetch failed for %s: %s", chosen.page_url, e)
        return no_studio

    m = re.search(
        r"class=\"info-video js-ajax-dvd[^\"]*\"[^>]*data-setup='([^']+)'",
        r.text, re.IGNORECASE,
    )
    if m is None:
        return no_studio
    try:
        data = _json.loads(m.group(1))
    except _json.JSONDecodeError:
        return no_studio

    # The payload comes from a third-party page: valid JSON that is not an
    # object, or whose fields are not strings, means "no studio info" rather
    # than an AttributeError (HTTP 500).
    if not isinstance(data, dict):
        return no_studio
    raw_title = data.get("title")
    name = raw_title.strip() if isinstance(raw_title, str) else ""
    if not name:
        return no_studio
    raw_dir = data.get("dir")
    # Prefer the site's own "dir" value as slug; otherwise derive one from the name.
    slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or slugify(name)

    studio = session.execute(
        select(Studio).where(Studio.slug == slug)
    ).scalar_one_or_none()
    if studio is None:
        # NOTE(review): scalar_one_or_none raises if several studios share this
        # name — confirm Studio.name is unique.
        studio = session.execute(
            select(Studio).where(Studio.name == name)
        ).scalar_one_or_none()
    if studio is None:
        studio = Studio(name=name, slug=slug)
        session.add(studio)
        # Flush so the new studio row is sent to the DB before its id is used.
        session.flush()
    scene.studio_id = studio.id
    session.commit()
    return EnrichStudioOut(
        scene_id=scene_id, studio_id=studio.id, studio_name=studio.name, tube_used="pornhatcom"
    )


class EnrichThumbOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/enrich-thumbnail."""
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Thumbnail URL (already stored or freshly extracted); None when none was found.
    thumbnail_url: str | None
    # Site tag of the source whose page yielded the thumbnail; None when no fetch produced one.
    tube_used: str | None
    # Number of playback sources that received the new thumbnail in this call.
    sources_updated: int


@router.post("/{scene_id}/enrich-thumbnail", response_model=EnrichThumbOut)
def enrich_thumbnail_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichThumbOut:
    """Fetch a thumbnail for a scene from one of its tube detail pages.

    The thumbnail is extracted from the page (og:image / twitter:image /
    LD-JSON thumbnailUrl / KVS html5player). The mobile client auto-calls this
    when opening a SceneDetail without a thumbnail (same as for duration).

    If any live tube source already has a thumbnail, it is returned as-is with
    ``sources_updated=0`` — nothing is fetched or written. Otherwise live tube
    sources are tried one by one; the first extracted thumbnail is stored on
    every live tube source lacking one, so later list views see a thumbnail
    regardless of which source they pick.

    NOTE(review): in the early-return case, sibling sources without a
    thumbnail stay untouched — confirm this is intended.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    live_tube_query = select(PlaybackSource).where(
        PlaybackSource.scene_id == scene_id,
        PlaybackSource.dead_at.is_(None),
        PlaybackSource.origin.like("tube:%"),
    )
    sources = session.execute(live_tube_query).scalars().all()

    known_thumb = next((s.thumbnail_url for s in sources if s.thumbnail_url), None)
    if known_thumb:
        # Already have one — idempotent return.
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=known_thumb,
            tube_used=None,
            sources_updated=0,
        )

    for candidate in sources:
        try:
            response = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            response.raise_for_status()
        except (TubePageError, Exception) as exc:
            # A broken source is not fatal; move on to the next one.
            log.debug("enrich-thumbnail fetch failed for %s: %s", candidate.page_url, exc)
            continue

        thumb = extract_thumbnail_url(response.text)
        if not thumb:
            continue

        # Store on every source without a thumbnail (saves a duplicate fetch).
        lacking = [s for s in sources if not s.thumbnail_url]
        for s in lacking:
            s.thumbnail_url = thumb
        session.commit()

        _, colon, site = candidate.origin.partition(":")
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=thumb,
            tube_used=site if colon else None,
            sources_updated=len(lacking),
        )

    return EnrichThumbOut(
        scene_id=scene_id, thumbnail_url=None, tube_used=None, sources_updated=0
    )