goon/app/api/scenes.py
jtrzupek e98ef6577e feat(api): scene hide + merge-duplicate endpoints for long-press actions
POST /scenes/{id}/hide — marks all playback_sources dead so the scene drops out
of has_playback lists (reversible via dead_at; row kept for dedup/refs).
POST /scenes/{keep_id}/merge/{drop_id} — merges drop into keep via scene_merge
(moves refs/performers/tags/fingerprints/playback). Backs the new tile long-press
menu (hide / mark-duplicate) replacing the dead animated-preview gesture.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 09:47:16 +02:00

1230 lines
48 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""GET /scenes — lista i szczegóły scen z bazy kanonicznej."""
from __future__ import annotations
import logging
import re
import uuid
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel
from sqlalchemy import distinct, exists, false, func, literal_column, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.api.schemas import (
ExternalRefOut,
PerformerOut,
PlaybackSourceOut,
SceneListOut,
SceneOut,
StudioOut,
TagOut,
)
from app.db import get_session
from app.models.favorite_scene import FavoriteScene
from app.models.performer import Performer
from app.models.play_progress import ScenePlayProgress
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, SceneExternalRef, ScenePerformer, SceneTag
from app.models.source import Source, SourceKind
from app.models.studio import Studio
from app.models.tag import Tag
log = logging.getLogger(__name__)
router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(require_api_key)])
# Sort keys accepted by the `sort` query parameter of GET /scenes.
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}
# TTL cache for the count of scenes with live playback (default list, no filter). A full scan
# of 1.69M scenes + EXISTS takes ~950ms; the number changes slowly and is only approximate
# (pagination header), so a 10-minute cache in the API process memory is an acceptable trade-off.
# "ts" is the time.monotonic() reading of the last refresh, "val" the cached count.
_DEFAULT_COUNT_CACHE: dict = {"ts": 0.0, "val": 0}
_DEFAULT_COUNT_TTL = 600.0
def _default_scene_count(session: Session) -> int:
    """Return the number of scenes that have at least one live playback source.

    The value is served from the module-level `_DEFAULT_COUNT_CACHE` while it is
    non-zero and younger than `_DEFAULT_COUNT_TTL` seconds; otherwise the count is
    recomputed with one query and stored back into the cache.
    """
    from time import monotonic

    cache = _DEFAULT_COUNT_CACHE
    started = monotonic()
    cached_total = cache["val"]
    still_fresh = (started - cache["ts"]) < _DEFAULT_COUNT_TTL
    # A cached 0 is treated as "nothing cached yet" and always triggers a recount.
    if cached_total and still_fresh:
        return cached_total

    has_live_playback = exists(
        select(1).where(
            PlaybackSource.scene_id == Scene.id,
            PlaybackSource.dead_at.is_(None),
        )
    )
    playable_ids = select(Scene.id).where(has_live_playback).subquery()
    fresh_total = session.execute(
        select(func.count()).select_from(playable_ids)
    ).scalar_one()

    cache.update(ts=started, val=fresh_total)
    return fresh_total
# The blacklists (performer/studio/tag) are usually EMPTY (self-hosted, single-user). Even so,
# 3 NOT EXISTS clauses were appended to EVERY filtered scene list and evaluated per-row —
# with a filter such as big-tag/has_playback the planner walks ~176k scenes, so these
# always-empty clauses cost ~3.4s (mega-tag "anal": 6.7s→3.3s after skipping them).
# We cache the emptiness (TTL 5 min); when someone adds a blacklist entry, the clauses
# come back within 5 minutes. See reference_scenes_list_perf / task #22.
# "checked" distinguishes "never queried" from a cached result; "val" is True when all
# blacklists were empty at time "ts" (a time.monotonic() reading).
_BLACKLIST_EMPTY_CACHE: dict = {"ts": 0.0, "val": False, "checked": False}
_BLACKLIST_EMPTY_TTL = 300.0
def _blacklists_empty(session: Session) -> bool:
    """Return True when ALL 3 blacklists are empty, so the NOT EXISTS clauses can be skipped.

    The answer is cached in `_BLACKLIST_EMPTY_CACHE` for `_BLACKLIST_EMPTY_TTL` seconds;
    a single query checks the three blacklist tables when the cache is cold or stale.
    """
    from time import monotonic

    from app.models.blacklist import (
        BlacklistedPerformer,
        BlacklistedStudio,
        BlacklistedTag,
    )

    cache = _BLACKLIST_EMPTY_CACHE
    started = monotonic()
    if cache["checked"] and (started - cache["ts"]) < _BLACKLIST_EMPTY_TTL:
        return cache["val"]

    any_entry = (
        exists(select(1).select_from(BlacklistedPerformer))
        | exists(select(1).select_from(BlacklistedStudio))
        | exists(select(1).select_from(BlacklistedTag))
    )
    all_empty = not session.execute(select(any_entry)).scalar_one()

    cache.update(ts=started, val=all_empty, checked=True)
    return all_empty
def _split_csv(raw: str | None) -> list[str]:
if not raw:
return []
return [s.strip() for s in raw.split(",") if s.strip()]
@router.get("", response_model=SceneListOut)
def list_scenes(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Wyszukiwanie po title_normalized (trgm)"),
    studio_slug: str | None = Query(default=None, description="DEPRECATED — użyj studio_slugs"),
    studio_slugs: str | None = Query(
        default=None, description="Comma-separated studio slugs (OR)"
    ),
    tags: str | None = Query(
        default=None,
        description="Comma-separated tag slugs (AND — scena musi mieć wszystkie wybrane tagi)",
    ),
    performer_ids: str | None = Query(
        default=None,
        description="Comma-separated performer UUIDs (AND — scena musi mieć wszystkich wybranych performerów)",
    ),
    has_playback: bool | None = Query(
        default=None, description="True: tylko sceny z ≥1 playback_source"
    ),
    has_animated_thumbnail: bool | None = Query(
        default=None,
        description="True: tylko sceny z ≥1 playback_source z animated_thumbnail_url (hold-to-preview)",
    ),
    min_duration_sec: int | None = Query(default=None, ge=0),
    max_duration_sec: int | None = Query(default=None, ge=0),
    released_within_days: int | None = Query(
        default=None, ge=1,
        description="Tylko sceny released w ostatnich N dniach",
    ),
    min_quality_p: int | None = Query(
        default=None, ge=1,
        description=(
            "Minimum quality (pixele wysokości — 2160 = 4K, 1080 = FullHD). Filtruje "
            "po PlaybackSource.quality (string typu '720p' / '1080p Full HD')."
        ),
    ),
    origin: str | None = Query(
        default=None,
        description=(
            "Filtruj po playback origin (np. 'tube:hqpornercom'). Substring match — "
            "'hqporner' złapie tube:hqpornercom. Diagnostyka per-hoster."
        ),
    ),
    include_stubs: bool = Query(
        default=False,
        description=(
            "False (default): ukrywa sceny-szkielety bez release_date, < 10min, "
            "z jedynym playback z hqporner (~7-min Brazzers trailer clipy zalewają katalog)."
        ),
    ),
    sort: str = Query(default="created_at", description="created_at|release_date|title|studio"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> SceneListOut:
    """Return one page of scenes, filtered and sorted per the query parameters.

    Every filter that is supplied is AND-ed onto a single `select(Scene)`. The page is
    fetched with `per_page + 1` rows so that `has_more` can be derived without a count
    query. `total` is the cached catalog count only for the completely unfiltered
    request; otherwise it is a lower bound (rows seen so far) and `total_capped`
    mirrors `has_more`. Items are built in light mode via `_build_scenes_out_batch`.

    Raises:
        HTTPException: 400 when `sort` is not in `_VALID_SORTS` or when
            `performer_ids` contains a value that is not a UUID.
    """
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")
    base = select(Scene)
    if q:
        # Case-insensitive substring match; `%` / `_` typed by the user act as LIKE wildcards.
        base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))
    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug:
        # The deprecated single-slug parameter is merged into the OR list.
        studio_slug_list.append(studio_slug)
    if studio_slug_list:
        base = base.where(
            Scene.studio_id.in_(
                select(Studio.id).where(Studio.slug.in_(studio_slug_list))
            )
        )
    tag_slug_list = _split_csv(tags)
    # AND between tags: a scene must have ALL selected tags. Each slug → its own
    # exists() — ticking more filters narrows the results, matching user intuition.
    #
    # PERF (2026-06-07): we resolve slug→tag_id in the application and filter by a LITERAL
    # tag_id (NOT a JOIN on Tag.slug). With a literal the planner knows the tag's cardinality
    # from statistics (MCV) → for popular tags (blowjob ~273k scenes) it picks an index walk
    # over ix_scenes_created_at_desc instead of materializing all scene_tags. The slug JOIN
    # hid the tag_id from the planner → it used the average (8.4M/11541≈726) → bad plan
    # (4-12s). With the literal: ~20ms. See also _build... light mode.
    if tag_slug_list:
        id_by_slug = dict(
            session.execute(
                select(Tag.slug, Tag.id).where(Tag.slug.in_(tag_slug_list))
            ).all()
        )
        for slug in tag_slug_list:
            tag_id = id_by_slug.get(slug)
            if tag_id is None:
                base = base.where(false())  # unknown slug → no results
                break
            base = base.where(
                exists(
                    select(1)
                    .select_from(SceneTag)
                    .where(SceneTag.scene_id == Scene.id, SceneTag.tag_id == tag_id)
                )
            )
    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # AND between performers (analogous to tags).
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1)
                    .select_from(ScenePerformer)
                    .where(
                        ScenePerformer.scene_id == Scene.id,
                        ScenePerformer.performer_id == pid,
                    )
                )
            )
    if has_playback is True:
        # Only scenes with at least one LIVE playback_source.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    elif has_playback is False:
        # Only scenes with NO live playback_source.
        base = base.where(
            ~exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    if origin:
        # Substring match on origin — 'hqporner' catches 'tube:hqpornercom'.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.origin.ilike(f"%{origin}%"),
                )
            )
        )
    # Blacklists — global exclusions. If a scene has ANY blacklisted performer,
    # belongs to a blacklisted studio, or has ANY blacklisted tag → it is excluded.
    # Skipped when all 3 blacklists are empty (the typical single-user state) — these
    # NOT EXISTS are evaluated per-row over ~176k scenes for a mega-tag and cost ~3.4s for nothing.
    if not _blacklists_empty(session):
        from app.models.blacklist import (
            BlacklistedPerformer,
            BlacklistedStudio,
            BlacklistedTag,
        )
        base = base.where(
            ~exists(
                select(1)
                .select_from(ScenePerformer)
                .join(BlacklistedPerformer, BlacklistedPerformer.performer_id == ScenePerformer.performer_id)
                .where(ScenePerformer.scene_id == Scene.id)
            )
        )
        base = base.where(
            ~Scene.studio_id.in_(select(BlacklistedStudio.studio_id))
        )
        base = base.where(
            ~exists(
                select(1)
                .select_from(SceneTag)
                .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
                .where(SceneTag.scene_id == Scene.id)
            )
        )
    if has_animated_thumbnail:
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.animated_thumbnail_url.isnot(None),
                )
            )
        )
    if min_duration_sec is not None:
        base = base.where(Scene.duration_sec >= min_duration_sec)
    if max_duration_sec is not None:
        base = base.where(Scene.duration_sec <= max_duration_sec)
    if released_within_days is not None:
        from datetime import date, timedelta
        cutoff = date.today() - timedelta(days=released_within_days)
        base = base.where(Scene.release_date >= cutoff)
    if min_quality_p is not None:
        # PlaybackSource.quality is a free-form string — we look for the digits in it ('1080p',
        # '1080p Full HD', '2160p'). Heuristic: it is enough that the scene has ONE live
        # playback whose quality number is >= min. '4K'/'UHD' are aliased to 2160.
        from sqlalchemy import Integer, cast, or_
        numeric_q = cast(
            func.coalesce(func.substring(PlaybackSource.quality, r"\d+"), "0"),
            Integer,
        )
        conds = [numeric_q >= min_quality_p]
        if min_quality_p <= 2160:
            conds.append(PlaybackSource.quality.ilike("%4k%"))
            conds.append(PlaybackSource.quality.ilike("%uhd%"))
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.quality.isnot(None),
                    or_(*conds),
                )
            )
        )
    if not include_stubs:
        # Stub scene heuristic: a tube-only scene WITHOUT release_date AND WITHOUT a canonical
        # (TPDB/StashDB) ref AND WITHOUT any ScenePerformer link. ScenePerformer is added
        # by the continuous worker (search-by-name → forces the link), so a per-performer
        # search result is NEVER a stub. This only filters anonymous tube-only scenes
        # from the newUrl/categories ingest that were never linked to a performer.
        # NOTE(review): the `include_stubs` parameter description mentions duration/hqporner
        # criteria that this code does not apply — confirm which one is current.
        canonical_exists = exists(
            select(1)
            .select_from(SceneExternalRef)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id == Scene.id)
            .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
        )
        has_performer = exists(
            select(1).where(ScenePerformer.scene_id == Scene.id)
        )
        # NOT a stub when: has a canonical ref OR has a release_date OR has a performer
        base = base.where(
            Scene.release_date.is_not(None) | canonical_exists | has_performer
        )
    _is_pure_default = (
        not include_stubs and not q and not studio_slug_list and not tag_slug_list
        and not perf_id_strings and origin is None and has_playback is None
        and not has_animated_thumbnail and min_duration_sec is None
        and max_duration_sec is None and released_within_days is None
        and min_quality_p is None
    )
    # Count strategy:
    #  - PURE default: cached full catalog counter (TTL 10 min).
    #  - FILTERED: we do NOT count exactly. A bounded count over the EXISTS filters was
    #    the dominant cost (~4s for has_playback / min_duration / a big tag) and the plan
    #    was UNSTABLE (literal LIMIT + count-over-PK helped in some cases, but the planner
    #    still sometimes scans the whole set instead of stopping early). Mobile paginates
    #    by `has_more` (per_page+1 fetch), NOT by `total` — `total` is only the "N+"
    #    counter in the UI. We derive it from has_more AFTER the fetch (see below): a lower
    #    bound + a "there is more" flag. Removes the whole count cost from every filtered list.
    total_capped = False
    total: int | None = _default_scene_count(session) if _is_pure_default else None
    # Sort: always tie-break by created_at desc for deterministic pagination.
    if sort == "release_date":
        ordered = base.order_by(
            Scene.release_date.desc().nullslast(), Scene.created_at.desc()
        )
    elif sort == "title":
        ordered = base.order_by(Scene.title_normalized.asc(), Scene.created_at.desc())
    elif sort == "studio":
        # Scenes without a studio go last; within a studio — newest first.
        ordered = (
            base.outerjoin(Studio, Studio.id == Scene.studio_id)
            .order_by(
                Studio.name_normalized.asc().nullslast(),
                Scene.release_date.desc().nullslast(),
                Scene.created_at.desc(),
            )
        )
    else:  # created_at
        ordered = base.order_by(
            Scene.created_at.desc(), Scene.release_date.desc().nullslast()
        )
    # Fetch per_page+1 — the presence of the (per_page+1)-th row = there is a next page.
    # This is the source of truth for pagination (mobile getNextPageParam), independent of
    # the bounded `total`. The extra row is cut off before serialization.
    # LIMIT/OFFSET are literal (NOT bound params) — see above: a parameterized LIMIT
    # breaks early termination and with EXISTS filters the planner does gather-all+sort (seconds)
    # instead of a limit-aware index walk over `ix_scenes_created_at_desc`. page/per_page are
    # validated ints (Query ge=1, le=200), so literal_column is safe.
    _off = (page - 1) * per_page
    rows = (
        session.execute(
            ordered.offset(literal_column(str(_off))).limit(literal_column(str(per_page + 1)))
        )
        .scalars()
        .all()
    )
    has_more = len(rows) > per_page
    rows = rows[:per_page]
    # Filtered lists: total = lower bound from the rows seen so far, and
    # total_capped=has_more gives the UI "N+" (there is a next page). No separate count query.
    if total is None:
        total = (page - 1) * per_page + len(rows)
        total_capped = has_more
    items = _build_scenes_out_batch(session, list(rows), light=True)
    return SceneListOut(
        items=items,
        total=total,
        page=page,
        per_page=per_page,
        has_more=has_more,
        total_capped=total_capped,
    )
@router.get("/{scene_id}", response_model=SceneOut)
def get_scene(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneOut:
    """Return the full detail representation of one scene.

    Raises:
        HTTPException: 404 when no scene with `scene_id` exists.
    """
    if (found := session.get(Scene, scene_id)) is not None:
        return _build_scene_out(session, found)
    raise HTTPException(status_code=404, detail="scene not found")
def _needs_proxy(url: str) -> bool:
    """Tell whether a thumbnail URL must be routed through the backend image proxy.

    All thumbnails from playback_sources are proxied by the backend: most porn-tube
    CDNs require a Referer (hqporner, mypornerleak/58img, other sxyprn/eporner CDNs)
    and expo-image does not send one. Self-hosted or backend-internal URLs (those
    starting with `/`, including already-proxied `/proxy/...` paths) are skipped.
    """
    if url.startswith("/"):
        # Backend-relative path — nothing to proxy.
        return False
    return url.startswith("http")
def _wrap_image_proxy(url: str, referer: str) -> str:
    """Rewrite a thumbnail URL to `/proxy/img/{token}/img.{ext}`.

    The client never needs to know the Referer secret — the backend injects it.
    The token is requested with a 30-day TTL (thumbnails are stable) and a 7-day
    stable bucket; per the original author's note the bucket keeps the proxied URL
    identical for a week so expo-image hits its disk cache instead of re-downloading
    thumbnails on every list fetch / app start.

    The file extension of the source URL (query string ignored, "jpg" when absent)
    is kept in the path so the HTTP Content-Type can be recognized.
    """
    from os.path import splitext

    from app.api.stream_proxy import make_token

    day_sec = 24 * 3600
    signed = make_token(url, referer, ttl_sec=30 * day_sec, stable_bucket_sec=7 * day_sec)
    without_query, _, _ = url.partition("?")
    suffix = splitext(without_query)[1].lstrip(".")
    return f"/proxy/img/{signed}/img.{suffix or 'jpg'}"
def _build_scenes_out_batch(
    session: Session, scenes: list[Scene], *, light: bool = False
) -> list[SceneOut]:
    """Batch-fetch every relation for N scenes in at most 7 queries (instead of 7×N).

    Removes the N+1 of `_build_scene_out` from scene lists — `/scenes?per_page=24` went
    from ~9.6s to <500ms. A single scene (`/scenes/{id}`) still uses `_build_scene_out`
    because the batch overhead makes no sense for N=1.

    `light=True` (lists/grid): skips `tags` and `external_refs` (the SceneTile does NOT
    use them, and SceneDetail re-fetches the full scene separately) and slims
    `playback_sources` down to 1 entry carrying only the thumbnails (the tile reads only
    thumbnail_url/animated_thumbnail_url). Less DB work + smaller payload + faster
    client-side parse (perf 2026-06-07).

    Args:
        session: open SQLAlchemy session used for all lookups.
        scenes: scenes to serialize; the output preserves this order.
        light: build the slim list representation described above.

    Returns:
        One `SceneOut` per input scene; an empty list for empty input.
    """
    from collections import defaultdict

    if not scenes:
        return []
    scene_ids = [s.id for s in scenes]
    studio_ids = list({s.studio_id for s in scenes if s.studio_id is not None})

    # 1) Studios
    studios_by_id: dict = {}
    if studio_ids:
        for st in session.execute(
            select(Studio).where(Studio.id.in_(studio_ids))
        ).scalars():
            studios_by_id[st.id] = st

    # 2) Performers
    perf_rows = session.execute(
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id.in_(scene_ids))
        .order_by(ScenePerformer.position.asc().nullslast())
    ).all()
    performers_by_scene: dict = defaultdict(list)
    for sp, p in perf_rows:
        performers_by_scene[sp.scene_id].append(
            PerformerOut(
                id=p.id,
                canonical_name=p.canonical_name,
                slug=p.slug,
                gender=p.gender.value if p.gender else None,
                as_alias=sp.as_alias,
            )
        )

    # 3) Tags + 4) External refs — the list tile does not use them; skipped in light mode
    # (SceneDetail re-fetches the full scene through /scenes/{id}).
    tags_by_scene: dict = defaultdict(list)
    refs_by_scene: dict = defaultdict(list)
    if not light:
        tag_rows = session.execute(
            select(SceneTag.scene_id, Tag)
            .join(Tag, Tag.id == SceneTag.tag_id)
            .where(SceneTag.scene_id.in_(scene_ids))
        ).all()
        for sid, t in tag_rows:
            tags_by_scene[sid].append(TagOut.model_validate(t))
        ref_rows = session.execute(
            select(SceneExternalRef, Source)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id.in_(scene_ids))
        ).all()
        for ref, src in ref_rows:
            refs_by_scene[ref.scene_id].append(
                ExternalRefOut(
                    source=src.name,
                    external_id=ref.external_id,
                    url=ref.url,
                    last_seen=ref.last_seen,
                )
            )

    # 5) Playback sources. Light mode: thumbnails only (one entry per scene) — the tile
    # reads only playback_sources[].thumbnail_url / animated_thumbnail_url.
    pb_by_scene: dict = defaultdict(list)
    if light:
        pb_light = session.execute(
            select(
                PlaybackSource.scene_id,
                PlaybackSource.thumbnail_url,
                PlaybackSource.animated_thumbnail_url,
                PlaybackSource.page_url,
            )
            .where(
                PlaybackSource.scene_id.in_(scene_ids),
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        ).all()
        # First thumbnail + first animated thumbnail per scene (1 slim entry). Each is
        # stored together with the page_url of the row it came from.
        thumb_by_scene: dict = {}
        anim_by_scene: dict = {}
        for sid, thumb, anim, page_url in pb_light:
            if sid not in thumb_by_scene and thumb:
                thumb_by_scene[sid] = (thumb, page_url)
            if sid not in anim_by_scene and anim:
                anim_by_scene[sid] = (anim, page_url)
        for sid in scene_ids:
            t = thumb_by_scene.get(sid)
            a = anim_by_scene.get(sid)
            if not t and not a:
                continue
            t_url, t_ref = t if t else (None, None)
            a_url, a_ref = a if a else (None, None)
            # Sign each image with the page_url of ITS OWN playback row: the static and
            # the animated thumbnail may come from different sources, and the proxy
            # passes that page_url on as the Referer (same as the full-mode branch below).
            if t_url and _needs_proxy(t_url):
                t_url = _wrap_image_proxy(t_url, t_ref)
            if a_url and _needs_proxy(a_url):
                a_url = _wrap_image_proxy(a_url, a_ref)
            # id/origin/page_url are required by the schema but unused by the tile
            # (SceneDetail re-fetches the full sources) — dummy sentinel values.
            pb_by_scene[sid].append(
                PlaybackSourceOut(
                    id=uuid.UUID(int=0), origin="", page_url="",
                    thumbnail_url=t_url, animated_thumbnail_url=a_url,
                )
            )
    else:
        pb_rows = session.execute(
            select(PlaybackSource)
            .where(
                PlaybackSource.scene_id.in_(scene_ids),
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        ).scalars().all()
        for p in pb_rows:
            pb_out = PlaybackSourceOut.model_validate(p)
            if pb_out.thumbnail_url and _needs_proxy(pb_out.thumbnail_url):
                pb_out.thumbnail_url = _wrap_image_proxy(pb_out.thumbnail_url, p.page_url)
            if pb_out.animated_thumbnail_url and _needs_proxy(pb_out.animated_thumbnail_url):
                pb_out.animated_thumbnail_url = _wrap_image_proxy(
                    pb_out.animated_thumbnail_url, p.page_url
                )
            pb_by_scene[p.scene_id].append(pb_out)

    # 6) Progress
    progress_by_scene: dict = {}
    for prog in session.execute(
        select(ScenePlayProgress).where(ScenePlayProgress.scene_id.in_(scene_ids))
    ).scalars():
        progress_by_scene[prog.scene_id] = prog

    # 7) Favorites
    fav_scene_ids: set = set(
        session.execute(
            select(FavoriteScene.scene_id).where(
                FavoriteScene.scene_id.in_(scene_ids)
            )
        ).scalars()
    )

    results: list[SceneOut] = []
    for scene in scenes:
        studio_out = None
        if scene.studio_id is not None and scene.studio_id in studios_by_id:
            studio_out = StudioOut.model_validate(studios_by_id[scene.studio_id])
        progress = progress_by_scene.get(scene.id)
        results.append(
            SceneOut(
                id=scene.id,
                title=scene.title,
                slug=scene.slug,
                release_date=scene.release_date,
                duration_sec=scene.duration_sec,
                description=scene.description,
                code=scene.code,
                director=scene.director,
                studio=studio_out,
                performers=performers_by_scene.get(scene.id, []),
                tags=tags_by_scene.get(scene.id, []),
                external_refs=refs_by_scene.get(scene.id, []),
                playback_sources=pb_by_scene.get(scene.id, []),
                created_at=scene.created_at,
                last_played_at=progress.last_played_at if progress else None,
                finished=progress.finished if progress else False,
                position_sec=progress.position_sec if progress else 0,
                is_favorite=scene.id in fav_scene_ids,
            )
        )
    return results
def _build_scene_out(session: Session, scene: Scene) -> SceneOut:
    """Assemble the full `SceneOut` for a single scene (detail view).

    Loads the studio, performers (ordered by position, NULLs last), tags, external
    refs, live playback sources, play progress and favorite flag. Playback sources
    are collapsed to one per origin and then ordered so that natively resolvable
    sources come before WebView-fallback ones.
    """
    studio_row = session.get(Studio, scene.studio_id) if scene.studio_id is not None else None
    studio_out: StudioOut | None = (
        StudioOut.model_validate(studio_row) if studio_row is not None else None
    )

    cast_stmt = (
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id == scene.id)
        .order_by(ScenePerformer.position.asc().nullslast())
    )
    performers_out: list[PerformerOut] = [
        PerformerOut(
            id=person.id,
            canonical_name=person.canonical_name,
            slug=person.slug,
            gender=person.gender.value if person.gender else None,
            as_alias=link.as_alias,
        )
        for link, person in session.execute(cast_stmt).all()
    ]

    tag_stmt = (
        select(Tag)
        .join(SceneTag, SceneTag.tag_id == Tag.id)
        .where(SceneTag.scene_id == scene.id)
    )
    tags_out = [
        TagOut.model_validate(tag_row)
        for tag_row in session.execute(tag_stmt).scalars().all()
    ]

    ref_stmt = (
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id == scene.id)
    )
    refs_out = []
    for ext_ref, origin_source in session.execute(ref_stmt).all():
        refs_out.append(
            ExternalRefOut(
                source=origin_source.name,
                external_id=ext_ref.external_id,
                url=ext_ref.url,
                last_seen=ext_ref.last_seen,
            )
        )

    live_stmt = (
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id == scene.id,
            PlaybackSource.dead_at.is_(None),  # hide dead links
        )
        .order_by(PlaybackSource.origin.asc())
    )
    live_sources = session.execute(live_stmt).scalars().all()

    # Collapse sources sharing the same origin (hoster). A merged scene often aggregates
    # several uploads from ONE tube (re-encodes / 4K versions: bug-report aa79a995 "2 links,
    # both to porntrex" = the same scene std+4K) — in the UI these are indistinguishable
    # links to the same hoster (resolved by the same extractor). We keep the single best one
    # per origin: prefer a duration matching the scene (a real match) → any duration →
    # the first one (stable, the query is origin-asc). Dead ones are already filtered out.
    def _pick_rank(src: PlaybackSource) -> tuple[int, int]:
        has_duration = bool(src.duration_sec)
        matches_scene = (
            bool(scene.duration_sec)
            and has_duration
            and abs(src.duration_sec - scene.duration_sec) <= 5
        )
        return (0 if matches_scene else 1, 0 if has_duration else 1)

    winners: dict[str, PlaybackSource] = {}
    for candidate in live_sources:
        bucket = candidate.origin or ""
        incumbent = winners.get(bucket)
        # Keep the incumbent unless the candidate ranks strictly better.
        if incumbent is not None and _pick_rank(incumbent) <= _pick_rank(candidate):
            continue
        winners[bucket] = candidate

    playback_out: list[PlaybackSourceOut] = []
    for src in winners.values():
        item = PlaybackSourceOut.model_validate(src)
        # Wrap thumbnail URLs through the backend image proxy when the CDN requires a
        # Referer (hqporner — fastporndelivery returns 403 without the Referer header and
        # expo-image does not send it by default). The token has a 30-day TTL because
        # thumbnails are stable.
        for field in ("thumbnail_url", "animated_thumbnail_url"):
            raw_url = getattr(item, field)
            if raw_url and _needs_proxy(raw_url):
                setattr(item, field, _wrap_image_proxy(raw_url, src.page_url))
        playback_out.append(item)

    # Rank natively-resolved sources BEFORE the WebView fallback (IP-bound/ad-heavy: fpoxxx,
    # pornxpph, pornhub...). The query was alphabetical by origin, so e.g. fpoxxx-WebView
    # showed up before a working freshporno (bug-report 2026-06-07). Stable sort:
    # native (0) → fallback (1), tie-break by origin.
    from app.extractors import is_vps_blocked_fallback

    def _fallback_rank(origin: str | None) -> int:
        if not origin:
            return 1
        _, colon, after_colon = origin.partition(":")
        site = after_colon if colon else origin
        return 1 if is_vps_blocked_fallback(site) else 0

    playback_out = sorted(
        playback_out, key=lambda entry: (_fallback_rank(entry.origin), entry.origin or "")
    )

    progress = session.get(ScenePlayProgress, scene.id)
    favorite_row = session.get(FavoriteScene, scene.id)
    return SceneOut(
        id=scene.id,
        title=scene.title,
        slug=scene.slug,
        release_date=scene.release_date,
        duration_sec=scene.duration_sec,
        description=scene.description,
        code=scene.code,
        director=scene.director,
        studio=studio_out,
        performers=performers_out,
        tags=tags_out,
        external_refs=refs_out,
        playback_sources=playback_out,
        created_at=scene.created_at,
        last_played_at=progress.last_played_at if progress else None,
        finished=progress.finished if progress else False,
        position_sec=progress.position_sec if progress else 0,
        is_favorite=favorite_row is not None,
    )
@router.delete("/{scene_id}/tags/{tag_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_tag_from_scene(
    scene_id: uuid.UUID,
    tag_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene↔tag link (e.g. the user decided the tag is wrong for this scene).

    Idempotent: a missing link counts as success. The Tag itself is not deleted —
    other scenes may use it; it stays in the tag dictionary.
    """
    lookup = (
        select(SceneTag)
        .where(SceneTag.scene_id == scene_id)
        .where(SceneTag.tag_id == tag_id)
    )
    link = session.execute(lookup).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
@router.delete(
    "/{scene_id}/performers/{performer_id}", status_code=status.HTTP_204_NO_CONTENT
)
def remove_performer_from_scene(
    scene_id: uuid.UUID,
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene↔performer link (a false-match dedup attached the wrong person).

    Idempotent. The Performer itself stays. Useful e.g. when a fuzzy match on the
    alias "Bella" pulled Anna Bella scenes under Bad Bella, or when Miss Teela on xnxx
    was assigned to scenes she is not in (reports from 2026-05-10).
    """
    lookup = (
        select(ScenePerformer)
        .where(ScenePerformer.scene_id == scene_id)
        .where(ScenePerformer.performer_id == performer_id)
    )
    link = session.execute(lookup).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
class SceneHideOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/hide."""
    # Id of the scene that was hidden (echoed from the path parameter).
    scene_id: uuid.UUID
    # Number of playback sources that were still live and got marked dead by this call.
    playback_marked_dead: int
@router.post("/{scene_id}/hide", response_model=SceneHideOut)
def hide_scene(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneHideOut:
    """Hide a scene (user long-press → "remove").

    Marks every live playback_source of the scene as dead, so the scene drops out of
    the lists (has_playback=false). Reversible in the DB (dead_at). The scene row is
    not deleted — refs/dedup are kept, it only disappears from the UI.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from datetime import UTC, datetime

    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")

    live_stmt = select(PlaybackSource).where(
        PlaybackSource.scene_id == scene_id,
        PlaybackSource.dead_at.is_(None),
    )
    live_sources = session.execute(live_stmt).scalars().all()
    hidden_at = datetime.now(UTC)
    for source in live_sources:
        source.dead_at, source.dead_reason = hidden_at, "user hid scene (long-press)"
    session.commit()
    return SceneHideOut(scene_id=scene_id, playback_marked_dead=len(live_sources))
class SceneMergeOut(BaseModel):
    """Response body of POST /scenes/{keep_id}/merge/{drop_id}."""
    # Id of the scene that survives the merge (echoed from the path).
    keep_id: uuid.UUID
    # Id of the duplicate that was merged into `keep_id` (echoed from the path).
    dropped_id: uuid.UUID
@router.post("/{keep_id}/merge/{drop_id}", response_model=SceneMergeOut)
def merge_duplicate_scene(
    keep_id: uuid.UUID,
    drop_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneMergeOut:
    """Merge `drop_id` into `keep_id` (user long-press → "mark duplicate" → pick the other scene).

    Moves refs/performers/tags/fingerprints/playback (scene_merge) and deletes `drop`.
    keep = the scene the user is holding (it stays), drop = the indicated duplicate.

    Raises:
        HTTPException: 400 when both ids are equal or `merge_scenes` raises
            `MergeError`; 404 when either scene does not exist.
    """
    from app.resolve.scene_merge import MergeError, merge_scenes

    if keep_id == drop_id:
        raise HTTPException(status_code=400, detail="keep_id == drop_id")
    for wanted_id in (keep_id, drop_id):
        if session.get(Scene, wanted_id) is None:
            raise HTTPException(status_code=404, detail="scene not found")

    try:
        merge_scenes(
            session,
            keep_id=keep_id,
            drop_id=drop_id,
            resolved_by="user_long_press_duplicate",
        )
    except MergeError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    else:
        session.commit()
    return SceneMergeOut(keep_id=keep_id, dropped_id=drop_id)
class EnrichTagsOut(BaseModel):
    """Response body of POST /scenes/{scene_id}/enrich-tags."""
    # Id of the scene that was enriched (echoed from the path parameter).
    scene_id: uuid.UUID
    # Number of scene_tags rows actually inserted by this call (conflicts are not counted).
    added: int
    # Site tag of the tube source that was fetched; None when no usable source was found.
    tube_used: str | None
    # All tag names extracted from the page (not only the newly added ones); empty on failure.
    tags: list[str]
@router.post("/{scene_id}/enrich-tags", response_model=EnrichTagsOut)
def enrich_tags_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichTagsOut:
    """Fetch the page HTML of a tube playback_source of this scene and scrape its tags.

    Tags (categories/tags) missing from scene_tags are added.

    Mobile calls this when SceneDetail opens if the scene has 0 tags AND has a tube
    source with a supported extractor (porntrex/youporn/xvideos/xnxx/redtube/
    xhamster/eporner).

    Idempotent: calling again with the same tags does nothing (UNIQUE PK on
    scene_tags). The concrete tube source is picked by a priority list (mainstream
    tubes are more reliable than aggregators).

    Returns:
        `EnrichTagsOut` with `added=0` and empty `tags` when no usable source exists,
        the fetch fails, or no tags are extracted.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS, extract_tags
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Priority: mainstream tubes (rich metadata) > niche (fewer tags or garbage).
    PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
                "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
    sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    ).scalars().all()

    # Pick the first source following the priority list (outer loop = priority order,
    # inner loop = query order).
    chosen: PlaybackSource | None = next(
        (src for site in PRIORITY for src in sources if src.origin == f"tube:{site}"),
        None,
    )
    if chosen is None:
        # Fallback: any tube source that has an extractor. `origin` is treated as
        # nullable elsewhere in this module, so guard before calling str methods on it.
        chosen = next(
            (
                src
                for src in sources
                if src.origin
                and src.origin.startswith("tube:")
                and src.origin.split(":", 1)[1] in EXTRACTORS
            ),
            None,
        )
    if chosen is None:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=None, tags=[])

    sitetag = chosen.origin.split(":", 1)[1]
    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except (TubePageError, Exception) as e:
        # Deliberately broad: enrichment is best-effort, so any fetch failure is
        # logged and reported as "nothing added" instead of failing the request.
        log.warning("enrich-tags fetch failed for %s: %s", chosen.page_url, e)
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    tag_names = extract_tags(sitetag, r.text)
    if not tag_names:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    # Upsert: for every tag create/find the Tag, then add the SceneTag idempotently.
    # We use PostgreSQL INSERT ... ON CONFLICT DO NOTHING instead of ORM session.add()
    # because `resolve_tag` does session.flush() inside the loop, emitting the pending
    # SceneTag INSERTs from previous iterations — when 2 concurrent enrich-tags calls
    # collide on the same (scene_id, tag_id), the second flush gets a UniqueViolation
    # (GOON-H, 4 events in 10h despite the earlier seen_tag_ids fix). ON CONFLICT skips
    # silently.
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    added = 0
    seen_tag_ids: set = set()
    for name in tag_names:
        norm = NormalizedTag(name=name, slug=slugify(name), external_id=None)
        resolved = resolve_tag(session, norm=norm)
        if resolved is None or resolved.id in seen_tag_ids:
            continue
        seen_tag_ids.add(resolved.id)
        stmt = (
            pg_insert(SceneTag.__table__)
            .values(scene_id=scene_id, tag_id=resolved.id, source_id=None)
            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
        )
        result = session.execute(stmt)
        # rowcount == 1 when actually inserted, 0 when skipped by ON CONFLICT
        if result.rowcount and result.rowcount > 0:
            added += 1
    session.commit()
    return EnrichTagsOut(scene_id=scene_id, added=added, tube_used=sitetag, tags=tag_names)
class EnrichDurationOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-duration.
    # Scene the request was made for.
    scene_id: uuid.UUID
    # Duration stored on the scene after the call; None when the scene had no
    # duration and no tube page yielded one.
    duration_sec: int | None
    # Site tag (the part of PlaybackSource.origin after "tube:") of the page
    # the duration was extracted from; None when no page yielded one in this
    # call (duration already known, or nothing found).
    tube_used: str | None
@router.post("/{scene_id}/enrich-duration", response_model=EnrichDurationOut)
def enrich_duration_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichDurationOut:
    """Fill in a missing ``scene.duration_sec`` from one of the scene's tube pages.

    Idempotent: when the scene already has a duration, that value is returned
    and no page is fetched. Otherwise every live (``dead_at IS NULL``)
    ``tube:*`` playback source is fetched in turn; the first page from which a
    positive duration can be extracted wins. The value is stored on the scene
    and, when the source itself had none, on that playback source as well.

    Per the original notes: the mobile client calls this when opening
    SceneDetail for a scene without a duration, and duration is the strongest
    single signal for dedup scoring.

    Args:
        scene_id: Primary key of the scene to enrich.
        session: Request-scoped database session.

    Returns:
        The resulting duration (or ``None``) and the tube it was read from.

    Raises:
        HTTPException: 404 when no scene with ``scene_id`` exists.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.duration_extract import extract_duration_sec
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Already enriched: answer without touching the network.
    if scene.duration_sec is not None:
        return EnrichDurationOut(
            scene_id=scene_id,
            duration_sec=scene.duration_sec,
            tube_used=None,
        )

    live_tube_sources = select(PlaybackSource).where(
        PlaybackSource.scene_id == scene_id,
        PlaybackSource.dead_at.is_(None),
        PlaybackSource.origin.like("tube:%"),
    )
    for candidate in session.execute(live_tube_sources).scalars().all():
        # Best effort: any fetch problem just moves on to the next source.
        try:
            page = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            page.raise_for_status()
        except (TubePageError, Exception) as exc:
            log.debug("enrich-duration fetch failed for %s: %s", candidate.page_url, exc)
            continue

        seconds = extract_duration_sec(page.text)
        if not (seconds is not None and seconds > 0):
            continue

        scene.duration_sec = seconds
        # Mirror the value onto the playback source too (kept for a possible
        # future per-source duration mismatch check).
        if candidate.duration_sec is None:
            candidate.duration_sec = seconds
        session.commit()

        # "tube:<site>" -> "<site>"; None if the origin has no colon at all.
        _, colon, site = candidate.origin.partition(":")
        return EnrichDurationOut(
            scene_id=scene_id,
            duration_sec=seconds,
            tube_used=site if colon else None,
        )

    # No source produced a usable duration.
    return EnrichDurationOut(scene_id=scene_id, duration_sec=None, tube_used=None)
class EnrichStudioOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-studio.
    # Scene the request was made for.
    scene_id: uuid.UUID
    # Studio linked to the scene after the call; None when none could be determined.
    studio_id: uuid.UUID | None
    # Name of that studio; None when there is no studio (or its row is missing).
    studio_name: str | None
    # "pornhatcom" whenever a pornhat page was selected for scraping (even if
    # the scrape failed); None when the scene already had a studio or has no
    # live pornhat source.
    tube_used: str | None
@router.post("/{scene_id}/enrich-studio", response_model=EnrichStudioOut)
def enrich_studio_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichStudioOut:
    """Extract the studio (DVD/series) from the scene's pornhat page.

    Pornhat marks the studio with
    ``class="info-video js-ajax-dvd" data-setup='{"title": "Adult Time", ...}'``.
    Only pornhat is handled for now; other tubes can be added once their
    markup pattern is known.

    Idempotent: a scene that already has ``studio_id`` is returned unchanged
    and no page is fetched. Otherwise the scene's live ``tube:pornhatcom``
    source is fetched, the ``data-setup`` JSON is parsed, and the studio is
    looked up by slug, then by name, and created if absent.

    The scraped JSON is untrusted: anything other than an object with a
    non-empty string ``title`` is treated as "no studio found" rather than
    being allowed to raise. Studio creation runs inside a SAVEPOINT so that a
    concurrent request inserting the same studio results in reusing the
    winner's row instead of an unhandled ``IntegrityError``.

    Args:
        scene_id: Primary key of the scene to enrich.
        session: Request-scoped database session.

    Returns:
        The studio now linked to the scene, or ``None`` fields when no studio
        could be determined. ``tube_used`` is ``"pornhatcom"`` whenever a
        pornhat page was selected, even if scraping it failed.

    Raises:
        HTTPException: 404 when no scene with ``scene_id`` exists.
        IntegrityError: when inserting the studio fails and no matching studio
            can be found afterwards (i.e. the conflict is not a plain race).
    """
    import json as _json

    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.models.playback_source import PlaybackSource
    from app.models.studio import Studio
    from app.normalize.text import slugify

    def _no_studio(tube_used: str | None) -> EnrichStudioOut:
        # Shared "nothing found" response.
        return EnrichStudioOut(
            scene_id=scene_id, studio_id=None, studio_name=None, tube_used=tube_used
        )

    def _lookup_studio(slug: str, name: str) -> Studio | None:
        # Slug is the primary key for matching; fall back to the exact name.
        found = session.execute(
            select(Studio).where(Studio.slug == slug)
        ).scalar_one_or_none()
        if found is None:
            found = session.execute(
                select(Studio).where(Studio.name == name)
            ).scalar_one_or_none()
        return found

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Already enriched: report the current studio without any network I/O.
    if scene.studio_id is not None:
        existing = session.get(Studio, scene.studio_id)
        return EnrichStudioOut(
            scene_id=scene_id,
            studio_id=scene.studio_id,
            studio_name=existing.name if existing else None,
            tube_used=None,
        )

    chosen = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin == "tube:pornhatcom",
        )
    ).scalars().first()
    if chosen is None:
        return _no_studio(None)

    # Best effort: any fetch problem is logged and reported as "no studio".
    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except (TubePageError, Exception) as e:
        log.warning("enrich-studio fetch failed for %s: %s", chosen.page_url, e)
        return _no_studio("pornhatcom")

    m = re.search(
        r"class=\"info-video js-ajax-dvd[^\"]*\"[^>]*data-setup='([^']+)'",
        r.text, re.IGNORECASE,
    )
    if m is None:
        return _no_studio("pornhatcom")
    try:
        data = _json.loads(m.group(1))
    except _json.JSONDecodeError:
        return _no_studio("pornhatcom")

    # Valid JSON is not necessarily an object, and its values are not
    # necessarily strings; reject unexpected shapes instead of raising.
    if not isinstance(data, dict):
        return _no_studio("pornhatcom")
    raw_title = data.get("title")
    name = raw_title.strip() if isinstance(raw_title, str) else ""
    if not name:
        return _no_studio("pornhatcom")
    raw_dir = data.get("dir")
    slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or slugify(name)

    studio = _lookup_studio(slug, name)
    if studio is None:
        new_studio = Studio(name=name, slug=slug)
        try:
            # SAVEPOINT: a failed INSERT only rolls back the studio row and
            # leaves the surrounding transaction usable.
            with session.begin_nested():
                session.add(new_studio)
                session.flush()
        except IntegrityError:
            # Most likely a concurrent request created the same studio between
            # our SELECT and INSERT; reuse its row.
            studio = _lookup_studio(slug, name)
            if studio is None:
                raise
        else:
            studio = new_studio

    scene.studio_id = studio.id
    session.commit()
    return EnrichStudioOut(
        scene_id=scene_id, studio_id=studio.id, studio_name=studio.name, tube_used="pornhatcom"
    )
class EnrichThumbOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-thumbnail.
    # Scene the request was made for.
    scene_id: uuid.UUID
    # Thumbnail URL known for the scene after the call; None when none was found.
    thumbnail_url: str | None
    # Site tag (the part of PlaybackSource.origin after "tube:") of the page
    # the thumbnail was extracted from; None when no page yielded one in this call.
    tube_used: str | None
    # Number of playback sources whose thumbnail_url was filled in by this call.
    sources_updated: int
@router.post("/{scene_id}/enrich-thumbnail", response_model=EnrichThumbOut)
def enrich_thumbnail_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichThumbOut:
    """Make sure every live tube playback source of a scene has a thumbnail.

    If one of the scene's live ``tube:*`` sources already has a
    ``thumbnail_url``, that URL is reused: it is copied onto the sources that
    lack one and returned, with no page fetch. Otherwise the sources' detail
    pages are fetched in turn and the first thumbnail that
    ``extract_thumbnail_url`` finds is written to all of them.

    Writing to every source lacking a thumbnail means later list views show
    one regardless of which source gets picked. Per the original notes, the
    mobile client calls this automatically when opening a SceneDetail without
    a thumbnail.

    Idempotent: once all sources have a thumbnail, repeated calls change
    nothing and report ``sources_updated=0``.

    Args:
        scene_id: Primary key of the scene to enrich.
        session: Request-scoped database session.

    Returns:
        The thumbnail URL (or ``None``), the tube it was fetched from
        (``None`` when nothing was fetched), and how many sources were updated.

    Raises:
        HTTPException: 404 when no scene with ``scene_id`` exists.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin.like("tube:%"),
        )
    ).scalars().all()

    without_thumb = [s for s in sources if not s.thumbnail_url]
    known_thumb = next((s.thumbnail_url for s in sources if s.thumbnail_url), None)

    if known_thumb:
        # A thumbnail is already known. Back-fill sources that still lack one
        # (e.g. sources attached to the scene later) so the list view does not
        # depend on which source is picked. No network fetch is needed.
        for s in without_thumb:
            s.thumbnail_url = known_thumb
        if without_thumb:
            session.commit()
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=known_thumb,
            tube_used=None,
            sources_updated=len(without_thumb),
        )

    for src in sources:
        # Best effort: any fetch problem just moves on to the next source.
        try:
            r = browser_get(src.page_url, timeout=15.0, follow_redirects=True)
            r.raise_for_status()
        except (TubePageError, Exception) as e:
            log.debug("enrich-thumbnail fetch failed for %s: %s", src.page_url, e)
            continue

        thumb = extract_thumbnail_url(r.text)
        if not thumb:
            continue

        # Store on every source lacking a thumbnail (saves a duplicate fetch).
        for s in without_thumb:
            s.thumbnail_url = thumb
        # "tube:<site>" -> "<site>"; None if the origin has no colon at all.
        _, colon, site = src.origin.partition(":")
        session.commit()
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=thumb,
            tube_used=site if colon else None,
            sources_updated=len(without_thumb),
        )

    # No source produced a thumbnail.
    return EnrichThumbOut(
        scene_id=scene_id, thumbnail_url=None, tube_used=None, sources_updated=0
    )