"""GET /scenes — lista i szczegóły scen z bazy kanonicznej.""" from __future__ import annotations import logging import re import uuid from typing import Annotated from fastapi import APIRouter, Depends, HTTPException, Query, status from pydantic import BaseModel from sqlalchemy import distinct, exists, func, select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session from app.auth import require_api_key from app.api.schemas import ( ExternalRefOut, PerformerOut, PlaybackSourceOut, SceneListOut, SceneOut, StudioOut, TagOut, ) from app.db import get_session from app.models.favorite_scene import FavoriteScene from app.models.performer import Performer from app.models.play_progress import ScenePlayProgress from app.models.playback_source import PlaybackSource from app.models.scene import Scene, SceneExternalRef, ScenePerformer, SceneTag from app.models.source import Source, SourceKind from app.models.studio import Studio from app.models.tag import Tag log = logging.getLogger(__name__) router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(require_api_key)]) _VALID_SORTS = {"created_at", "release_date", "title", "studio"} def _split_csv(raw: str | None) -> list[str]: if not raw: return [] return [s.strip() for s in raw.split(",") if s.strip()] @router.get("", response_model=SceneListOut) def list_scenes( session: Annotated[Session, Depends(get_session)], q: str | None = Query(default=None, description="Wyszukiwanie po title_normalized (trgm)"), studio_slug: str | None = Query(default=None, description="DEPRECATED — użyj studio_slugs"), studio_slugs: str | None = Query( default=None, description="Comma-separated studio slugs (OR)" ), tags: str | None = Query( default=None, description="Comma-separated tag slugs (AND — scena musi mieć wszystkie wybrane tagi)", ), performer_ids: str | None = Query( default=None, description="Comma-separated performer UUIDs (AND — scena musi mieć wszystkich wybranych performerów)", ), has_playback: bool | None = Query( default=None, description="True: tylko sceny z ≥1 playback_source" ), has_animated_thumbnail: bool | None = Query( default=None, description="True: tylko sceny z ≥1 playback_source z animated_thumbnail_url (hold-to-preview)", ), min_duration_sec: int | None = Query(default=None, ge=0), max_duration_sec: int | None = Query(default=None, ge=0), released_within_days: int | None = Query( default=None, ge=1, description="Tylko sceny released w ostatnich N dniach", ), min_quality_p: int | None = Query( default=None, ge=1, description=( "Minimum quality (pixele wysokości — 2160 = 4K, 1080 = FullHD). Filtruje " "po PlaybackSource.quality (string typu '720p' / '1080p Full HD')." ), ), origin: str | None = Query( default=None, description=( "Filtruj po playback origin (np. 'tube:hqpornercom'). Substring match — " "'hqporner' złapie tube:hqpornercom. Diagnostyka per-hoster." ), ), include_stubs: bool = Query( default=False, description=( "False (default): ukrywa sceny-szkielety bez release_date, < 10min, " "z jedynym playback z hqporner (~7-min Brazzers trailer clipy zalewają katalog)." ), ), sort: str = Query(default="created_at", description="created_at|release_date|title|studio"), page: int = Query(default=1, ge=1), per_page: int = Query(default=50, ge=1, le=200), ) -> SceneListOut: if sort not in _VALID_SORTS: raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}") base = select(Scene) if q: base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%")) studio_slug_list = _split_csv(studio_slugs) if studio_slug: studio_slug_list.append(studio_slug) if studio_slug_list: base = base.where( Scene.studio_id.in_( select(Studio.id).where(Studio.slug.in_(studio_slug_list)) ) ) tag_slug_list = _split_csv(tags) # AND między tagami: scena musi mieć WSZYSTKIE zaznaczone tagi. Każdy slug → osobny # exists() — zaznaczanie kolejnych filtrów zawęża wyniki, jak intuicja użytkownika. for slug in tag_slug_list: base = base.where( exists( select(1) .select_from(SceneTag) .join(Tag, Tag.id == SceneTag.tag_id) .where(SceneTag.scene_id == Scene.id, Tag.slug == slug) ) ) perf_id_strings = _split_csv(performer_ids) if perf_id_strings: try: perf_ids = [uuid.UUID(s) for s in perf_id_strings] except ValueError as e: raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e # AND między performerami (analogicznie do tagów). for pid in perf_ids: base = base.where( exists( select(1) .select_from(ScenePerformer) .where( ScenePerformer.scene_id == Scene.id, ScenePerformer.performer_id == pid, ) ) ) if has_playback is True: # Tylko sceny z choć jednym ŻYWYM playback_source. base = base.where( exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), ) ) ) elif has_playback is False: base = base.where( ~exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), ) ) ) if origin: # Substring match na origin — 'hqporner' złapie 'tube:hqpornercom'. base = base.where( exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), PlaybackSource.origin.ilike(f"%{origin}%"), ) ) ) # Blacklisty — globalne wykluczenia. Jeśli scena ma JAKIEGOKOLWIEK blacklisted # performera, jest na blacklisted studio, lub ma JAKIKOLWIEK blacklisted tag → out. from app.models.blacklist import ( BlacklistedPerformer, BlacklistedStudio, BlacklistedTag, ) base = base.where( ~exists( select(1) .select_from(ScenePerformer) .join(BlacklistedPerformer, BlacklistedPerformer.performer_id == ScenePerformer.performer_id) .where(ScenePerformer.scene_id == Scene.id) ) ) base = base.where( ~Scene.studio_id.in_(select(BlacklistedStudio.studio_id)) ) base = base.where( ~exists( select(1) .select_from(SceneTag) .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id) .where(SceneTag.scene_id == Scene.id) ) ) if has_animated_thumbnail: base = base.where( exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), PlaybackSource.animated_thumbnail_url.isnot(None), ) ) ) if min_duration_sec is not None: base = base.where(Scene.duration_sec >= min_duration_sec) if max_duration_sec is not None: base = base.where(Scene.duration_sec <= max_duration_sec) if released_within_days is not None: from datetime import date, timedelta cutoff = date.today() - timedelta(days=released_within_days) base = base.where(Scene.release_date >= cutoff) if min_quality_p is not None: # PlaybackSource.quality to wolny string — szukamy liczb w prefixie ('1080p', # '1080p Full HD', '2160p'). Heurystyka: wystarczy że scena ma JEDEN żywy # playback z quality liczbą >= min. '4K'/'UHD' aliasujemy na 2160. from sqlalchemy import Integer, cast, or_ numeric_q = cast( func.coalesce(func.substring(PlaybackSource.quality, r"\d+"), "0"), Integer, ) conds = [numeric_q >= min_quality_p] if min_quality_p <= 2160: conds.append(PlaybackSource.quality.ilike("%4k%")) conds.append(PlaybackSource.quality.ilike("%uhd%")) base = base.where( exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), PlaybackSource.quality.isnot(None), or_(*conds), ) ) ) if not include_stubs: # Stub scene heuristic: tube-only scena BEZ release_date AND BEZ canonical # (TPDB/StashDB) ref AND BEZ żadnego ScenePerformer linka. ScenePerformer # dodaje continuous worker (search-by-name → wymusza link), więc per-performer # search-result NIGDY nie jest stub. To filtruje tylko anonymous tube-only # sceny z newUrl/categories ingestu które nie zostały zsyntowane z performerem. canonical_exists = exists( select(1) .select_from(SceneExternalRef) .join(Source, Source.id == SceneExternalRef.source_id) .where(SceneExternalRef.scene_id == Scene.id) .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb])) ) has_performer = exists( select(1).where(ScenePerformer.scene_id == Scene.id) ) # NOT stub gdy: ma canonical_ref OR ma release_date OR ma performera base = base.where( Scene.release_date.is_not(None) | canonical_exists | has_performer ) # Count: dla dużych baz (~400k scen) pełny count z 3 nested EXISTS bierze ~5s. # Liczymy total na uproszczonym query (bez stub-filter w count) — daje ~5% off # ale jest akceptowalne dla user-facing pagination header. Items query NADAL # ma stub-filter, więc lista pokazuje poprawne sceny. Liczba w header jest # przybliżoną górną granicą — co dla 400k scen i tak nie ma sensu reading dokładnie. if not include_stubs and not q and not studio_slug_list and not tags and not perf_id_strings: # Fast path: typowy default request (lista bez filtra) — count tylko po # has_playback (single EXISTS, dobrze zindeksowany). count_query = select(func.count()).select_from( select(Scene.id).where( exists( select(1).where( PlaybackSource.scene_id == Scene.id, PlaybackSource.dead_at.is_(None), ) ) ).subquery() ) total = session.execute(count_query).scalar_one() else: total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one() # Sort: zawsze tie-break po created_at desc dla determinizmu paginacji. if sort == "release_date": ordered = base.order_by( Scene.release_date.desc().nullslast(), Scene.created_at.desc() ) elif sort == "title": ordered = base.order_by(Scene.title_normalized.asc(), Scene.created_at.desc()) elif sort == "studio": # Sceny bez studio na końcu; w obrębie studio — najświeższe pierwsze. ordered = ( base.outerjoin(Studio, Studio.id == Scene.studio_id) .order_by( Studio.name_normalized.asc().nullslast(), Scene.release_date.desc().nullslast(), Scene.created_at.desc(), ) ) else: # created_at ordered = base.order_by( Scene.created_at.desc(), Scene.release_date.desc().nullslast() ) rows = ( session.execute(ordered.offset((page - 1) * per_page).limit(per_page)) .scalars() .all() ) items = _build_scenes_out_batch(session, list(rows)) return SceneListOut(items=items, total=total, page=page, per_page=per_page) @router.get("/{scene_id}", response_model=SceneOut) def get_scene( scene_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> SceneOut: scene = session.get(Scene, scene_id) if scene is None: raise HTTPException(status_code=404, detail="scene not found") return _build_scene_out(session, scene) def _needs_proxy(url: str) -> bool: """Wszystkie thumbnaile z playback_sources są proxowane przez backend. Większość CDN-ów porn-tube'ów wymaga Refera (hqporner, mypornerleak/58img, inne sxyprn/eporner CDN-y) — expo-image nie wysyła Referera. Self-hosted lub backend-internal URL-e (zaczynające się od `/`) skipujemy.""" return url.startswith("http") and not url.startswith("/proxy/") def _wrap_image_proxy(url: str, referer: str) -> str: """Wraps a thumbnail URL through /proxy/img/{token}/img.jpg. Klient nie musi znać sekretu Referer — backend wstawi sam. Long TTL (30d) bo thumby są stabilne, krótkie ttl by tylko niepotrzebnie zaśmiecało cache.""" from app.api.stream_proxy import make_token token = make_token(url, referer, ttl_sec=30 * 24 * 3600) # Path zachowuje rozszerzenie żeby HTTP Content-Type był rozpoznany. import os as _os ext = _os.path.splitext(url.split("?")[0])[1].lstrip(".") or "jpg" return f"/proxy/img/{token}/img.{ext}" def _build_scenes_out_batch(session: Session, scenes: list[Scene]) -> list[SceneOut]: """Batch-fetch wszystkich relacji dla N scen w 7 zapytaniach (zamiast 7×N). Eliminuje N+1 z `_build_scene_out` w listach scen — `/scenes?per_page=24` szło z ~9.6s do <500ms. Pojedyncza scena (`/scenes/{id}`) nadal używa `_build_scene_out` bo overhead na batch nie ma sensu dla N=1. """ from collections import defaultdict if not scenes: return [] scene_ids = [s.id for s in scenes] studio_ids = list({s.studio_id for s in scenes if s.studio_id is not None}) # 1) Studios studios_by_id: dict = {} if studio_ids: for st in session.execute( select(Studio).where(Studio.id.in_(studio_ids)) ).scalars(): studios_by_id[st.id] = st # 2) Performers perf_rows = session.execute( select(ScenePerformer, Performer) .join(Performer, Performer.id == ScenePerformer.performer_id) .where(ScenePerformer.scene_id.in_(scene_ids)) .order_by(ScenePerformer.position.asc().nullslast()) ).all() performers_by_scene: dict = defaultdict(list) for sp, p in perf_rows: performers_by_scene[sp.scene_id].append( PerformerOut( id=p.id, canonical_name=p.canonical_name, slug=p.slug, gender=p.gender.value if p.gender else None, as_alias=sp.as_alias, ) ) # 3) Tags tag_rows = session.execute( select(SceneTag.scene_id, Tag) .join(Tag, Tag.id == SceneTag.tag_id) .where(SceneTag.scene_id.in_(scene_ids)) ).all() tags_by_scene: dict = defaultdict(list) for sid, t in tag_rows: tags_by_scene[sid].append(TagOut.model_validate(t)) # 4) External refs + sources ref_rows = session.execute( select(SceneExternalRef, Source) .join(Source, Source.id == SceneExternalRef.source_id) .where(SceneExternalRef.scene_id.in_(scene_ids)) ).all() refs_by_scene: dict = defaultdict(list) for ref, src in ref_rows: refs_by_scene[ref.scene_id].append( ExternalRefOut( source=src.name, external_id=ref.external_id, url=ref.url, last_seen=ref.last_seen, ) ) # 5) Playback sources pb_rows = session.execute( select(PlaybackSource) .where( PlaybackSource.scene_id.in_(scene_ids), PlaybackSource.dead_at.is_(None), ) .order_by(PlaybackSource.origin.asc()) ).scalars().all() pb_by_scene: dict = defaultdict(list) for p in pb_rows: out = PlaybackSourceOut.model_validate(p) if out.thumbnail_url and _needs_proxy(out.thumbnail_url): out.thumbnail_url = _wrap_image_proxy(out.thumbnail_url, p.page_url) if out.animated_thumbnail_url and _needs_proxy(out.animated_thumbnail_url): out.animated_thumbnail_url = _wrap_image_proxy(out.animated_thumbnail_url, p.page_url) pb_by_scene[p.scene_id].append(out) # 6) Progress progress_by_scene: dict = {} for prog in session.execute( select(ScenePlayProgress).where(ScenePlayProgress.scene_id.in_(scene_ids)) ).scalars(): progress_by_scene[prog.scene_id] = prog # 7) Favorites fav_scene_ids: set = set( session.execute( select(FavoriteScene.scene_id).where( FavoriteScene.scene_id.in_(scene_ids) ) ).scalars() ) out: list[SceneOut] = [] for scene in scenes: studio_out = None if scene.studio_id is not None and scene.studio_id in studios_by_id: studio_out = StudioOut.model_validate(studios_by_id[scene.studio_id]) progress = progress_by_scene.get(scene.id) out.append( SceneOut( id=scene.id, title=scene.title, slug=scene.slug, release_date=scene.release_date, duration_sec=scene.duration_sec, description=scene.description, code=scene.code, director=scene.director, studio=studio_out, performers=performers_by_scene.get(scene.id, []), tags=tags_by_scene.get(scene.id, []), external_refs=refs_by_scene.get(scene.id, []), playback_sources=pb_by_scene.get(scene.id, []), created_at=scene.created_at, last_played_at=progress.last_played_at if progress else None, finished=progress.finished if progress else False, position_sec=progress.position_sec if progress else 0, is_favorite=scene.id in fav_scene_ids, ) ) return out def _build_scene_out(session: Session, scene: Scene) -> SceneOut: studio_out: StudioOut | None = None if scene.studio_id is not None: st = session.get(Studio, scene.studio_id) if st is not None: studio_out = StudioOut.model_validate(st) performer_rows = session.execute( select(ScenePerformer, Performer) .join(Performer, Performer.id == ScenePerformer.performer_id) .where(ScenePerformer.scene_id == scene.id) .order_by(ScenePerformer.position.asc().nullslast()) ).all() performers_out: list[PerformerOut] = [] for sp, performer in performer_rows: performers_out.append( PerformerOut( id=performer.id, canonical_name=performer.canonical_name, slug=performer.slug, gender=performer.gender.value if performer.gender else None, as_alias=sp.as_alias, ) ) tag_rows = ( session.execute( select(Tag).join(SceneTag, SceneTag.tag_id == Tag.id).where(SceneTag.scene_id == scene.id) ) .scalars() .all() ) tags_out = [TagOut.model_validate(t) for t in tag_rows] ref_rows = session.execute( select(SceneExternalRef, Source) .join(Source, Source.id == SceneExternalRef.source_id) .where(SceneExternalRef.scene_id == scene.id) ).all() refs_out = [ ExternalRefOut( source=src.name, external_id=ref.external_id, url=ref.url, last_seen=ref.last_seen, ) for ref, src in ref_rows ] playback_rows = ( session.execute( select(PlaybackSource) .where( PlaybackSource.scene_id == scene.id, PlaybackSource.dead_at.is_(None), # ukryj martwe linki ) .order_by(PlaybackSource.origin.asc()) ) .scalars() .all() ) playback_out: list[PlaybackSourceOut] = [] for p in playback_rows: out = PlaybackSourceOut.model_validate(p) # Wrap thumbnail URL-e przez backend image proxy gdy CDN wymaga Refera # (hqporner — fastporndelivery zwraca 403 bez Referer headera, expo-image # nie wysyła go domyślnie). Token ma 30-dniowy TTL bo thumby są stabilne. if out.thumbnail_url and _needs_proxy(out.thumbnail_url): out.thumbnail_url = _wrap_image_proxy(out.thumbnail_url, p.page_url) if out.animated_thumbnail_url and _needs_proxy(out.animated_thumbnail_url): out.animated_thumbnail_url = _wrap_image_proxy(out.animated_thumbnail_url, p.page_url) playback_out.append(out) progress = session.get(ScenePlayProgress, scene.id) is_fav = session.get(FavoriteScene, scene.id) is not None return SceneOut( id=scene.id, title=scene.title, slug=scene.slug, release_date=scene.release_date, duration_sec=scene.duration_sec, description=scene.description, code=scene.code, director=scene.director, studio=studio_out, performers=performers_out, tags=tags_out, external_refs=refs_out, playback_sources=playback_out, created_at=scene.created_at, last_played_at=progress.last_played_at if progress else None, finished=progress.finished if progress else False, position_sec=progress.position_sec if progress else 0, is_favorite=is_fav, ) @router.delete("/{scene_id}/tags/{tag_id}", status_code=status.HTTP_204_NO_CONTENT) def remove_tag_from_scene( scene_id: uuid.UUID, tag_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> None: """Usuwa relację scene↔tag (np. user uznał że tag jest błędny dla tej sceny). Idempotent: brak relacji = success. Nie kasuje samego Tag-a — inne sceny mogą z niego korzystać. Sam tag zostaje w słowniku tagów. """ rel = session.execute( select(SceneTag).where(SceneTag.scene_id == scene_id, SceneTag.tag_id == tag_id) ).scalar_one_or_none() if rel is None: return session.delete(rel) session.commit() @router.delete( "/{scene_id}/performers/{performer_id}", status_code=status.HTTP_204_NO_CONTENT ) def remove_performer_from_scene( scene_id: uuid.UUID, performer_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> None: """Usuwa relację scene↔performer (false-match dedup zostawił nie tą osobę). Idempotent. Sama Performer zostaje. Użyteczne np. gdy fuzzy match aliasu "Bella" wciągnął Anna Bella sceny pod Bad Bella, lub Miss Teela na xnxx została przypisana do scen w których jej nie ma (zgłoszenia 2026-05-10). """ from app.models.scene import ScenePerformer rel = session.execute( select(ScenePerformer).where( ScenePerformer.scene_id == scene_id, ScenePerformer.performer_id == performer_id, ) ).scalar_one_or_none() if rel is None: return session.delete(rel) session.commit() class EnrichTagsOut(BaseModel): scene_id: uuid.UUID added: int tube_used: str | None tags: list[str] @router.post("/{scene_id}/enrich-tags", response_model=EnrichTagsOut) def enrich_tags_from_tube( scene_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> EnrichTagsOut: """Pobiera page HTML z dowolnego tube playback_source dla tej sceny i scrape'uje tagi (categories/tags). Dodaje brakujące do scene_tags. Mobile wywołuje to przy otwarciu SceneDetail jeśli scena ma 0 tagów AND ma tube source z obsługiwanym extractorem (porntrex/youporn/xvideos/xnxx/redtube/ xhamster/eporner). Idempotent: ponowne wywołanie z tymi samymi tagami nic nie robi (UNIQUE PK scene_tags). Konkretne tube źródło wybierane wg priority listy (mainstream bardziej rzetelne niż aggregator). """ from app.extractors._fetch import browser_get from app.extractors._models import TubePageError from app.extractors.tag_extract import EXTRACTORS, extract_tags from app.models.playback_source import PlaybackSource from app.models.tag import Tag from app.normalize.scenes import NormalizedTag from app.normalize.text import slugify from app.resolve.tag_resolver import resolve_tag scene = session.get(Scene, scene_id) if scene is None: raise HTTPException(status_code=404, detail="scene not found") # Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage). PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom", "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"] sources = session.execute( select(PlaybackSource).where( PlaybackSource.scene_id == scene_id, PlaybackSource.dead_at.is_(None), ) ).scalars().all() # Wybierz pierwsze źródło wg priority listy które ma supported extractor chosen: PlaybackSource | None = None for tag in PRIORITY: for src in sources: if src.origin == f"tube:{tag}": chosen = src break if chosen: break if chosen is None: # Fallback: dowolne źródło z extractorem for src in sources: if src.origin.startswith("tube:"): sitetag = src.origin.split(":", 1)[1] if sitetag in EXTRACTORS: chosen = src break if chosen is None: return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=None, tags=[]) sitetag = chosen.origin.split(":", 1)[1] try: r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True) r.raise_for_status() except (TubePageError, Exception) as e: log.warning("enrich-tags fetch failed for %s: %s", chosen.page_url, e) return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[]) tag_names = extract_tags(sitetag, r.text) if not tag_names: return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[]) # Upsert: dla każdego taga utwórz/znajdź Tag, dorzuć SceneTag idempotentnie. # Używamy PostgreSQL INSERT ... ON CONFLICT DO NOTHING zamiast ORM session.add() # bo `resolve_tag` robi session.flush() w pętli, emitując pending SceneTag INSERT # z poprzednich iteracji — gdy 2 concurrent enrich-tags collide na tym samym # (scene_id, tag_id), drugi flush dostaje UniqueViolation (GOON-H, 4 events # w 10h mimo wcześniejszego seen_tag_ids fix). ON CONFLICT skip'uje silently. from sqlalchemy.dialects.postgresql import insert as pg_insert added = 0 seen_tag_ids: set = set() for name in tag_names: norm = NormalizedTag(name=name, slug=slugify(name), external_id=None) tag = resolve_tag(session, norm=norm) if tag is None or tag.id in seen_tag_ids: continue seen_tag_ids.add(tag.id) stmt = ( pg_insert(SceneTag.__table__) .values(scene_id=scene_id, tag_id=tag.id, source_id=None) .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"]) ) result = session.execute(stmt) # rowcount == 1 gdy faktycznie wstawiony, 0 gdy ON CONFLICT skip if result.rowcount and result.rowcount > 0: added += 1 session.commit() return EnrichTagsOut(scene_id=scene_id, added=added, tube_used=sitetag, tags=tag_names) class EnrichDurationOut(BaseModel): scene_id: uuid.UUID duration_sec: int | None tube_used: str | None @router.post("/{scene_id}/enrich-duration", response_model=EnrichDurationOut) def enrich_duration_from_tube( scene_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> EnrichDurationOut: """Wyciąga duration z dowolnego tube playback_source — wszystkie znane tube'y udostępniają duration na detail page (og:video:duration lub LD-JSON ISO 8601). Mobile wywołuje to przy otwarciu SceneDetail gdy scene.duration_sec jest null AND ma tube source. Dla dedupu duration to najsilniejszy single signal — bez niego sceny z weak title-only score są capowane na 0.85 (review queue). Idempotent: zwraca aktualne duration_sec jeśli już ustawione. """ from app.extractors._fetch import browser_get from app.extractors._models import TubePageError from app.extractors.duration_extract import extract_duration_sec from app.models.playback_source import PlaybackSource scene = session.get(Scene, scene_id) if scene is None: raise HTTPException(status_code=404, detail="scene not found") if scene.duration_sec is not None: return EnrichDurationOut( scene_id=scene_id, duration_sec=scene.duration_sec, tube_used=None ) sources = session.execute( select(PlaybackSource).where( PlaybackSource.scene_id == scene_id, PlaybackSource.dead_at.is_(None), PlaybackSource.origin.like("tube:%"), ) ).scalars().all() for src in sources: try: r = browser_get(src.page_url, timeout=15.0, follow_redirects=True) r.raise_for_status() except (TubePageError, Exception) as e: log.debug("enrich-duration fetch failed for %s: %s", src.page_url, e) continue d = extract_duration_sec(r.text) if d is not None and d > 0: scene.duration_sec = d # Zapisz też na poziomie playback_source dla parity (przyda się jeśli # potem dorobimy per-source duration mismatch detection). if src.duration_sec is None: src.duration_sec = d session.commit() return EnrichDurationOut( scene_id=scene_id, duration_sec=d, tube_used=src.origin.split(":", 1)[1] if ":" in src.origin else None, ) return EnrichDurationOut(scene_id=scene_id, duration_sec=None, tube_used=None) class EnrichStudioOut(BaseModel): scene_id: uuid.UUID studio_id: uuid.UUID | None studio_name: str | None tube_used: str | None @router.post("/{scene_id}/enrich-studio", response_model=EnrichStudioOut) def enrich_studio_from_tube( scene_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> EnrichStudioOut: """Wyciąga studio (DVD/series) z pornhat scene page'a. Pornhat ma `class="info-video js-ajax-dvd" data-setup='{"title": "Adult Time", ...}'` dla studio. Inne tube'y obsługiwane będą gdy znajdziemy ich pattern — na razie tylko pornhat (najczystsze studio metadata wśród free tubes). """ import json as _json from app.extractors._fetch import browser_get from app.extractors._models import TubePageError from app.models.playback_source import PlaybackSource from app.models.studio import Studio from app.normalize.text import slugify scene = session.get(Scene, scene_id) if scene is None: raise HTTPException(status_code=404, detail="scene not found") if scene.studio_id is not None: existing = session.get(Studio, scene.studio_id) return EnrichStudioOut( scene_id=scene_id, studio_id=scene.studio_id, studio_name=existing.name if existing else None, tube_used=None, ) chosen = session.execute( select(PlaybackSource).where( PlaybackSource.scene_id == scene_id, PlaybackSource.dead_at.is_(None), PlaybackSource.origin == "tube:pornhatcom", ) ).scalars().first() if chosen is None: return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used=None) try: r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True) r.raise_for_status() except (TubePageError, Exception) as e: log.warning("enrich-studio fetch failed for %s: %s", chosen.page_url, e) return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom") m = re.search( r"class=\"info-video js-ajax-dvd[^\"]*\"[^>]*data-setup='([^']+)'", r.text, re.IGNORECASE, ) if m is None: return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom") try: data = _json.loads(m.group(1)) except _json.JSONDecodeError: return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom") name = (data.get("title") or "").strip() if not name: return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom") slug = (data.get("dir") or "").strip() or slugify(name) studio = session.execute( select(Studio).where(Studio.slug == slug) ).scalar_one_or_none() if studio is None: studio = session.execute( select(Studio).where(Studio.name == name) ).scalar_one_or_none() if studio is None: studio = Studio(name=name, slug=slug) session.add(studio) session.flush() scene.studio_id = studio.id session.commit() return EnrichStudioOut( scene_id=scene_id, studio_id=studio.id, studio_name=studio.name, tube_used="pornhatcom" ) class EnrichThumbOut(BaseModel): scene_id: uuid.UUID thumbnail_url: str | None tube_used: str | None sources_updated: int @router.post("/{scene_id}/enrich-thumbnail", response_model=EnrichThumbOut) def enrich_thumbnail_from_tube( scene_id: uuid.UUID, session: Annotated[Session, Depends(get_session)], ) -> EnrichThumbOut: """Pobiera detail page z dowolnego tube playback_source bez thumbnail_url i wyciąga miniaturkę (og:image / twitter:image / LD-JSON thumbnailUrl / KVS html5player). Update'uje WSZYSTKIE PlaybackSource'y dla tej sceny które nie mają thumb, żeby kolejne otwarcia listy widziały miniaturę niezależnie od source pick. Mobile auto-wywoła to przy otwarciu SceneDetail bez thumb (jak duration). """ from app.extractors._fetch import browser_get from app.extractors._models import TubePageError from app.extractors.thumb_extract import extract_thumbnail_url from app.models.playback_source import PlaybackSource scene = session.get(Scene, scene_id) if scene is None: raise HTTPException(status_code=404, detail="scene not found") sources = session.execute( select(PlaybackSource).where( PlaybackSource.scene_id == scene_id, PlaybackSource.dead_at.is_(None), PlaybackSource.origin.like("tube:%"), ) ).scalars().all() sources_with_thumb = [s for s in sources if s.thumbnail_url] if sources_with_thumb: # już mamy — idempotent return. return EnrichThumbOut( scene_id=scene_id, thumbnail_url=sources_with_thumb[0].thumbnail_url, tube_used=None, sources_updated=0, ) for src in sources: try: r = browser_get(src.page_url, timeout=15.0, follow_redirects=True) r.raise_for_status() except (TubePageError, Exception) as e: log.debug("enrich-thumbnail fetch failed for %s: %s", src.page_url, e) continue thumb = extract_thumbnail_url(r.text) if thumb: # Zapisz na wszystkich źródłach bez thumb (oszczędza duplikat fetch) updated = 0 for s in sources: if not s.thumbnail_url: s.thumbnail_url = thumb updated += 1 session.commit() return EnrichThumbOut( scene_id=scene_id, thumbnail_url=thumb, tube_used=src.origin.split(":", 1)[1] if ":" in src.origin else None, sources_updated=updated, ) return EnrichThumbOut( scene_id=scene_id, thumbnail_url=None, tube_used=None, sources_updated=0 )