"""Bulk thumbnail backfill dla scen bez miniatury. Pętla: dla każdej sceny z `dead_at IS NULL` playback_sources i 0 z thumbnail_url, fetch tube page (pierwszy alive playback) → extract_thumbnail_url() → update WSZYSTKIE alive sources tej sceny (idempotent). Bug-reports 2026-05-10 (b2a656fe, 3876a8ce): user "Brak miniaturek" na Scenes listingu. Mobile używa `playback_sources.find(s.thumbnail_url)` — bez backfill ~133k scen pokazuje placeholder. Run: `python /srv/scripts/backfill_scene_thumbnails.py --batch 100 --limit 50000` """ from __future__ import annotations import argparse import logging import sys import time import httpx from sqlalchemy import select, func from sqlalchemy.orm import Session sys.path.insert(0, "/srv") from app.db import SessionLocal from app.extractors._fetch import browser_get from app.extractors._models import TubePageError from app.extractors.thumb_extract import extract_thumbnail_url from app.models.playback_source import PlaybackSource log = logging.getLogger("backfill_thumbnails") logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) # Narrow exception set — wcześniej `(TubePageError, Exception)` łapało wszystko # włącznie z KeyboardInterrupt + MemoryError + maskowało connection pool failures # jak transient retry'owalne. NET_EXC = (TubePageError, httpx.HTTPError, OSError, ValueError) def _scenes_missing_thumb(session: Session, batch: int, after_id, origin: str | None = None) -> list[tuple]: """Cursor-based pagination po scene_id > after_id. NIE używamy offset (skip-rows bug: gdy scena w batch'u N nie zostanie przeprocesowana, offset += batch zignoruje ją w batchu N+1). Cursor by scene_id gwarantuje monotonic forward progress: niezależnie czy update się udał czy failem zostawił thumbnail=NULL, następny batch zaczyna od scene_id > MAX(processed_in_this_batch). Failed scenes są poza zasięgiem do nextrun. """ q = ( select( PlaybackSource.scene_id, func.array_agg(PlaybackSource.id).label("ps_ids"), func.array_agg(PlaybackSource.page_url).label("page_urls"), func.array_agg(PlaybackSource.origin).label("origins"), ) .where( PlaybackSource.dead_at.is_(None), PlaybackSource.origin == origin if origin else PlaybackSource.origin.like("tube:%"), ) .group_by(PlaybackSource.scene_id) .having(func.bool_and(PlaybackSource.thumbnail_url.is_(None))) .order_by(PlaybackSource.scene_id) .limit(batch) ) if after_id is not None: q = q.where(PlaybackSource.scene_id > after_id) rows = session.execute(q).all() return [ (r.scene_id, list(zip(r.ps_ids, r.page_urls, r.origins, strict=False))) for r in rows ] def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--batch", type=int, default=100, help="scenes per DB query batch") ap.add_argument("--limit", type=int, default=10_000, help="max scenes to attempt") ap.add_argument("--sleep", type=float, default=0.5, help="seconds between fetches") ap.add_argument("--origin", type=str, default=None, help="filter by exact origin (e.g. tube:0dayxxcom)") args = ap.parse_args() total = ok = fail = 0 cursor = None while total < args.limit: with SessionLocal() as session: batch = _scenes_missing_thumb(session, batch=args.batch, after_id=cursor, origin=args.origin) if not batch: log.info("no more scenes to process (cursor=%s, done)", cursor) break # Advance cursor PRZED przetwarzaniem — w razie crash następny run # zacznie za tym batchem, nie powtórzy go w pętli infinite. cursor = batch[-1][0] for scene_id, sources in batch: total += 1 sp = session.begin_nested() try: thumb = None update_src_id = None for ps_id, page_url, origin in sources: if not page_url: continue try: r = browser_get(page_url, timeout=10.0, follow_redirects=True) except NET_EXC as e: log.debug("fetch fail %s: %s", page_url, e) continue if r.status_code >= 400: continue thumb = extract_thumbnail_url(r.text) if thumb: update_src_id = ps_id break time.sleep(args.sleep) if not thumb or update_src_id is None: sp.rollback() fail += 1 continue # Update tylko źródła z którego thumb pochodzi (single playback). session.execute( PlaybackSource.__table__.update() .where(PlaybackSource.id == update_src_id) .where(PlaybackSource.thumbnail_url.is_(None)) .values(thumbnail_url=thumb) ) sp.commit() ok += 1 except Exception as e: sp.rollback() log.warning("scene %s unexpected: %s", scene_id, e) fail += 1 if total % 50 == 0: session.commit() log.info( "progress total=%d ok=%d fail=%d cursor=%s", total, ok, fail, cursor, ) session.commit() log.info("done total=%d ok=%d fail=%d", total, ok, fail) if __name__ == "__main__": main()