goon/scripts/backfill_scene_thumbnails.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

153 lines
5.8 KiB
Python

"""Bulk thumbnail backfill dla scen bez miniatury.
Loop: for every scene whose alive (`dead_at IS NULL`) tube playback_sources all
lack a thumbnail_url, fetch the tube pages of its alive sources in turn until
extract_thumbnail_url() yields a thumbnail, then store it on the source it came
from. The UPDATE is guarded by `thumbnail_url IS NULL`, so re-runs are idempotent.
Bug reports 2026-05-10 (b2a656fe, 3876a8ce): users reported "Brak miniaturek"
("no thumbnails") on the Scenes listing. Mobile uses
`playback_sources.find(s.thumbnail_url)` — without the backfill ~133k scenes
show a placeholder.
Run: `python /srv/scripts/backfill_scene_thumbnails.py --batch 100 --limit 50000`
"""
from __future__ import annotations
import argparse
import logging
import sys
import time
import httpx
from sqlalchemy import select, func
from sqlalchemy.orm import Session
sys.path.insert(0, "/srv")
from app.db import SessionLocal
from app.extractors._fetch import browser_get
from app.extractors._models import TubePageError
from app.extractors.thumb_extract import extract_thumbnail_url
from app.models.playback_source import PlaybackSource
# Script-wide logger; root logging is configured at INFO with a timestamped format.
log = logging.getLogger("backfill_thumbnails")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
)
# Exceptions treated as an ordinary per-source fetch failure: the failure is
# logged at debug level and the next source of the scene is tried.
# The set is deliberately narrow. An earlier `(TubePageError, Exception)` swallowed
# every Exception subclass (MemoryError included) and masked connection-pool
# failures as if they were transient, retryable errors.
NET_EXC = (TubePageError, httpx.HTTPError, OSError, ValueError)
def _scenes_missing_thumb(session: Session, batch: int, after_id, origin: str | None = None) -> list[tuple]:
    """Return the next page of scenes whose matching alive sources all lack a thumbnail.

    Pagination is cursor-based on ``scene_id > after_id``. OFFSET is deliberately
    not used (skip-rows bug: when a scene from batch N is left unprocessed it
    still matches the filter, so ``offset += batch`` makes batch N+1 skip rows).
    A cursor on scene_id guarantees monotonic forward progress: whether the
    update succeeded or a failure left the thumbnail NULL, the next batch starts
    strictly after the last scene_id already returned. Failed scenes are
    therefore out of reach until the next run.

    Args:
        session: open SQLAlchemy session used to run the query.
        batch: maximum number of scenes to return.
        after_id: exclusive lower bound on scene_id; ``None`` starts from the beginning.
        origin: exact origin to restrict to; when falsy, every origin matching
            ``tube:%`` is considered.

    Returns:
        A list of ``(scene_id, [(ps_id, page_url, origin), ...])`` tuples,
        ordered by scene_id.
    """
    if origin:
        origin_filter = PlaybackSource.origin == origin
    else:
        origin_filter = PlaybackSource.origin.like("tube:%")

    stmt = (
        select(
            PlaybackSource.scene_id,
            func.array_agg(PlaybackSource.id).label("ps_ids"),
            func.array_agg(PlaybackSource.page_url).label("page_urls"),
            func.array_agg(PlaybackSource.origin).label("origins"),
        )
        .where(PlaybackSource.dead_at.is_(None), origin_filter)
        .group_by(PlaybackSource.scene_id)
        # Keep a scene only if every one of its selected sources has no thumbnail.
        .having(func.bool_and(PlaybackSource.thumbnail_url.is_(None)))
        .order_by(PlaybackSource.scene_id)
        .limit(batch)
    )
    if after_id is not None:
        stmt = stmt.where(PlaybackSource.scene_id > after_id)

    scenes = []
    for row in session.execute(stmt).all():
        # The three aggregated arrays are positionally aligned; zip them back
        # into one (ps_id, page_url, origin) tuple per source.
        sources = list(zip(row.ps_ids, row.page_urls, row.origins, strict=False))
        scenes.append((row.scene_id, sources))
    return scenes
def _find_thumbnail(sources: list[tuple], sleep: float) -> tuple | None:
    """Fetch each source page in turn until one yields a thumbnail URL.

    Args:
        sources: ``(ps_id, page_url, origin)`` tuples belonging to one scene.
        sleep: seconds to pause after every HTTP request (politeness throttle).

    Returns:
        ``(ps_id, thumbnail_url)`` for the first source whose page yields a
        thumbnail, or ``None`` when no source does.
    """
    for ps_id, page_url, _origin in sources:
        if not page_url:
            continue
        try:
            resp = browser_get(page_url, timeout=10.0, follow_redirects=True)
        except NET_EXC as exc:
            log.debug("fetch fail %s: %s", page_url, exc)
            continue
        finally:
            # Throttle after every request, successful or not, so that
            # `--sleep` really is the pause between consecutive fetches.
            time.sleep(sleep)
        if resp.status_code >= 400:
            continue
        thumb = extract_thumbnail_url(resp.text)
        if thumb:
            return ps_id, thumb
    return None


def _backfill_scene(session: Session, scene_id, sources: list[tuple], sleep: float) -> bool:
    """Try to find and store a thumbnail for one scene.

    Args:
        session: open SQLAlchemy session; the caller owns the outer commit.
        scene_id: identifier of the scene, used only for logging.
        sources: ``(ps_id, page_url, origin)`` tuples belonging to the scene.
        sleep: seconds to pause after every HTTP request.

    Returns:
        True when a thumbnail was found and the UPDATE was issued, False otherwise.
    """
    # Deliberate per-scene boundary: one misbehaving scene must not abort a
    # bulk run, so anything unexpected is logged and counted as a failure.
    try:
        found = _find_thumbnail(sources, sleep)
        if found is None:
            return False
        ps_id, thumb = found
        # The SAVEPOINT wraps only the write (not the slow HTTP fetches); the
        # context manager releases it on success and rolls it back on error.
        with session.begin_nested():
            # Update only the source the thumbnail came from (single playback).
            # The IS NULL guard keeps re-runs idempotent.
            session.execute(
                PlaybackSource.__table__.update()
                .where(PlaybackSource.id == ps_id)
                .where(PlaybackSource.thumbnail_url.is_(None))
                .values(thumbnail_url=thumb)
            )
        return True
    except Exception as exc:
        log.warning("scene %s unexpected: %s", scene_id, exc)
        return False


def main() -> None:
    """Command-line entry point: backfill thumbnails batch by batch.

    Reads ``--batch``, ``--limit``, ``--sleep`` and ``--origin`` from the command
    line, then pages through scenes without a thumbnail, attempting at most
    ``--limit`` scenes in total. Progress is committed and logged every 50
    scenes and at the end of every batch.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch", type=int, default=100, help="scenes per DB query batch")
    ap.add_argument("--limit", type=int, default=10_000, help="max scenes to attempt")
    ap.add_argument("--sleep", type=float, default=0.5, help="seconds between fetches")
    ap.add_argument("--origin", type=str, default=None, help="filter by exact origin (e.g. tube:0dayxxcom)")
    args = ap.parse_args()

    total = ok = fail = 0
    cursor = None
    while total < args.limit:
        # Never request more scenes than the remaining budget, otherwise the
        # last batch would overshoot --limit by up to batch-1 scenes.
        page_size = min(args.batch, args.limit - total)
        with SessionLocal() as session:
            batch = _scenes_missing_thumb(session, batch=page_size, after_id=cursor, origin=args.origin)
            if not batch:
                log.info("no more scenes to process (cursor=%s, done)", cursor)
                break
            # Advance the in-memory cursor before processing: scenes that fail
            # keep thumbnail_url NULL and would otherwise be selected again on
            # the next iteration, looping forever within this run.
            cursor = batch[-1][0]
            for scene_id, sources in batch:
                total += 1
                if _backfill_scene(session, scene_id, sources, args.sleep):
                    ok += 1
                else:
                    fail += 1
                # Runs for failed scenes too, so no progress checkpoint is skipped.
                if total % 50 == 0:
                    session.commit()
                    log.info(
                        "progress total=%d ok=%d fail=%d cursor=%s",
                        total, ok, fail, cursor,
                    )
            session.commit()
    log.info("done total=%d ok=%d fail=%d", total, ok, fail)
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()