Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
153 lines
5.8 KiB
Python
153 lines
5.8 KiB
Python
"""Bulk thumbnail backfill dla scen bez miniatury.
|
|
|
|
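Loop: for every scene that has alive (`dead_at IS NULL`) playback_sources, none
of which has a thumbnail_url, fetch the tube page of its sources one by one until
extract_thumbnail_url() yields a thumbnail, then store it on the source it came
from. The UPDATE is guarded by `thumbnail_url IS NULL`, so re-runs are idempotent.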
|
|
|
|
Bug reports 2026-05-10 (b2a656fe, 3876a8ce): users reported "Brak miniaturek"
("no thumbnails") on the Scenes listing. The mobile client uses
`playback_sources.find(s.thumbnail_url)`, so without this backfill ~133k scenes
show a placeholder.
|
|
|
|
Run: `python /srv/scripts/backfill_scene_thumbnails.py --batch 100 --limit 50000`
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
import time
|
|
|
|
import httpx
|
|
from sqlalchemy import select, func
|
|
from sqlalchemy.orm import Session
|
|
|
|
sys.path.insert(0, "/srv")
|
|
from app.db import SessionLocal
|
|
from app.extractors._fetch import browser_get
|
|
from app.extractors._models import TubePageError
|
|
from app.extractors.thumb_extract import extract_thumbnail_url
|
|
from app.models.playback_source import PlaybackSource
|
|
|
|
# Configure root logging at import time, then grab this script's named logger.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("backfill_thumbnails")
|
|
|
|
# Exceptions that count as "this page could not be fetched" and are skipped.
# Kept deliberately narrow: the earlier `(TubePageError, Exception)` tuple
# swallowed every error (MemoryError included) and made connection-pool
# failures look like transient, retryable fetch errors.
NET_EXC = (
    TubePageError,
    httpx.HTTPError,
    OSError,
    ValueError,
)
|
|
|
|
|
|
def _scenes_missing_thumb(session: Session, batch: int, after_id, origin: str | None = None) -> list[tuple]:
    """Return the next page of scenes whose matching alive sources all lack a thumbnail.

    Only playback sources with ``dead_at IS NULL`` are considered, restricted
    to the exact ``origin`` when one is given and to ``tube:%`` origins
    otherwise. Rows are grouped per scene, and a scene qualifies only when
    every one of those rows has a NULL ``thumbnail_url``.

    Paging is keyset-based (``scene_id > after_id``, ordered by ``scene_id``)
    rather than OFFSET-based. The result set changes while the backfill runs,
    so OFFSET paging would skip rows. The cursor moves strictly forward
    whether or not an update succeeded, which means scenes that failed are
    not revisited until the next run.

    Args:
        session: Open SQLAlchemy session used to run the query.
        batch: Maximum number of scenes to return.
        after_id: Last ``scene_id`` already handed out, or ``None`` to start
            from the beginning.
        origin: Optional exact origin to restrict the scan to.

    Returns:
        ``[(scene_id, [(source_id, page_url, origin), ...]), ...]`` with at
        most ``batch`` entries, ordered by ``scene_id``.
    """
    if origin:
        origin_filter = PlaybackSource.origin == origin
    else:
        origin_filter = PlaybackSource.origin.like("tube:%")

    predicates = [PlaybackSource.dead_at.is_(None), origin_filter]
    if after_id is not None:
        predicates.append(PlaybackSource.scene_id > after_id)

    # True only when every aggregated row of the scene has no thumbnail.
    every_thumb_missing = func.bool_and(PlaybackSource.thumbnail_url.is_(None))

    stmt = (
        select(
            PlaybackSource.scene_id,
            func.array_agg(PlaybackSource.id).label("ps_ids"),
            func.array_agg(PlaybackSource.page_url).label("page_urls"),
            func.array_agg(PlaybackSource.origin).label("origins"),
        )
        .where(*predicates)
        .group_by(PlaybackSource.scene_id)
        .having(every_thumb_missing)
        .order_by(PlaybackSource.scene_id)
        .limit(batch)
    )

    page = []
    for row in session.execute(stmt).all():
        # The three aggregated arrays are combined position-wise into
        # (source_id, page_url, origin) triples.
        triples = list(zip(row.ps_ids, row.page_urls, row.origins, strict=False))
        page.append((row.scene_id, triples))
    return page
|
|
|
|
|
|
def _first_thumbnail(sources: list[tuple], pause: float) -> tuple:
    """Fetch the scene's source pages in order until one yields a thumbnail.

    Args:
        sources: ``(source_id, page_url, origin)`` triples of one scene.
        pause: Seconds to sleep after every fetch attempt (throttle).

    Returns:
        ``(source_id, thumbnail_url)`` for the first source whose page yields
        a thumbnail, or ``(None, None)`` when none does.
    """
    for ps_id, page_url, _origin in sources:
        if not page_url:
            continue
        try:
            resp = browser_get(page_url, timeout=10.0, follow_redirects=True)
        except NET_EXC as e:
            log.debug("fetch fail %s: %s", page_url, e)
            resp = None
        # Throttle after every attempt, successful or not, so consecutive
        # requests are always spaced by --sleep.
        time.sleep(pause)
        if resp is None or resp.status_code >= 400:
            continue
        thumb = extract_thumbnail_url(resp.text)
        if thumb:
            return ps_id, thumb
    return None, None


def main() -> None:
    """Backfill ``thumbnail_url`` for scenes whose alive sources have none.

    Parses the CLI arguments, then pages through candidate scenes with a
    forward-only ``scene_id`` cursor. For each scene the source pages are
    fetched until a thumbnail is found, and that thumbnail is written to the
    source it came from inside a savepoint. At most ``--limit`` scenes are
    attempted; progress is committed and logged every 50 scenes and at the
    end of each batch.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch", type=int, default=100, help="scenes per DB query batch")
    ap.add_argument("--limit", type=int, default=10_000, help="max scenes to attempt")
    ap.add_argument("--sleep", type=float, default=0.5, help="seconds between fetches")
    ap.add_argument("--origin", type=str, default=None, help="filter by exact origin (e.g. tube:0dayxxcom)")
    args = ap.parse_args()

    total = ok = fail = 0
    cursor = None

    while total < args.limit:
        # Never request more scenes than the remaining --limit budget, so the
        # final batch cannot overshoot the limit.
        batch_size = min(args.batch, args.limit - total)
        with SessionLocal() as session:
            batch = _scenes_missing_thumb(session, batch=batch_size, after_id=cursor, origin=args.origin)
            if not batch:
                log.info("no more scenes to process (cursor=%s, done)", cursor)
                break
            # Advance the (in-memory) cursor before processing: scenes that
            # fail stay thumbnail-less, and without this the same batch would
            # be selected again forever within this run.
            cursor = batch[-1][0]

            for scene_id, sources in batch:
                total += 1
                try:
                    src_id, thumb = _first_thumbnail(sources, args.sleep)
                    if thumb and src_id is not None:
                        # Savepoint per scene: a failed UPDATE rolls back on
                        # its own without discarding earlier uncommitted work.
                        with session.begin_nested():
                            # Update only the source the thumbnail came from;
                            # the IS NULL guard keeps re-runs idempotent.
                            session.execute(
                                PlaybackSource.__table__.update()
                                .where(PlaybackSource.id == src_id)
                                .where(PlaybackSource.thumbnail_url.is_(None))
                                .values(thumbnail_url=thumb)
                            )
                        ok += 1
                    else:
                        fail += 1
                except Exception as e:
                    # Per-scene boundary: one broken scene must not abort the
                    # whole run, but keep the traceback for diagnosis.
                    log.warning("scene %s unexpected: %s", scene_id, e, exc_info=True)
                    fail += 1

                # Runs for every scene, including failed ones, so periodic
                # commits and progress lines are never skipped.
                if total % 50 == 0:
                    session.commit()
                    log.info(
                        "progress total=%d ok=%d fail=%d cursor=%s",
                        total, ok, fail, cursor,
                    )

            session.commit()

    log.info("done total=%d ok=%d fail=%d", total, ok, fail)
|
|
|
|
|
|
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|