"""Repair tytułów uciętych po wewnętrznym apostrofie. Bug w `meta_content` regex (fixed 2026-05-20): `[^"\']*` tnął content po wewnętrznym apostrofie → `She's So Insatiable` → `She`. Skrypt re-fetches detail page dla scen z tube:freshpornoorg/porn00org/pornxpph z podejrzanie krótkim tytułem i updateuje jeśli og:title (po fix) jest dłuższy. Uruchomienie: docker exec goon-worker-1 python -m scripts.repair_truncated_titles [--origin tube:freshpornoorg] [--limit 1000] """ from __future__ import annotations import argparse import logging import time import sqlalchemy as sa from app.connectors.direct_scrapers._browse_base import meta_content from app.db import session_scope from app.extractors._fetch import browser_get log = logging.getLogger(__name__) def main() -> int: p = argparse.ArgumentParser() p.add_argument("--origin", default="tube:freshpornoorg") p.add_argument("--limit", type=int, default=5000) p.add_argument("--throttle", type=float, default=0.3) args = p.parse_args() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") with session_scope() as session: rows = session.execute( sa.text( "SELECT s.id, s.title, ps.page_url " "FROM scenes s JOIN playback_sources ps ON ps.scene_id = s.id " "WHERE ps.origin = :origin AND ps.dead_at IS NULL " "AND LENGTH(s.title) < 25 " "LIMIT :limit" ), {"origin": args.origin, "limit": args.limit}, ).all() log.info("repair titles: %d candidates for %s", len(rows), args.origin) counters = {"checked": 0, "updated": 0, "skipped_404": 0, "skipped_same": 0} for i, (scene_id, old_title, page_url) in enumerate(rows, 1): counters["checked"] += 1 try: r = browser_get(page_url, timeout=15.0, follow_redirects=True) except Exception as e: log.debug("fetch fail %s: %s", page_url, e) continue if r.status_code in (404, 410): counters["skipped_404"] += 1 continue if r.status_code >= 400: continue new_title = meta_content(r.text, property="og:title") if not new_title: continue # Only update if longer (i.e. parser found full title after fix) if len(new_title) <= len(old_title): counters["skipped_same"] += 1 continue with session_scope() as session: session.execute( sa.text( "UPDATE scenes SET title = :t, title_normalized = LOWER(:t) WHERE id = :sid" ), {"t": new_title.strip(), "sid": scene_id}, ) counters["updated"] += 1 if i % 25 == 0: log.info("progress %d/%d %s", i, len(rows), counters) time.sleep(args.throttle) log.info("done: %s", counters) return 0 if __name__ == "__main__": raise SystemExit(main())