"""Retro-fix studio_id dla istniejących pornapp scen z aggregator-WordPress source'ów. Cel: skleić orphan factories (porndish, xmoviesforyou, watchporn, hdporn92) z canonical scenami w TPDB/StashDB. Te tube'y mają w tytule `[Studio]` lub `Studio – Perf – Title` ale `studios.name` jest ustawione na nazwę source'a (`PornDish`, `Watch.Porn`). Pipeline per scena: 1. Parse `[Studio]` lub `Studio – ...` z `scene.title` 2. Lookup canonical studio w `studios` table (po slugify name match — czas też prosty substring fallback) 3. Jeśli match → update scene.studio_id (commit per-batch) 4. Po update, scena MAY auto-merge przy następnym ingest run gdy resolver path 4 blocking po studio+date znajdzie kandydatów **Usage:** docker compose exec -T worker python scripts/studio_retrofix.py --dry-run docker compose exec -T worker python scripts/studio_retrofix.py --commit **ETA:** ~10 min dla ~67k scen (porndish 4.5k + xmoviesforyou 1.8k + watchporn 4k + hdporn92 31k = ~41k; reszta to słabo parsujące się). Read-only DB do dry-run. """ from __future__ import annotations import argparse import logging import re import sys from sqlalchemy import text from app.db import session_scope from app.normalize.text import slugify from app.resolve.studio_title_parser import parse_title log = logging.getLogger("studio_retrofix") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") # Sitetagi z parseable title formats. Reszta pornapp scen (xhamster/xvideos/etc.) # nie ma `[Studio]` w tytule. 
# Default value for --sitetags. Each entry is used as the "<sitetag>:" prefix
# when matching scene_external_refs.external_id.
TARGET_SITETAGS = ["porndishcom", "xmoviesforyoucom", "watchporn", "hdporn92com"]

# Number of candidate scenes planned (and, with --commit, written) per transaction.
_BATCH_SIZE = 500

# LIMIT value used when --limit is not given; effectively "no limit".
_NO_LIMIT = 10_000_000

# Trailing site-style suffixes removed from a parsed slug for the fallback lookup.
_SLUG_SUFFIX_RE = re.compile(r"(?:ll?|hd|xxx|com|tv|tube|video|videos)$")

# DISTINCT: a scene with several matching external refs would otherwise be
# returned (and counted / updated) once per ref.
_CANDIDATES_SQL = """
    SELECT DISTINCT sc.id, sc.title, sc.studio_id, sc.release_date,
           st.name AS curr_studio
    FROM scene_external_refs ser
    JOIN scenes sc ON sc.id = ser.scene_id
    JOIN sources s ON s.id = ser.source_id
    LEFT JOIN studios st ON st.id = sc.studio_id
    WHERE s.name = 'tube-scraper'
      AND ser.external_id LIKE :prefix
    ORDER BY sc.id
    LIMIT :lim
"""


def _load_studio_index() -> dict[str, tuple[str, str]]:
    """Build the ``slug -> (studio_id, studio_name)`` lookup from the studios table.

    Rows are read in ``id`` order and the first studio seen for a slug wins, so
    repeated runs (dry-run, then commit) resolve a slug to the same studio.
    Studios whose name slugifies to an empty string are left out, because an
    empty key could otherwise be hit by the suffix-stripping fallback.
    """
    index: dict[str, tuple[str, str]] = {}
    collisions = 0
    with session_scope() as session:
        rows = session.execute(text("SELECT id, name FROM studios ORDER BY id")).all()
    for r in rows:
        slug = slugify(r.name)
        if not slug:
            continue
        if slug in index:
            collisions += 1
            continue
        index[slug] = (str(r.id), r.name)
    if collisions:
        log.warning(" %d studios skipped (slug collides with a lower-id studio)", collisions)
    return index


def _resolve_studio(
    studio_name: str,
    studio_by_slug: dict[str, tuple[str, str]],
) -> tuple[str, str] | None:
    """Return ``(studio_id, studio_name)`` for a parsed studio name, or None.

    Tries an exact slug match first. On a miss, strips one trailing site-style
    suffix (see ``_SLUG_SUFFIX_RE``) and any separator left dangling by that
    strip, then retries. An empty slug is never looked up.
    """
    slug = slugify(studio_name)
    if not slug:
        return None
    exact = studio_by_slug.get(slug)
    if exact is not None:
        return exact
    # "brazzers-hd" -> "brazzers-" -> "brazzers"; a no-op if slugify emits no separators.
    fallback = _SLUG_SUFFIX_RE.sub("", slug).rstrip("-_")
    if fallback and fallback != slug:
        return studio_by_slug.get(fallback)
    return None


def _plan_updates(
    batch: list,
    studio_by_slug: dict[str, tuple[str, str]],
    stats: dict[str, int],
    no_match_studios: dict[str, int],
) -> list[tuple[str, str | None, str | None]]:
    """Decide, for each candidate row, which scene columns need changing.

    Each row must expose ``id``, ``title``, ``studio_id`` and ``release_date``.
    ``stats`` and ``no_match_studios`` are updated in place.

    Returns a list of ``(scene_id, new_studio_id_or_None, new_release_date_or_None)``;
    a ``None`` member means "leave that column alone".
    """
    updates: list[tuple[str, str | None, str | None]] = []
    for row in batch:
        stats["scanned"] += 1
        parsed = parse_title(row.title)
        if parsed.studio is None:
            stats["no_parse"] += 1
            continue

        target = _resolve_studio(parsed.studio, studio_by_slug)
        if target is None:
            stats["studio_no_canonical"] += 1
            no_match_studios[parsed.studio] = no_match_studios.get(parsed.studio, 0) + 1
            continue

        target_id, _target_name = target
        new_studio_id = None if str(row.studio_id) == target_id else target_id
        # Only backfill a date; an existing release_date is never overwritten.
        if parsed.release_date is not None and row.release_date is None:
            new_date = parsed.release_date.isoformat()
        else:
            new_date = None

        if new_studio_id is None and new_date is None:
            stats["already_correct"] += 1
            continue
        if new_studio_id:
            stats["would_fix_studio"] += 1
        if new_date:
            stats["would_fix_date"] += 1
        updates.append((str(row.id), new_studio_id, new_date))
    return updates


def _apply_updates(
    updates: list[tuple[str, str | None, str | None]],
    stats: dict[str, int],
) -> None:
    """Write one batch of planned updates inside a single ``session_scope()`` block.

    The ``fixed_*`` counters are bumped only after the block exits without
    raising, so they never include a batch that failed part-way.
    """
    fixed_studio = 0
    fixed_date = 0
    with session_scope() as session:
        for scene_id, studio_id, rel_date in updates:
            assignments: list[str] = []
            params: dict = {"id": scene_id}
            if studio_id:
                assignments.append("studio_id = :sid")
                params["sid"] = studio_id
                fixed_studio += 1
            if rel_date:
                assignments.append("release_date = :rd")
                params["rd"] = rel_date
                fixed_date += 1
            assignments.append("updated_at = NOW()")
            # The SET clause is assembled from the fixed fragments above only;
            # every value travels as a bound parameter.
            session.execute(
                text(f"UPDATE scenes SET {', '.join(assignments)} WHERE id = :id"),
                params,
            )
    stats["fixed_studio"] += fixed_studio
    stats["fixed_date"] += fixed_date


def main() -> int:
    """CLI entry point: re-point tube-scraper scenes at their canonical studio.

    For every sitetag, loads the candidate scenes, parses the studio (and
    optionally a release date) out of each title, resolves the studio against
    the studios table and, with ``--commit``, updates ``scenes`` in batches.
    Without ``--commit`` only SELECTs are issued and the would-be changes are
    reported.

    Returns:
        0 as the process exit code.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--commit", action="store_true", help="Zapisz zmiany (default: dry-run)")
    ap.add_argument("--limit", type=int, default=None, help="Limit scen (debug)")
    ap.add_argument("--sitetags", nargs="+", default=TARGET_SITETAGS)
    args = ap.parse_args()

    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be >= 0")
    # `is None` rather than `or`, so an explicit `--limit 0` is not read as "no limit".
    limit = _NO_LIMIT if args.limit is None else args.limit

    log.info("loading studios index...")
    studio_by_slug = _load_studio_index()
    log.info(" %d studios loaded", len(studio_by_slug))

    stats = {
        "scanned": 0,
        "no_parse": 0,
        "studio_no_canonical": 0,
        "already_correct": 0,
        "would_fix_studio": 0,
        "would_fix_date": 0,
        "fixed_studio": 0,
        "fixed_date": 0,
    }
    no_match_studios: dict[str, int] = {}  # parsed studio name -> scene count

    for sitetag in args.sitetags:
        log.info("--- %s ---", sitetag)
        with session_scope() as session:
            rows = session.execute(
                text(_CANDIDATES_SQL),
                {"prefix": f"{sitetag}:%", "lim": limit},
            ).all()
        log.info(" candidates: %d scenes", len(rows))

        total_batches = (len(rows) + _BATCH_SIZE - 1) // _BATCH_SIZE
        for start in range(0, len(rows), _BATCH_SIZE):
            batch = rows[start : start + _BATCH_SIZE]
            updates = _plan_updates(batch, studio_by_slug, stats, no_match_studios)
            if args.commit and updates:
                _apply_updates(updates, stats)
                log.info(
                    " batch %d/%d: committed %d updates",
                    start // _BATCH_SIZE + 1,
                    total_batches,
                    len(updates),
                )

    log.info("===== STATS =====")
    for key, value in stats.items():
        log.info(" %s: %d", key, value)

    # Studio names that parsed fine but have no canonical row yet: candidates
    # for adding to the studios table.
    log.info("===== Top 30 parsed studios NOT in canonical (parse OK, lookup miss) =====")
    for studio, n in sorted(no_match_studios.items(), key=lambda item: -item[1])[:30]:
        log.info(" %s: %d scenes", studio, n)

    if args.commit:
        log.info("dry_run=%s. Changes committed.", False)
    else:
        log.info("dry_run=%s. Use --commit to apply.", True)
    return 0


if __name__ == "__main__":
    sys.exit(main())