goon/scripts/studio_retrofix.py
jtrzupek a196fcbcdb refactor(ingest): rename scraper Source name "pornapp" -> "tube-scraper"
The umbrella Source.name for all direct tube scrapers (deep-crawl, browse-latest,
performer-driven) was "pornapp" — a misleading leftover from the removed external
porn-app API. It read like a dependency on a third-party "pornapp" service; it is
not — these are our own scrapers hitting 25+ tubes directly (kind=scraper,
origin tube:<sitetag>). Renamed to "tube-scraper" via a single SCRAPER_SOURCE_NAME
constant; DB row renamed in place (UPDATE name, same id) so all ingest_runs +
external_records history stays linked. No behavior change — external_id keying
(sitetag:url) and dedup are unaffected.

NOTE: playback_sources.origin "pornapp:<sitetag>" prefix is a separate legacy
format (resolve_playback parses it) and is intentionally left untouched.

Verified on prod: row renamed (0 stray "pornapp"), new runs land on "tube-scraper".

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 16:54:55 +02:00

167 lines
6.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Retro-fix studio_id for existing pornapp scenes from aggregator-WordPress sources.

Goal: glue orphan factories (porndish, xmoviesforyou, watchporn, hdporn92) to the
canonical scenes in TPDB/StashDB. These tubes carry `[Studio]` or `Studio Perf Title`
in the title, but `studios.name` is set to the name of the source (`PornDish`,
`Watch.Porn`).

Pipeline per scene:
1. Parse `[Studio]` or `Studio ...` from `scene.title`
2. Look up the canonical studio in the `studios` table (by slugified-name match,
   plus a simple fallback that strips a trailing suffix such as `hd`/`xxx`/`com`)
3. On a match → update scene.studio_id (commit per batch)
4. After the update, the scene MAY auto-merge on the next ingest run, when resolver
   path 4 (blocking on studio+date) finds candidates

**Usage:**
    docker compose exec -T worker python scripts/studio_retrofix.py           # dry-run (default)
    docker compose exec -T worker python scripts/studio_retrofix.py --commit

**ETA:** ~10 min for ~67k scenes (porndish 4.5k + xmoviesforyou 1.8k + watchporn 4k +
hdporn92 31k = ~41k; the rest are the ones that parse poorly). A dry run only reads
from the DB.
"""
from __future__ import annotations
import argparse
import logging
import re
import sys
from sqlalchemy import text
from app.db import session_scope
from app.normalize.text import slugify
from app.resolve.studio_title_parser import parse_title
# This is a standalone script, so the root logger is configured right at import
# time: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("studio_retrofix")

# Sitetags whose titles follow a parseable format. The remaining pornapp scenes
# (xhamster/xvideos/etc.) carry no `[Studio]` marker in the title.
TARGET_SITETAGS = [
    "porndishcom",
    "xmoviesforyoucom",
    "watchporn",
    "hdporn92com",
]
# Trailing tokens stripped from a parsed studio slug for the fallback lookup.
# Compiled once at import time instead of on every lookup miss.
_SLUG_SUFFIX_RE = re.compile(r'(?:ll?|hd|xxx|com|tv|tube|video|videos)$')

# Number of candidate rows handled per write transaction.
_BATCH_SIZE = 500


def _load_studio_index() -> dict[str, tuple[str, str]]:
    """Build the ``slug -> (studio_id, name)`` lookup from the ``studios`` table.

    When two studio names slugify to the same value, the row read last wins.
    """
    with session_scope() as session:
        rows = session.execute(text("SELECT id, name FROM studios")).all()
    index: dict[str, tuple[str, str]] = {}
    for r in rows:
        slug = slugify(r.name)
        if not slug:
            # An empty key would match every parsed name that slugifies (or
            # suffix-strips) down to nothing, so it is left out of the index.
            continue
        index[slug] = (str(r.id), r.name)
    return index


def _resolve_canonical_studio(
    parsed_slug: str,
    studio_by_slug: dict[str, tuple[str, str]],
) -> tuple[str, str] | None:
    """Return ``(studio_id, name)`` for *parsed_slug*, or ``None`` on a miss.

    Tries an exact slug match first, then one retry with a known trailing
    suffix (``hd``, ``xxx``, ``com``, ...) removed.
    """
    if not parsed_slug:
        return None
    exact = studio_by_slug.get(parsed_slug)
    if exact:
        return exact
    # NOTE(review): slugify's separator is not visible from this script. If it
    # emits "-" or "_", stripping the suffix leaves a dangling separator that
    # could never match; trimming it is a no-op when slugs have no separators.
    fallback = _SLUG_SUFFIX_RE.sub("", parsed_slug).rstrip("-_")
    if not fallback or fallback == parsed_slug:
        # Nothing left (the slug was only a suffix token) or nothing stripped.
        return None
    return studio_by_slug.get(fallback)


def _apply_updates(
    updates: list[tuple[str, str | None, str | None]],
    stats: dict[str, int],
) -> None:
    """Write one batch of ``(scene_id, studio_id, release_date)`` updates.

    A ``None`` studio_id / release_date leaves that column untouched. The
    ``fixed_*`` counters in *stats* are bumped per written column.
    """
    # presumably session_scope() commits on clean exit, giving one transaction
    # per batch — confirm against app.db.
    with session_scope() as session:
        for scene_id, studio_id, rel_date in updates:
            sets = []
            params: dict = {"id": scene_id}
            if studio_id:
                sets.append("studio_id = :sid")
                params["sid"] = studio_id
                stats["fixed_studio"] += 1
            if rel_date:
                sets.append("release_date = :rd")
                params["rd"] = rel_date
                stats["fixed_date"] += 1
            sets.append("updated_at = NOW()")
            # Only fixed column fragments are interpolated; all values are bound.
            session.execute(
                text(f"UPDATE scenes SET {', '.join(sets)} WHERE id = :id"),
                params,
            )


def main() -> int:
    """Re-point tube-scraper scenes at the canonical studio parsed from their title.

    For every sitetag, loads the scenes linked through ``scene_external_refs``
    to the ``tube-scraper`` source, parses a studio (and release date) out of
    the title, and resolves the studio against the ``studios`` table. Scenes
    whose ``studio_id`` differs, or whose ``release_date`` is NULL while the
    title yields one, are updated when ``--commit`` is given; otherwise the
    run only reports what it would change.

    CLI flags: ``--commit`` (write changes), ``--dry-run`` (explicit form of
    the default), ``--limit N`` (row cap per sitetag), ``--sitetags ...``.

    Returns:
        Process exit status; always 0 when the run completes.
    """
    ap = argparse.ArgumentParser()
    # --dry-run is accepted as an explicit spelling of the default mode so the
    # usage shown in the module docstring works; it cannot be mixed with --commit.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--commit", action="store_true", help="Zapisz zmiany (default: dry-run)")
    mode.add_argument("--dry-run", action="store_true",
                      help="Report only, write nothing (this is the default)")
    ap.add_argument("--limit", type=int, default=None, help="Limit scen (debug)")
    ap.add_argument("--sitetags", nargs="+", default=TARGET_SITETAGS)
    args = ap.parse_args()

    # Load the studios index (slug -> studio_id, name) once.
    log.info("loading studios index...")
    studio_by_slug = _load_studio_index()
    log.info(" %d studios loaded", len(studio_by_slug))

    stats = {
        "scanned": 0,
        "no_parse": 0,
        "studio_no_canonical": 0,
        "already_correct": 0,
        "would_fix_studio": 0,
        "would_fix_date": 0,
        "fixed_studio": 0,
        "fixed_date": 0,
    }
    no_match_studios: dict[str, int] = {}  # parsed studio name -> count
    # The candidate query selects from scene_external_refs, so a scene can come
    # back more than once if it has several matching refs (or is listed under
    # several sitetags). Each scene is counted and updated only once.
    seen_scene_ids: set[str] = set()

    for sitetag in args.sitetags:
        log.info("--- %s ---", sitetag)
        with session_scope() as session:
            rows = session.execute(
                text("""
                    SELECT sc.id, sc.title, sc.studio_id, sc.release_date, st.name AS curr_studio
                    FROM scene_external_refs ser
                    JOIN scenes sc ON sc.id = ser.scene_id
                    JOIN sources s ON s.id = ser.source_id
                    LEFT JOIN studios st ON st.id = sc.studio_id
                    WHERE s.name = 'tube-scraper'
                    AND ser.external_id LIKE :prefix
                    ORDER BY sc.id
                    LIMIT :lim
                """),
                {"prefix": f"{sitetag}:%", "lim": args.limit or 10_000_000},
            ).all()
        log.info(" candidates: %d scenes", len(rows))

        total_batches = (len(rows) + _BATCH_SIZE - 1) // _BATCH_SIZE
        for batch_no, start in enumerate(range(0, len(rows), _BATCH_SIZE), start=1):
            batch = rows[start : start + _BATCH_SIZE]
            # (scene_id, studio_id_or_None, release_date_or_None)
            updates: list[tuple[str, str | None, str | None]] = []
            for row in batch:
                scene_id = str(row.id)
                if scene_id in seen_scene_ids:
                    continue
                seen_scene_ids.add(scene_id)
                stats["scanned"] += 1

                # A missing/empty title cannot carry a studio; count it as
                # unparseable instead of handing None to parse_title.
                parsed = parse_title(row.title) if row.title else None
                if parsed is None or parsed.studio is None:
                    stats["no_parse"] += 1
                    continue

                target = _resolve_canonical_studio(slugify(parsed.studio), studio_by_slug)
                if target is None:
                    stats["studio_no_canonical"] += 1
                    no_match_studios[parsed.studio] = no_match_studios.get(parsed.studio, 0) + 1
                    continue

                target_id = target[0]
                new_studio_id = None if str(row.studio_id) == target_id else target_id
                # Only fill in a date the scene does not have yet; an existing
                # release_date is never overwritten.
                new_date = (
                    parsed.release_date.isoformat()
                    if parsed.release_date is not None and row.release_date is None
                    else None
                )
                if new_studio_id is None and new_date is None:
                    stats["already_correct"] += 1
                    continue
                if new_studio_id:
                    stats["would_fix_studio"] += 1
                if new_date:
                    stats["would_fix_date"] += 1
                updates.append((scene_id, new_studio_id, new_date))

            if args.commit and updates:
                _apply_updates(updates, stats)
                log.info(" batch %d/%d: committed %d updates",
                         batch_no, total_batches, len(updates))

    log.info("===== STATS =====")
    for k, v in stats.items():
        log.info(" %s: %d", k, v)
    # Top 30 studios that parsed fine but have no canonical row (candidates to
    # add to the studios table if TPDB does not supply them).
    log.info("===== Top 30 parsed studios NOT in canonical (parse OK, lookup miss) =====")
    for studio, n in sorted(no_match_studios.items(), key=lambda x: -x[1])[:30]:
        log.info(" %s: %d scenes", studio, n)
    log.info("dry_run=%s. Use --commit to apply.", not args.commit)
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())