Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
167 lines
6.9 KiB
Python
167 lines
6.9 KiB
Python
"""Retro-fix studio_id for existing pornapp scenes that come from aggregator-WordPress sources.

Goal: link the orphan factories (porndish, xmoviesforyou, watchporn, hdporn92) to the canonical
scenes in TPDB/StashDB. These tubes carry `[Studio]` or `Studio – Perf – Title` in the title,
but `studios.name` is set to the name of the source (`PornDish`, `Watch.Porn`).

Pipeline per scene:
1. Parse `[Studio]` or `Studio – ...` from `scene.title`
2. Look up the canonical studio in the `studios` table (by slugified-name match, with a
   simple suffix-stripping fallback)
3. If matched → update scene.studio_id, and fill in a missing release_date when the title
   carries one (commit per batch)
4. After the update, the scene MAY auto-merge on the next ingest run, when resolver path 4
   (blocking on studio+date) finds candidates

**Usage:**
    docker compose exec -T worker python scripts/studio_retrofix.py            # dry run (the default)
    docker compose exec -T worker python scripts/studio_retrofix.py --commit

**ETA:** ~10 min for ~67k scenes (porndish 4.5k + xmoviesforyou 1.8k + watchporn 4k +
hdporn92 31k = ~41k; the rest are titles that parse poorly). A dry run only reads from the DB.
"""
|
||
from __future__ import annotations

import argparse
import logging
import re
import sys
from collections import Counter

from sqlalchemy import text

from app.db import session_scope
from app.normalize.text import slugify
from app.resolve.studio_title_parser import parse_title
|
||
|
||
# Script-wide logger; basicConfig installs the root handler and line format.
log = logging.getLogger("studio_retrofix")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

# Sitetags whose titles come in a parseable format. The remaining pornapp scenes
# (xhamster/xvideos/etc.) do not carry `[Studio]` in the title.
TARGET_SITETAGS = ["porndishcom", "xmoviesforyoucom", "watchporn", "hdporn92com"]
|
||
|
||
|
||
# Trailing tokens stripped (once) from a parsed studio slug as a second-chance
# lookup when the exact slug is not a known studio.
# NOTE(review): the `ll?` alternative also strips a bare trailing "l"/"ll" from
# any slug — confirm that is intentional.
_SUFFIX_RE = re.compile(r"(?:ll?|hd|xxx|com|tv|tube|video|videos)$")

# Number of candidate scenes planned (and, with --commit, written) per transaction.
_BATCH_SIZE = 500


def _build_studio_index(rows) -> dict[str, tuple[str, str]]:
    """Map each slugified studio name to ``(studio id as str, studio name)``.

    Rows whose name slugifies to an empty string are skipped: an empty key
    would otherwise be hit by any parsed title whose slug is empty or consists
    only of a strippable suffix, assigning an unrelated studio. On a slug
    collision the later row wins.
    """
    index: dict[str, tuple[str, str]] = {}
    skipped_empty = 0
    collisions = 0
    for row in rows:
        slug = slugify(row.name)
        if not slug:
            skipped_empty += 1
            continue
        if slug in index:
            collisions += 1
        index[slug] = (str(row.id), row.name)
    if skipped_empty or collisions:
        log.warning(
            " studios index: %d empty slugs skipped, %d slug collisions (last row wins)",
            skipped_empty,
            collisions,
        )
    return index


def _lookup_studio(slug: str, index: dict[str, tuple[str, str]]) -> tuple[str, str] | None:
    """Return the index entry for ``slug``, retrying once with a known suffix stripped."""
    if not slug:
        return None
    hit = index.get(slug)
    if hit is not None:
        return hit
    stripped = _SUFFIX_RE.sub("", slug)
    # An empty or unchanged result means there is nothing new to try.
    if not stripped or stripped == slug:
        return None
    return index.get(stripped)


def _plan_batch(
    batch,
    studio_by_slug: dict[str, tuple[str, str]],
    stats: dict[str, int],
    no_match_studios: Counter,
) -> list[tuple[str, str | None, str | None]]:
    """Work out the updates for one batch of candidate rows.

    Mutates ``stats`` and ``no_match_studios`` in place and returns a list of
    ``(scene_id, new_studio_id_or_None, new_release_date_or_None)`` tuples for
    the rows that need a change.
    """
    updates: list[tuple[str, str | None, str | None]] = []
    for row in batch:
        stats["scanned"] += 1
        parsed = parse_title(row.title)
        if parsed.studio is None:
            stats["no_parse"] += 1
            continue

        target = _lookup_studio(slugify(parsed.studio), studio_by_slug)
        if target is None:
            stats["studio_no_canonical"] += 1
            no_match_studios[parsed.studio] += 1
            continue
        target_id = target[0]

        # A NULL studio_id stringifies to "None", which never equals a real id,
        # so scenes without a studio are assigned one as well.
        new_studio_id = None if str(row.studio_id) == target_id else target_id
        # Only fill in a missing release date; an existing one is never overwritten.
        new_date = (
            parsed.release_date.isoformat()
            if parsed.release_date is not None and row.release_date is None
            else None
        )

        if new_studio_id is None and new_date is None:
            stats["already_correct"] += 1
            continue
        if new_studio_id:
            stats["would_fix_studio"] += 1
        if new_date:
            stats["would_fix_date"] += 1
        updates.append((str(row.id), new_studio_id, new_date))
    return updates


def _apply_updates(
    updates: list[tuple[str, str | None, str | None]],
    stats: dict[str, int],
) -> None:
    """Write one batch of planned updates inside a single ``session_scope``."""
    with session_scope() as session:
        for scene_id, studio_id, rel_date in updates:
            # The SET clause is assembled from fixed fragments only; every
            # value travels as a bound parameter.
            sets = []
            params: dict = {"id": scene_id}
            if studio_id:
                sets.append("studio_id = :sid")
                params["sid"] = studio_id
            if rel_date:
                sets.append("release_date = :rd")
                params["rd"] = rel_date
            sets.append("updated_at = NOW()")
            session.execute(
                text(f"UPDATE scenes SET {', '.join(sets)} WHERE id = :id"),
                params,
            )
    # Counted only once the scope has exited without raising, so the totals do
    # not include a batch that failed part-way.
    # NOTE(review): assumes session_scope commits on clean exit — confirm.
    stats["fixed_studio"] += sum(1 for _, sid, _ in updates if sid)
    stats["fixed_date"] += sum(1 for _, _, rd in updates if rd)


def main() -> int:
    """Re-point pornapp scenes at the canonical studio parsed from their title.

    Command-line options:
        --commit     write the changes; without it nothing is written (dry run)
        --dry-run    explicit spelling of the default mode; exclusive with --commit
        --limit N    cap the number of candidate scenes fetched per sitetag
        --sitetags   sitetag prefixes to process (default: TARGET_SITETAGS)

    Returns:
        0 as the process exit code.
    """
    ap = argparse.ArgumentParser()
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--commit", action="store_true", help="Zapisz zmiany (default: dry-run)")
    # Dry run is already the default; the flag is accepted so that an explicit
    # `--dry-run` invocation does not abort with "unrecognized arguments".
    mode.add_argument("--dry-run", action="store_true", help="Do not write anything (the default)")
    ap.add_argument("--limit", type=int, default=None, help="Limit scen (debug)")
    ap.add_argument("--sitetags", nargs="+", default=TARGET_SITETAGS)
    args = ap.parse_args()

    # Load the studios index (slug -> (studio_id, name)) once.
    log.info("loading studios index...")
    with session_scope() as session:
        # ORDER BY keeps slug-collision resolution identical between a dry run
        # and the subsequent --commit run.
        studio_rows = session.execute(text("SELECT id, name FROM studios ORDER BY id")).all()
    studio_by_slug = _build_studio_index(studio_rows)
    log.info(" %d studios loaded", len(studio_by_slug))

    stats = {
        "scanned": 0,
        "no_parse": 0,
        "studio_no_canonical": 0,
        "already_correct": 0,
        "would_fix_studio": 0,
        "would_fix_date": 0,
        "fixed_studio": 0,
        "fixed_date": 0,
    }
    no_match_studios: Counter[str] = Counter()  # parsed studio name -> count

    # Find and process the candidate scenes sitetag by sitetag.
    for sitetag in args.sitetags:
        log.info("--- %s ---", sitetag)
        with session_scope() as session:
            rows = session.execute(
                text("""
                    SELECT sc.id, sc.title, sc.studio_id, sc.release_date, st.name AS curr_studio
                    FROM scene_external_refs ser
                    JOIN scenes sc ON sc.id = ser.scene_id
                    JOIN sources s ON s.id = ser.source_id
                    LEFT JOIN studios st ON st.id = sc.studio_id
                    WHERE s.name = 'pornapp'
                    AND ser.external_id LIKE :prefix
                    ORDER BY sc.id
                    LIMIT :lim
                """),
                # Without --limit (or with --limit 0) a very large cap stands in for "no limit".
                {"prefix": f"{sitetag}:%", "lim": args.limit or 10_000_000},
            ).all()
        log.info(" candidates: %d scenes", len(rows))

        total_batches = (len(rows) + _BATCH_SIZE - 1) // _BATCH_SIZE
        for batch_no, start in enumerate(range(0, len(rows), _BATCH_SIZE), start=1):
            batch = rows[start : start + _BATCH_SIZE]
            updates = _plan_batch(batch, studio_by_slug, stats, no_match_studios)
            if args.commit and updates:
                _apply_updates(updates, stats)
                log.info(" batch %d/%d: committed %d updates",
                         batch_no, total_batches, len(updates))

    log.info("===== STATS =====")
    for key, value in stats.items():
        log.info(" %s: %d", key, value)

    # Top 30 parsed-but-unknown studios: candidates to add to the studios table
    # if TPDB does not end up supplying them.
    log.info("===== Top 30 parsed studios NOT in canonical (parse OK, lookup miss) =====")
    for studio, n in no_match_studios.most_common(30):
        log.info(" %s: %d scenes", studio, n)

    log.info("dry_run=%s. Use --commit to apply.", not args.commit)
    return 0
|
||
|
||
|
||
# Script entry point: run main() and hand its return value to the shell as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|