goon/scripts/repair_truncated_titles.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

87 lines
3 KiB
Python

"""Repair tytułów uciętych po wewnętrznym apostrofie.
Bug w `meta_content` regex (fixed 2026-05-20): `[^"\']*` tnął content po
wewnętrznym apostrofie → `She's So Insatiable` → `She`.
Skrypt re-fetches detail page dla scen z tube:freshpornoorg/porn00org/pornxpph
z podejrzanie krótkim tytułem i updateuje jeśli og:title (po fix) jest dłuższy.
Uruchomienie: docker exec goon-worker-1 python -m scripts.repair_truncated_titles [--origin tube:freshpornoorg] [--limit 1000]
"""
from __future__ import annotations
import argparse
import logging
import time
import sqlalchemy as sa
from app.connectors.direct_scrapers._browse_base import meta_content
from app.db import session_scope
from app.extractors._fetch import browser_get
# Module-level logger; records propagate to the root logger, whose level and
# format are configured by logging.basicConfig() inside main().
log = logging.getLogger(__name__)
def main() -> int:
    """Re-fetch detail pages for short-titled scenes and repair their titles.

    Command-line options:
        --origin    value matched against ``playback_sources.origin``
                    (default ``tube:freshpornoorg``).
        --limit     maximum number of candidate rows to load (default 5000).
        --throttle  seconds to sleep after every processed row (default 0.3).

    Candidates are scenes whose title is shorter than 25 characters and that
    have a non-dead playback source for the given origin. For each candidate
    the page is fetched, its ``og:title`` is extracted, and the scene title is
    overwritten only when the extracted title is strictly longer than the
    stored one.

    Returns:
        0 always; per-row failures are counted and logged, not raised.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--origin", default="tube:freshpornoorg")
    parser.add_argument("--limit", type=int, default=5000)
    parser.add_argument("--throttle", type=float, default=0.3)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Load all candidates up front so the read session is closed before the
    # (slow) network loop starts.
    with session_scope() as session:
        rows = session.execute(
            sa.text(
                "SELECT s.id, s.title, ps.page_url "
                "FROM scenes s JOIN playback_sources ps ON ps.scene_id = s.id "
                "WHERE ps.origin = :origin AND ps.dead_at IS NULL "
                "AND LENGTH(s.title) < 25 "
                "LIMIT :limit"
            ),
            {"origin": args.origin, "limit": args.limit},
        ).all()
    log.info("repair titles: %d candidates for %s", len(rows), args.origin)

    # Every row ends in exactly one outcome bucket, so the buckets other than
    # "checked" always sum to "checked".
    counters = {
        "checked": 0,
        "updated": 0,
        "skipped_404": 0,
        "skipped_same": 0,
        "skipped_http_error": 0,
        "skipped_no_title": 0,
        "fetch_failed": 0,
    }
    for i, (scene_id, old_title, page_url) in enumerate(rows, 1):
        counters["checked"] += 1
        counters[_repair_one(scene_id, old_title, page_url)] += 1
        # Progress logging and throttling happen for EVERY row, including the
        # skipped/failed ones: a host that is answering with errors is exactly
        # the one that must not be hit in a tight loop.
        if i % 25 == 0:
            log.info("progress %d/%d %s", i, len(rows), counters)
        time.sleep(args.throttle)
    log.info("done: %s", counters)
    return 0


def _repair_one(scene_id, old_title: str, page_url: str) -> str:
    """Process one candidate row and return the name of its outcome counter.

    Fetches ``page_url``, extracts ``og:title`` and, if the stripped title is
    strictly longer than ``old_title``, writes it to the scene row.

    Returns one of: ``"fetch_failed"``, ``"skipped_404"``,
    ``"skipped_http_error"``, ``"skipped_no_title"``, ``"skipped_same"``,
    ``"updated"``.
    """
    try:
        response = browser_get(page_url, timeout=15.0, follow_redirects=True)
    except Exception as e:
        # Deliberately broad: this is a best-effort batch job and one bad page
        # must not abort the run. Logged at WARNING because basicConfig in
        # main() sets INFO, which would hide a DEBUG record entirely.
        log.warning("fetch fail %s: %s", page_url, e)
        return "fetch_failed"

    if response.status_code in (404, 410):
        return "skipped_404"
    if response.status_code >= 400:
        return "skipped_http_error"

    # Strip BEFORE comparing lengths so that surrounding whitespace alone can
    # never make a title look "longer" than the stored one.
    # NOTE(review): assumes meta_content returns str or a falsy value — confirm.
    new_title = (meta_content(response.text, property="og:title") or "").strip()
    if not new_title:
        return "skipped_no_title"

    # Only update if longer (i.e. the fixed parser recovered the full title).
    if len(new_title) <= len(old_title):
        return "skipped_same"

    with session_scope() as session:
        session.execute(
            sa.text(
                "UPDATE scenes SET title = :t, title_normalized = LOWER(:t) WHERE id = :sid"
            ),
            {"t": new_title, "sid": scene_id},
        )
    return "updated"
if __name__ == "__main__":
raise SystemExit(main())