Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
87 lines
3 KiB
Python
87 lines
3 KiB
Python
"""Repair titles truncated at an internal apostrophe.
|
|
|
|
Bug in the `meta_content` regex (fixed 2026-05-20): `[^"\']*` cut the content
at an internal apostrophe → `She's So Insatiable` → `She`.
|
|
|
|
The script re-fetches the detail page for scenes from
tube:freshpornoorg/porn00org/pornxpph that have a suspiciously short title
and updates the title if og:title (after the fix) is longer.
|
|
|
|
Usage: docker exec goon-worker-1 python -m scripts.repair_truncated_titles [--origin tube:freshpornoorg] [--limit 1000]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import time
|
|
|
|
import sqlalchemy as sa
|
|
|
|
from app.connectors.direct_scrapers._browse_base import meta_content
|
|
from app.db import session_scope
|
|
from app.extractors._fetch import browser_get
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
|
|
def _repair_scene_title(scene_id, old_title: str, page_url: str) -> str:
    """Re-fetch one scene's detail page and store its og:title if it is longer.

    Args:
        scene_id: Value of ``scenes.id`` for the row to repair.
        old_title: Title currently stored for the scene.
        page_url: Detail-page URL taken from ``playback_sources.page_url``.

    Returns:
        The name of the outcome counter the caller should increment:
        ``"updated"``, ``"skipped_404"``, ``"skipped_same"``,
        ``"skipped_http_error"``, ``"skipped_no_title"`` or ``"fetch_failed"``.
    """
    try:
        r = browser_get(page_url, timeout=15.0, follow_redirects=True)
    except Exception as e:
        # Best-effort batch job: one unreachable page must not abort the run,
        # but the failure is reported to the caller so it shows up in the totals.
        log.debug("fetch fail %s: %s", page_url, e)
        return "fetch_failed"

    if r.status_code in (404, 410):
        return "skipped_404"
    if r.status_code >= 400:
        return "skipped_http_error"

    # Strip once, up front, so the length comparison below is made on exactly
    # the value that would be written to the database.
    new_title = (meta_content(r.text, property="og:title") or "").strip()
    if not new_title:
        return "skipped_no_title"
    # Only update if longer (i.e. parser found full title after fix)
    if len(new_title) <= len(old_title):
        return "skipped_same"

    # NOTE(review): title_normalized is set to LOWER(title) here; confirm this
    # matches the normalisation the rest of the application uses.
    with session_scope() as session:
        session.execute(
            sa.text(
                "UPDATE scenes SET title = :t, title_normalized = LOWER(:t) WHERE id = :sid"
            ),
            {"t": new_title, "sid": scene_id},
        )
    return "updated"


def main(argv: list[str] | None = None) -> int:
    """Repair suspiciously short scene titles for one playback origin.

    Selects scenes whose title is shorter than 25 characters and that have a
    live (``dead_at IS NULL``) playback source for ``--origin``, re-fetches
    each detail page, and replaces the stored title when the page's og:title
    is longer. Progress is logged every 25 rows and the loop pauses for
    ``--throttle`` seconds after every row, whatever its outcome.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` once all candidate rows have been processed.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "--origin",
        default="tube:freshpornoorg",
        help="playback_sources.origin value to repair",
    )
    p.add_argument(
        "--limit",
        type=int,
        default=5000,
        help="maximum number of candidate rows to check",
    )
    p.add_argument(
        "--throttle",
        type=float,
        default=0.3,
        help="seconds to sleep after each processed row",
    )
    args = p.parse_args(argv)
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Load the whole candidate list first so no session stays open while the
    # (slow) page fetches run.
    with session_scope() as session:
        rows = session.execute(
            sa.text(
                "SELECT s.id, s.title, ps.page_url "
                "FROM scenes s JOIN playback_sources ps ON ps.scene_id = s.id "
                "WHERE ps.origin = :origin AND ps.dead_at IS NULL "
                "AND LENGTH(s.title) < 25 "
                "LIMIT :limit"
            ),
            {"origin": args.origin, "limit": args.limit},
        ).all()

    log.info("repair titles: %d candidates for %s", len(rows), args.origin)
    counters = {
        "checked": 0,
        "updated": 0,
        "skipped_404": 0,
        "skipped_same": 0,
        "skipped_http_error": 0,
        "skipped_no_title": 0,
        "fetch_failed": 0,
    }
    for i, (scene_id, old_title, page_url) in enumerate(rows, 1):
        counters["checked"] += 1
        counters[_repair_scene_title(scene_id, old_title, page_url)] += 1
        # Progress logging and throttling run for every row, including rows
        # that were skipped or failed, so a streak of dead pages cannot turn
        # into an unthrottled burst of requests.
        if i % 25 == 0:
            log.info("progress %d/%d %s", i, len(rows), counters)
        time.sleep(args.throttle)

    log.info("done: %s", counters)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the script and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|