goon/scripts/bulk_rescrape_hqporner.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

173 lines
6.2 KiB
Python

"""Bulk rescrape hqporner scen: fetch detail → wyciągnij title (z `<title>`),
duration (meta description), thumbnail, and tags (meta description), then update the DB.
Targets: scenes with origin tube:hqpornercom and 0 tags (~26k at the time of writing).
The hqporner scraper yields a thin RawScene (slug-derived title, no detail
enrichment), so a rescrape is the only way to fill in the metadata.
404 (or 410) → mark the playback source dead (`dead_at = now`, `dead_reason = "tube page <status>"`).
Usage: docker exec goon-api-1 python -m scripts.bulk_rescrape_hqporner [--limit=N] [--throttle=0.3]
"""
from __future__ import annotations
import argparse
import logging
import re
import time
import uuid
from datetime import UTC, datetime
import sqlalchemy as sa
from app.db import session_scope
from app.extractors._fetch import browser_get
from app.extractors.duration_extract import extract_duration_sec
from app.extractors.tag_extract import extract_tags
from app.extractors.thumb_extract import extract_thumbnail_url
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, SceneTag
from app.normalize.scenes import NormalizedTag
from app.normalize.text import slugify
from app.resolve.tag_resolver import resolve_tag
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Captures the text between `<title ...>` and `</title>` (tag match is
# case-insensitive); the captured text must be non-empty and contain no `<`.
_TITLE_RE = re.compile(r"<title[^>]*>([^<]+)</title>", re.IGNORECASE)
# Matches a trailing "- HQporner.com" site suffix, including the whitespace
# around the dash and at the end of the string (case-insensitive).
_TITLE_SUFFIX_RE = re.compile(r"\s*-\s*HQporner\.com\s*$", re.IGNORECASE)
def _cleanup_title(html_title: str) -> str:
    """Return the page title without the site suffix and with entities decoded.

    `Mofos - Denise Sky Is Followed Home - HQporner.com` → `Mofos - Denise Sky Is Followed Home`.

    The input is the raw text captured from the `<title>` element, so HTML
    character references (`&amp;`, `&#039;`, ...) are decoded first; otherwise
    they would be stored verbatim in the scene title.

    Args:
        html_title: Raw text content of the page's `<title>` element.

    Returns:
        The cleaned title; may be an empty string if nothing but the suffix
        (or whitespace) was present.
    """
    from html import unescape

    return _TITLE_SUFFIX_RE.sub("", unescape(html_title).strip())
def rescrape_scene(session, scene_id: uuid.UUID, pb_id: uuid.UUID, page_url: str) -> dict:
    """Re-fetch one detail page and sync its metadata into the DB.

    The function only stages changes on ``session``; it never commits, so the
    caller's session scope decides when the changes are persisted.

    Args:
        session: Open DB session used for all reads and writes.
        scene_id: Primary key of the ``Scene`` to enrich.
        pb_id: Primary key of the ``PlaybackSource`` the page belongs to.
        page_url: URL of the tube detail page to fetch.

    Returns:
        Counters for this scene, each 0 or 1 except ``added_tags`` (number of
        tag links inserted): ``updated_title``, ``updated_dur``,
        ``updated_thumb``, ``added_tags``, ``marked_dead``. A failed fetch or a
        non-404/410 HTTP error returns all zeros.
    """
    out = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0}
    # Slug-derived (thin) titles are recognised by a run of three spaces.
    # Written as a product so the marker cannot be silently collapsed to a
    # single space, which would match almost every title and defeat the
    # "do not overwrite a longer title" guard below.
    slug_gap = " " * 3

    try:
        r = browser_get(page_url, timeout=15.0, follow_redirects=True)
    except Exception as e:
        # Best effort: a fetch failure leaves the scene untouched so a later
        # run can retry it.
        log.debug("fetch fail %s: %s", page_url, e)
        return out

    if r.status_code in (404, 410):
        # The page is gone: mark the playback source dead.
        session.execute(
            sa.update(PlaybackSource)
            .where(PlaybackSource.id == pb_id)
            .values(
                dead_at=datetime.now(UTC),
                dead_reason=f"tube page {r.status_code}",
            )
        )
        out["marked_dead"] = 1
        return out
    if r.status_code >= 400:
        # Other client/server errors are treated as transient: change nothing.
        return out

    html = r.text

    new_title = ""
    if (tm := _TITLE_RE.search(html)):
        new_title = _cleanup_title(tm.group(1))
    dur = extract_duration_sec(html)

    # Load the scene once, and only if there is something to apply to it.
    scene = session.get(Scene, scene_id) if (new_title or dur) else None

    # Title: replace when the current one is slug-derived (contains the
    # triple-space marker). Otherwise only replace it with a longer title, so
    # an existing, richer title is never overwritten by a shorter one.
    if scene is not None and new_title:
        current_title = scene.title or ""
        if slug_gap in current_title or len(new_title) > len(current_title):
            scene.title = new_title
            scene.title_normalized = new_title.lower()
            out["updated_title"] = 1

    # Duration: only fill in when the scene has none yet.
    if scene is not None and dur and not scene.duration_sec:
        scene.duration_sec = dur
        out["updated_dur"] = 1

    # Thumbnail: only fill in when the playback source has none yet.
    thumb = extract_thumbnail_url(html)
    if thumb:
        pb = session.get(PlaybackSource, pb_id)
        if pb and not pb.thumbnail_url:
            pb.thumbnail_url = thumb
            out["updated_thumb"] = 1

    # Tags: link every resolved tag that is not attached to the scene yet.
    tag_names = extract_tags("hqpornercom", html)
    if tag_names:
        existing_tag_ids = set(session.execute(
            sa.text("SELECT tag_id FROM scene_tags WHERE scene_id = :sid"),
            {"sid": scene_id},
        ).scalars().all())
        for name in tag_names:
            slug = slugify(name)
            norm_tag = NormalizedTag(
                external_id=f"hqpornercom:tag:{slug}",
                name=name,
                slug=slug,
            )
            tag = resolve_tag(session, norm=norm_tag)
            if tag is None or tag.id in existing_tag_ids:
                continue
            session.execute(
                sa.text(
                    "INSERT INTO scene_tags(scene_id, tag_id) VALUES(:sid, :tid) "
                    "ON CONFLICT DO NOTHING"
                ),
                {"sid": scene_id, "tid": tag.id},
            )
            # Track the new link so a tag name repeated on the page (or two
            # names resolving to the same tag) is counted only once.
            existing_tag_ids.add(tag.id)
            out["added_tags"] += 1
    return out
def main(argv: list[str] | None = None) -> int:
    """Run the bulk rescrape over all live hqporner scenes that have no tags.

    Selects up to ``--limit`` (scene, playback source, page URL) rows, then
    processes each one in its own session scope, sleeping ``--throttle``
    seconds between requests. A failure on one scene is logged and does not
    stop the run.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=99999)
    parser.add_argument("--throttle", type=float, default=0.3)
    args = parser.parse_args(argv)
    # time.sleep() raises ValueError on a negative delay; treat it as "no delay".
    throttle = max(0.0, args.throttle)
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Materialise the work list up front so the read session is closed before
    # the (slow, throttled) per-scene loop starts.
    with session_scope() as session:
        rows = session.execute(
            sa.text(
                "SELECT s.id, ps.id AS pb_id, ps.page_url "
                "FROM scenes s "
                "JOIN playback_sources ps ON ps.scene_id = s.id "
                "WHERE ps.origin = 'tube:hqpornercom' "
                "AND ps.dead_at IS NULL "
                "AND (SELECT COUNT(*) FROM scene_tags WHERE scene_id = s.id) = 0 "
                "LIMIT :limit"
            ),
            {"limit": args.limit},
        ).all()
    total = len(rows)
    log.info("hqporner bulk rescrape: %d scenes targeted", total)

    counters = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0, "checked": 0}
    for i, (scene_id, pb_id, page_url) in enumerate(rows, 1):
        counters["checked"] += 1
        # The try wraps the whole session scope (not just its body) so that:
        #  - an exception reaches the context manager, letting it discard the
        #    half-applied scene instead of exiting "cleanly"
        #    (NOTE(review): presumably session_scope commits on clean exit and
        #    rolls back on error — confirm in app.db);
        #  - an error raised while leaving the scope is also caught here
        #    rather than aborting the whole bulk run.
        try:
            with session_scope() as session:
                result = rescrape_scene(session, scene_id, pb_id, page_url)
        except Exception as e:
            log.warning("scene %s: %s", scene_id, e)
        else:
            # Count only work whose session scope exited without error.
            for k, v in result.items():
                counters[k] += v
        if i % 50 == 0:
            log.info("progress %d/%d counters=%s", i, total, counters)
        time.sleep(throttle)
    log.info("done: %s", counters)
    return 0
if __name__ == "__main__":
raise SystemExit(main())