"""Bulk rescrape hqporner scen: fetch detail → wyciągnij title (z ``), duration (meta description), thumb, tagi (meta description). Update DB. Targets: scenes z origin tube:hqpornercom + 0 tagów (~26k aktualnie). Hqporner scraper yieldsuje thin RawScene (slug-derived title, brak detail enrichment) — rescrape jest jedynym sposobem na fill metadata. 404 → mark playback dead (`dead_at = now`, `dead_reason = "tube page 404"`). Uruchomienie: docker exec goon-api-1 python -m scripts.bulk_rescrape_hqporner [--limit=N] [--throttle=0.3] """ from __future__ import annotations import argparse import logging import re import time import uuid from datetime import UTC, datetime import sqlalchemy as sa from app.db import session_scope from app.extractors._fetch import browser_get from app.extractors.duration_extract import extract_duration_sec from app.extractors.tag_extract import extract_tags from app.extractors.thumb_extract import extract_thumbnail_url from app.models.playback_source import PlaybackSource from app.models.scene import Scene, SceneTag from app.normalize.scenes import NormalizedTag from app.normalize.text import slugify from app.resolve.tag_resolver import resolve_tag log = logging.getLogger(__name__) _TITLE_RE = re.compile(r"<title[^>]*>([^<]+)", re.IGNORECASE) _TITLE_SUFFIX_RE = re.compile(r"\s*-\s*HQporner\.com\s*$", re.IGNORECASE) def _cleanup_title(html_title: str) -> str: """`Mofos - Denise Sky Is Followed Home - HQporner.com` → `Mofos - Denise Sky Is Followed Home`.""" return _TITLE_SUFFIX_RE.sub("", html_title.strip()) def rescrape_scene(session, scene_id: uuid.UUID, pb_id: uuid.UUID, page_url: str) -> dict: """Re-fetch detail page + sync metadata do DB. Zwraca counters.""" out = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0} try: r = browser_get(page_url, timeout=15.0, follow_redirects=True) except Exception as e: log.debug("fetch fail %s: %s", page_url, e) return out if r.status_code == 404 or r.status_code == 410: # Mark playback dead. session.execute( sa.update(PlaybackSource) .where(PlaybackSource.id == pb_id) .values( dead_at=datetime.now(UTC), dead_reason=f"tube page {r.status_code}", ) ) out["marked_dead"] = 1 return out if r.status_code >= 400: return out html = r.text # Title — jeśli current ma triple space (slug-derived), zastąp z . if (tm := _TITLE_RE.search(html)): new_title = _cleanup_title(tm.group(1)) if new_title: scene = session.get(Scene, scene_id) if scene and (" " in scene.title or scene.title.strip() != new_title): # Update jeśli current jest "ucięty" (triple space) lub innego brzmienia. # Bezpieczna heurystyka: nie nadpisuj jeśli current jest dłuższy. if " " in scene.title or len(new_title) > len(scene.title): scene.title = new_title scene.title_normalized = new_title.lower() out["updated_title"] = 1 # Duration dur = extract_duration_sec(html) if dur: scene = session.get(Scene, scene_id) if scene and not scene.duration_sec: scene.duration_sec = dur out["updated_dur"] = 1 # Thumb thumb = extract_thumbnail_url(html) if thumb: pb = session.get(PlaybackSource, pb_id) if pb and not pb.thumbnail_url: pb.thumbnail_url = thumb out["updated_thumb"] = 1 # Tags tag_names = extract_tags("hqpornercom", html) if tag_names: existing_tag_ids = set(session.execute( sa.text("SELECT tag_id FROM scene_tags WHERE scene_id = :sid"), {"sid": scene_id}, ).scalars().all()) for name in tag_names: norm_tag = NormalizedTag( external_id=f"hqpornercom:tag:{slugify(name)}", name=name, slug=slugify(name), ) tag = resolve_tag(session, norm=norm_tag) if tag is None or tag.id in existing_tag_ids: continue session.execute( sa.text( "INSERT INTO scene_tags(scene_id, tag_id) VALUES(:sid, :tid) " "ON CONFLICT DO NOTHING" ), {"sid": scene_id, "tid": tag.id}, ) existing_tag_ids.add(tag.id) out["added_tags"] += 1 return out def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--limit", type=int, default=99999) parser.add_argument("--throttle", type=float, default=0.3) args = parser.parse_args() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") with session_scope() as session: rows = session.execute( sa.text( "SELECT s.id, ps.id AS pb_id, ps.page_url " "FROM scenes s " "JOIN playback_sources ps ON ps.scene_id = s.id " "WHERE ps.origin = 'tube:hqpornercom' " "AND ps.dead_at IS NULL " "AND (SELECT COUNT(*) FROM scene_tags WHERE scene_id = s.id) = 0 " "LIMIT :limit" ), {"limit": args.limit}, ).all() total = len(rows) log.info("hqporner bulk rescrape: %d scenes targeted", total) counters = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0, "checked": 0} for i, (scene_id, pb_id, page_url) in enumerate(rows, 1): counters["checked"] += 1 with session_scope() as session: try: r = rescrape_scene(session, scene_id, pb_id, page_url) for k, v in r.items(): counters[k] += v except Exception as e: log.warning("scene %s: %s", scene_id, e) if i % 50 == 0: log.info("progress %d/%d counters=%s", i, total, counters) time.sleep(args.throttle) log.info("done: %s", counters) return 0 if __name__ == "__main__": raise SystemExit(main())