Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
173 lines
6.2 KiB
Python
173 lines
6.2 KiB
Python
"""Bulk rescrape hqporner scen: fetch detail → wyciągnij title (z `<title>`),
|
|
duration (meta description), thumb, tagi (meta description). Update DB.
|
|
|
|
Targets: scenes z origin tube:hqpornercom + 0 tagów (~26k aktualnie). Hqporner
|
|
scraper yieldsuje thin RawScene (slug-derived title, brak detail enrichment) —
|
|
rescrape jest jedynym sposobem na fill metadata.
|
|
|
|
404 → mark playback dead (`dead_at = now`, `dead_reason = "tube page 404"`).
|
|
|
|
Uruchomienie: docker exec goon-api-1 python -m scripts.bulk_rescrape_hqporner [--limit=N] [--throttle=0.3]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import re
|
|
import time
|
|
import uuid
|
|
from datetime import UTC, datetime
|
|
|
|
import sqlalchemy as sa
|
|
|
|
from app.db import session_scope
|
|
from app.extractors._fetch import browser_get
|
|
from app.extractors.duration_extract import extract_duration_sec
|
|
from app.extractors.tag_extract import extract_tags
|
|
from app.extractors.thumb_extract import extract_thumbnail_url
|
|
from app.models.playback_source import PlaybackSource
|
|
from app.models.scene import Scene, SceneTag
|
|
from app.normalize.scenes import NormalizedTag
|
|
from app.normalize.text import slugify
|
|
from app.resolve.tag_resolver import resolve_tag
|
|
|
|
log = logging.getLogger(__name__)

# Captures the text content of the first <title> element in a page.
_TITLE_RE = re.compile(r"<title[^>]*>([^<]+)</title>", re.IGNORECASE)
# Matches the trailing " - HQporner.com" site suffix (case-insensitive).
_TITLE_SUFFIX_RE = re.compile(r"\s*-\s*HQporner\.com\s*$", re.IGNORECASE)


def _cleanup_title(html_title: str) -> str:
    """Return the text of a page ``<title>`` without the site suffix.

    ``Studio - Some Scene Name - HQporner.com`` becomes
    ``Studio - Some Scene Name``.

    The input is raw HTML character data, so character references such as
    ``&amp;`` or ``&#39;`` are decoded first; otherwise they would end up
    verbatim in the stored title.

    Args:
        html_title: Raw text captured from between ``<title>`` tags.

    Returns:
        The decoded title, stripped of surrounding whitespace and of a
        trailing ``- HQporner.com`` suffix.
    """
    # Function-scope import keeps this block self-contained (no change to the
    # module's import section is needed).
    from html import unescape

    return _TITLE_SUFFIX_RE.sub("", unescape(html_title).strip())
|
|
|
|
|
|
def _should_replace_title(current: str | None, candidate: str) -> bool:
    """Return True when ``candidate`` should overwrite the stored title."""
    current = current or ""
    # Slug-derived titles are recognised by a run of three spaces. The marker is
    # spelled as a repetition so that whitespace-normalising tools cannot
    # silently shrink it to a single space (which would match nearly any title).
    slug_gap = " " * 3
    # Safe heuristic: never replace a clean title with a shorter or equal one.
    # A strictly longer candidate necessarily differs from the stored title, so
    # no separate inequality check is needed.
    return slug_gap in current or len(candidate) > len(current)


def rescrape_scene(session, scene_id: uuid.UUID, pb_id: uuid.UUID, page_url: str) -> dict:
    """Re-fetch a scene's detail page and sync its metadata into the DB.

    Nothing is committed here; all writes go through ``session`` and are left
    to the caller's transaction handling.

    Args:
        session: Database session used for every read and write.
        scene_id: Primary key of the ``Scene`` to enrich.
        pb_id: Primary key of the ``PlaybackSource`` the page belongs to.
        page_url: URL of the detail page to fetch.

    Returns:
        Counter dict with the keys ``updated_title``, ``updated_dur``,
        ``updated_thumb``, ``added_tags`` and ``marked_dead``. All are 0 or 1
        except ``added_tags``, which counts newly linked tags. A failed fetch
        or a non-404/410 error status yields all zeros.
    """
    out = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0}
    try:
        r = browser_get(page_url, timeout=15.0, follow_redirects=True)
    except Exception as e:
        # Deliberate best effort: a fetch failure skips this scene only.
        log.debug("fetch fail %s: %s", page_url, e)
        return out

    if r.status_code in (404, 410):
        # The page is gone: mark the playback source dead and stop.
        session.execute(
            sa.update(PlaybackSource)
            .where(PlaybackSource.id == pb_id)
            .values(
                dead_at=datetime.now(UTC),
                dead_reason=f"tube page {r.status_code}",
            )
        )
        out["marked_dead"] = 1
        return out

    if r.status_code >= 400:
        # Any other error status: leave the rows untouched.
        return out

    html = r.text
    # Loaded once and shared by the title, duration and tag steps below.
    scene = session.get(Scene, scene_id)

    # Title: replace a slug-derived (triple-space) or shorter stored title with
    # the cleaned-up <title> text.
    if scene is not None and (tm := _TITLE_RE.search(html)):
        new_title = _cleanup_title(tm.group(1))
        if new_title and _should_replace_title(scene.title, new_title):
            scene.title = new_title
            scene.title_normalized = new_title.lower()
            out["updated_title"] = 1

    # Duration: only fill in a missing value, never overwrite an existing one.
    dur = extract_duration_sec(html)
    if dur and scene is not None and not scene.duration_sec:
        scene.duration_sec = dur
        out["updated_dur"] = 1

    # Thumbnail: only fill in a missing value on the playback source.
    thumb = extract_thumbnail_url(html)
    if thumb:
        pb = session.get(PlaybackSource, pb_id)
        if pb and not pb.thumbnail_url:
            pb.thumbnail_url = thumb
            out["updated_thumb"] = 1

    # Tags: link every extracted tag that is not linked to the scene yet.
    # Skipped when the scene row no longer exists, since there is nothing to
    # attach the tags to.
    tag_names = extract_tags("hqpornercom", html) if scene is not None else None
    if tag_names:
        existing_tag_ids = set(
            session.execute(
                sa.text("SELECT tag_id FROM scene_tags WHERE scene_id = :sid"),
                {"sid": scene_id},
            ).scalars().all()
        )
        for name in tag_names:
            slug = slugify(name)
            norm_tag = NormalizedTag(
                external_id=f"hqpornercom:tag:{slug}",
                name=name,
                slug=slug,
            )
            tag = resolve_tag(session, norm=norm_tag)
            if tag is None or tag.id in existing_tag_ids:
                continue
            session.execute(
                sa.text(
                    "INSERT INTO scene_tags(scene_id, tag_id) VALUES(:sid, :tid) "
                    "ON CONFLICT DO NOTHING"
                ),
                {"sid": scene_id, "tid": tag.id},
            )
            # Track the id so a tag resolved twice on one page is counted once.
            existing_tag_ids.add(tag.id)
            out["added_tags"] += 1

    return out
|
|
|
|
|
|
def main() -> int:
    """Run the bulk rescrape from the command line.

    Selects the live ``tube:hqpornercom`` playback sources whose scene has no
    tags, rescrapes each one in its own session scope, and logs running
    counters every 50 scenes.

    Command-line options:
        --limit: Maximum number of scenes to process (default 99999).
        --throttle: Seconds to sleep after each scene (default 0.3).

    Returns:
        Process exit code; always 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=99999)
    parser.add_argument("--throttle", type=float, default=0.3)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # The target list is read in a short scope of its own, so that scope is
    # not held open for the duration of the whole run.
    with session_scope() as session:
        rows = session.execute(
            sa.text(
                "SELECT s.id, ps.id AS pb_id, ps.page_url "
                "FROM scenes s "
                "JOIN playback_sources ps ON ps.scene_id = s.id "
                "WHERE ps.origin = 'tube:hqpornercom' "
                "AND ps.dead_at IS NULL "
                "AND (SELECT COUNT(*) FROM scene_tags WHERE scene_id = s.id) = 0 "
                "LIMIT :limit"
            ),
            {"limit": args.limit},
        ).all()

    total = len(rows)
    log.info("hqporner bulk rescrape: %d scenes targeted", total)
    counters = {"updated_title": 0, "updated_dur": 0, "updated_thumb": 0, "added_tags": 0, "marked_dead": 0, "checked": 0}
    # time.sleep() raises ValueError on negative values; treat them as "no pause".
    pause = max(0.0, args.throttle)

    for i, (scene_id, pb_id, page_url) in enumerate(rows, 1):
        counters["checked"] += 1
        try:
            # The try wraps the scope (not the other way round) so the scope
            # sees the failure, and so an error raised while leaving the scope
            # cannot abort the whole run.
            # NOTE(review): presumably session_scope commits on clean exit and
            # rolls back on an exception — confirm against app.db.
            with session_scope() as session:
                r = rescrape_scene(session, scene_id, pb_id, page_url)
        except Exception as e:
            log.warning("scene %s: %s", scene_id, e)
        else:
            # Count only work whose scope exited without an error.
            for k, v in r.items():
                counters[k] += v
        if i % 50 == 0:
            log.info("progress %d/%d counters=%s", i, total, counters)
        time.sleep(pause)

    log.info("done: %s", counters)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|