"""One-shot: backfill `release_date` for freshporno scenes that were scraped before the `itemprop="uploadDate"` regex was added. Tło: bug-report 2026-05-20 ("brak Brazzers Exxtra po 15-05") wymusił dodanie `release_date` extracta z `itemprop="uploadDate"` w freshporno connector. Stare scenes (z przed tego patcha) mają `release_date = NULL`, przez co scene_resolver nie liczy date-overlap signal → score < 0.92 → orphan zamiast merged z TPDB canonical. 10468 orphan freshporno scenes (vs 4789 canonical) — 99% bez release_date. Po backfill resolver auto-merge przy następnym bulk-dedup tick. Idempotent: update tylko gdy aktualne `release_date IS NULL` i `uploadDate` ekstrakcja się powiedzie. """ from __future__ import annotations import logging import re from datetime import UTC, date, datetime import httpx from sqlalchemy import select from app.db import session_scope from app.models import Scene from app.models.playback_source import PlaybackSource log = logging.getLogger(__name__) USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/140.0.0.0" _UPLOAD_DATE_RE = re.compile( r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', ) def main() -> int: logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") with session_scope() as session: rows = session.execute( select(Scene.id, PlaybackSource.page_url) .join(PlaybackSource, PlaybackSource.scene_id == Scene.id) .where(PlaybackSource.origin == "tube:freshpornoorg") .where(Scene.release_date.is_(None)) ).all() log.info("freshporno scenes without release_date: %d", len(rows)) client = httpx.Client( timeout=15.0, follow_redirects=True, headers={"User-Agent": USER_AGENT}, ) updated = 0 skipped = 0 errors = 0 for scene_id, page_url in rows: try: r = client.get(page_url) if r.status_code != 200: if r.status_code in (404, 410): skipped += 1 else: errors += 1 continue m = _UPLOAD_DATE_RE.search(r.text) if not m: skipped += 1 continue try: rd = date.fromisoformat(m.group(1)) except ValueError: skipped += 1 continue with session_scope() as s: scene = s.get(Scene, scene_id) if scene is None or scene.release_date is not None: continue scene.release_date = rd updated += 1 if updated % 100 == 0: log.info( "progress: updated=%d skipped=%d errors=%d (%d/%d)", updated, skipped, errors, updated + skipped + errors, len(rows), ) except Exception as e: errors += 1 if errors <= 5: log.warning("scene=%s url=%s failed: %s", scene_id, page_url, e) client.close() log.info( "DONE: candidates=%d updated=%d skipped=%d errors=%d", len(rows), updated, skipped, errors, ) return 0 if __name__ == "__main__": raise SystemExit(main())