"""Pilot (Faza 1) — deep-crawl porndoe poza najnowsze strony, żeby zmierzyć WARTOŚĆ pełnego crawlu tube'a (vs obecne search+top-N). Mamy ~3% katalogu porndoe (1959/62k+). Crawluje strony START..END (domyślnie 64+, czyli ogon którego jeszcze nie mamy), przepuszcza przez normalny `_process_scene` (resolver: match canonical / orphan + tagi + duration). Mierzy counters. NIE modyfikuje produkcyjnych jobów — to ad-hoc pomiar. Użycie: python scripts/pilot_porndoe_deepcrawl.py --start 64 --end 110 """ from __future__ import annotations import argparse import logging import sys import time from app.connectors.direct_scrapers.porndoe import PornDoeScraper from app.db import session_scope from app.extractors import browser_get from app.ingest import _process_scene, get_or_create_source from app.models.source import SourceKind logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s") log = logging.getLogger("pilot_porndoe") def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--start", type=int, default=64) ap.add_argument("--end", type=int, default=110) args = ap.parse_args() s = PornDoeScraper() with session_scope() as session: src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp") source_id = src.id counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0} t0 = time.time() for page in range(args.start, args.end + 1): try: res = browser_get(s._listing_url(page), timeout=30) html = res.text if hasattr(res, "text") else res except Exception as e: log.warning("listing page %d failed: %s", page, e) continue urls = s._extract_scene_urls(html) if not urls: print(f"empty listing page {page}, stop") break for u in urls: try: r = browser_get(u, timeout=30) dh = r.text if hasattr(r, "text") else r raw = s._parse_detail(u, dh) except Exception: counters["errors"] += 1 continue if raw is None: continue counters["seen"] += 1 try: _process_scene(source_id=source_id, raw_scene=raw, counters=counters) except Exception: counters["errors"] += 1 print(f"page {page}: {counters} ({time.time() - t0:.0f}s)", flush=True) print(f"PILOT DONE pages {args.start}-{args.end}: {counters} elapsed={time.time() - t0:.0f}s") return 0 if __name__ == "__main__": sys.exit(main())