"""Pilot test dla nowych browse-mode scraperów (porn00.org, pornxp.ph). Fetcha N scen z każdego, agreguje % scen z istotnymi sygnałami (studio, performer, year, duration, stream_url) + pokazuje 3 sample scen pełnym detailem. Cel: zdecydować czy włączyć w `ALL_BROWSE_SCRAPERS` — jeśli ≥50% scen ma studio+performer+year → cel scoring composite (auto-merge >0.85 możliwy). Uruchomienie: docker exec goon-api-1 python -m scripts.pilot_browse_scrapers --limit 20 """ from __future__ import annotations import argparse import json import logging from collections import Counter from app.connectors.direct_scrapers.porn00 import Porn00Scraper from app.connectors.direct_scrapers.pornxp import PornXPScraper log = logging.getLogger(__name__) def _summarize(scenes: list, scraper_name: str) -> dict: n = len(scenes) if n == 0: return {"scraper": scraper_name, "scenes": 0, "error": "empty"} has_studio = sum(1 for s in scenes if s.studio) has_perf = sum(1 for s in scenes if s.performers) has_perf_2plus = sum(1 for s in scenes if len(s.performers) >= 2) has_date = sum(1 for s in scenes if s.release_date) has_dur = sum(1 for s in scenes if s.duration_sec) has_tags = sum(1 for s in scenes if s.tags) has_thumb = sum(1 for s in scenes if any(p.thumbnail_url for p in s.playback_sources)) has_stream = sum(1 for s in scenes if any(p.stream_url for p in s.playback_sources)) has_phash = sum(1 for s in scenes if any(f.kind == "phash" for f in s.fingerprints)) durations = [s.duration_sec for s in scenes if s.duration_sec] avg_dur = sum(durations) // len(durations) if durations else 0 perf_counts = Counter(len(s.performers) for s in scenes) return { "scraper": scraper_name, "scenes_fetched": n, "pct": { "studio": f"{has_studio}/{n} ({100 * has_studio // n}%)", "performer(1+)": f"{has_perf}/{n} ({100 * has_perf // n}%)", "performer(2+)": f"{has_perf_2plus}/{n} ({100 * has_perf_2plus // n}%)", "release_date": f"{has_date}/{n} ({100 * has_date // n}%)", "duration": f"{has_dur}/{n} ({100 * has_dur // n}%)", "tags": f"{has_tags}/{n} ({100 * has_tags // n}%)", "thumbnail": f"{has_thumb}/{n} ({100 * has_thumb // n}%)", "stream_url": f"{has_stream}/{n} ({100 * has_stream // n}%)", "phash": f"{has_phash}/{n} ({100 * has_phash // n}%)", }, "avg_duration_sec": avg_dur, "performer_distribution": dict(perf_counts.most_common()), } def _sample(scenes: list, n: int = 3) -> list: """Pokazuje N pierwszych scen w czytelnej formie.""" out = [] for s in scenes[:n]: out.append({ "title": s.title[:80], "external_id": s.external_id, "studio": s.studio.name if s.studio else None, "performers": [p.name for p in s.performers], "release_date": s.release_date.isoformat() if s.release_date else None, "duration": s.duration_sec, "tags": [t.name for t in s.tags[:5]], "stream_url": (s.playback_sources[0].stream_url[:80] if s.playback_sources and s.playback_sources[0].stream_url else None), "thumbnail_url": (s.playback_sources[0].thumbnail_url[:80] if s.playback_sources and s.playback_sources[0].thumbnail_url else None), "phash": next((f.value for f in s.fingerprints if f.kind == "phash"), None), }) return out def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--limit", type=int, default=20, help="Scenes per scraper") parser.add_argument("--only", choices=["porn00", "pornxp"], help="Test only one scraper") args = parser.parse_args() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s") scrapers = [] if args.only != "pornxp": scrapers.append(("porn00", Porn00Scraper())) if args.only != "porn00": scrapers.append(("pornxp", PornXPScraper())) for name, scraper in scrapers: log.info("=== Pilot %s ===", name) scenes = [] try: for s in scraper.latest_scenes(max_pages=1): scenes.append(s) if len(scenes) >= args.limit: break except Exception as e: log.error("%s fetch failed: %s", name, e) continue print(f"\n========== {name.upper()} ==========") print(json.dumps(_summarize(scenes, name), indent=2, ensure_ascii=False)) print(f"\n--- {name} samples ---") print(json.dumps(_sample(scenes, n=3), indent=2, ensure_ascii=False)) return 0 if __name__ == "__main__": raise SystemExit(main())