# goon/scripts/pilot_browse_scrapers.py

"""Pilot test for the new browse-mode scrapers (porn00.org, pornxp.ph).

Fetches N scenes from each scraper, aggregates the percentage of scenes that
carry the relevant signals (studio, performer, year, duration, stream_url) and
shows 3 sample scenes in full detail.

Goal: decide whether to enable them in `ALL_BROWSE_SCRAPERS` — if ≥50% of scenes
have studio+performer+year → target composite scoring (auto-merge >0.85 possible).

Usage:
    docker exec goon-api-1 python -m scripts.pilot_browse_scrapers --limit 20
"""
from __future__ import annotations

import argparse
import json
import logging
from collections import Counter

from app.connectors.direct_scrapers.porn00 import Porn00Scraper
from app.connectors.direct_scrapers.pornxp import PornXPScraper

# Module-level logger named after this module; main() sets up handlers/format
# through logging.basicConfig before any message is emitted.
log = logging.getLogger(__name__)


def _summarize(scenes: list, scraper_name: str) -> dict:
    n = len(scenes)
    if n == 0:
        return {"scraper": scraper_name, "scenes": 0, "error": "empty"}

    has_studio = sum(1 for s in scenes if s.studio)
    has_perf = sum(1 for s in scenes if s.performers)
    has_perf_2plus = sum(1 for s in scenes if len(s.performers) >= 2)
    has_date = sum(1 for s in scenes if s.release_date)
    has_dur = sum(1 for s in scenes if s.duration_sec)
    has_tags = sum(1 for s in scenes if s.tags)
    has_thumb = sum(1 for s in scenes if any(p.thumbnail_url for p in s.playback_sources))
    has_stream = sum(1 for s in scenes if any(p.stream_url for p in s.playback_sources))
    has_phash = sum(1 for s in scenes if any(f.kind == "phash" for f in s.fingerprints))

    durations = [s.duration_sec for s in scenes if s.duration_sec]
    avg_dur = sum(durations) // len(durations) if durations else 0
    perf_counts = Counter(len(s.performers) for s in scenes)

    return {
        "scraper": scraper_name,
        "scenes_fetched": n,
        "pct": {
            "studio": f"{has_studio}/{n} ({100 * has_studio // n}%)",
            "performer(1+)": f"{has_perf}/{n} ({100 * has_perf // n}%)",
            "performer(2+)": f"{has_perf_2plus}/{n} ({100 * has_perf_2plus // n}%)",
            "release_date": f"{has_date}/{n} ({100 * has_date // n}%)",
            "duration": f"{has_dur}/{n} ({100 * has_dur // n}%)",
            "tags": f"{has_tags}/{n} ({100 * has_tags // n}%)",
            "thumbnail": f"{has_thumb}/{n} ({100 * has_thumb // n}%)",
            "stream_url": f"{has_stream}/{n} ({100 * has_stream // n}%)",
            "phash": f"{has_phash}/{n} ({100 * has_phash // n}%)",
        },
        "avg_duration_sec": avg_dur,
        "performer_distribution": dict(perf_counts.most_common()),
    }


def _sample(scenes: list, n: int = 3) -> list:
    """Pokazuje N pierwszych scen w czytelnej formie."""
    out = []
    for s in scenes[:n]:
        out.append({
            "title": s.title[:80],
            "external_id": s.external_id,
            "studio": s.studio.name if s.studio else None,
            "performers": [p.name for p in s.performers],
            "release_date": s.release_date.isoformat() if s.release_date else None,
            "duration": s.duration_sec,
            "tags": [t.name for t in s.tags[:5]],
            "stream_url": (s.playback_sources[0].stream_url[:80] if s.playback_sources and s.playback_sources[0].stream_url else None),
            "thumbnail_url": (s.playback_sources[0].thumbnail_url[:80] if s.playback_sources and s.playback_sources[0].thumbnail_url else None),
            "phash": next((f.value for f in s.fingerprints if f.kind == "phash"), None),
        })
    return out


def main() -> int:
    """Run the pilot for the selected scrapers and print a report per scraper.

    Parses ``--limit`` (scenes per scraper, must be >= 1) and ``--only``
    (restrict the run to one scraper), collects up to ``--limit`` scenes from
    ``latest_scenes(max_pages=1)`` of each scraper and prints the JSON summary
    and three sample scenes to stdout.

    Returns:
        ``0`` when every selected scraper was fetched without raising,
        ``1`` when at least one fetch failed (remaining scrapers still run).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=20, help="Scenes per scraper")
    parser.add_argument("--only", choices=["porn00", "pornxp"], help="Test only one scraper")
    args = parser.parse_args()
    if args.limit < 1:
        # The collection loop checks the limit only after appending, so a
        # non-positive limit would still fetch one scene; reject it up front.
        parser.error("--limit must be >= 1")
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")

    scrapers = []
    if args.only != "pornxp":
        scrapers.append(("porn00", Porn00Scraper()))
    if args.only != "porn00":
        scrapers.append(("pornxp", PornXPScraper()))

    any_failed = False
    for name, scraper in scrapers:
        log.info("=== Pilot %s ===", name)
        scenes = []
        try:
            for s in scraper.latest_scenes(max_pages=1):
                scenes.append(s)
                if len(scenes) >= args.limit:
                    break
        except Exception as e:
            # Top-level boundary: keep going with the next scraper, but record
            # the full traceback so the failure can actually be diagnosed.
            log.exception("%s fetch failed: %s", name, e)
            any_failed = True
            continue

        print(f"\n========== {name.upper()} ==========")
        print(json.dumps(_summarize(scenes, name), indent=2, ensure_ascii=False))
        print(f"\n--- {name} samples ---")
        print(json.dumps(_sample(scenes, n=3), indent=2, ensure_ascii=False))

    # Surface fetch failures through the exit status so callers can detect them.
    return 1 if any_failed else 0


# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())