goon/scripts/pilot_browse_scrapers.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

116 lines
4.7 KiB
Python

"""Pilot test for the new browse-mode scrapers (porn00.org, pornxp.ph).
Fetches N scenes from each one, aggregates the percentage of scenes that carry
the relevant signals (studio, performer, year, duration, stream_url) and shows
3 sample scenes in full detail.
Goal: decide whether to enable them in `ALL_BROWSE_SCRAPERS` — if ≥50% of scenes
have studio+performer+year, the composite-scoring target (auto-merge >0.85) is achievable.
Usage:
docker exec goon-api-1 python -m scripts.pilot_browse_scrapers --limit 20
"""
from __future__ import annotations
import argparse
import json
import logging
from collections import Counter
from app.connectors.direct_scrapers.porn00 import Porn00Scraper
from app.connectors.direct_scrapers.pornxp import PornXPScraper
log = logging.getLogger(__name__)
def _ratio(count: int, total: int) -> str:
    """Render ``count`` out of ``total`` as ``"count/total (pct%)"`` (pct is floored)."""
    return f"{count}/{total} ({100 * count // total}%)"


def _summarize(scenes: list, scraper_name: str) -> dict:
    """Aggregate how many of the fetched scenes carry each metadata signal.

    Args:
        scenes: Scene objects produced by a scraper. Each one is read through
            the attributes ``studio``, ``performers``, ``release_date``,
            ``duration_sec``, ``tags``, ``playback_sources`` and
            ``fingerprints``. The collection attributes may be ``None``;
            that is treated the same as an empty collection.
        scraper_name: Label copied into the ``"scraper"`` field of the result.

    Returns:
        For an empty ``scenes`` list, a short dict flagged with
        ``"error": "empty"``. Otherwise a dict with the scene count, a
        ``"pct"`` mapping of ``signal -> "hits/total (pct%)"``, the floored
        average of the truthy durations, and a histogram of performers per
        scene ordered by frequency.
    """
    total = len(scenes)
    if total == 0:
        # "scenes" is kept for existing consumers; "scenes_fetched" mirrors the
        # key used by the non-empty result so both shapes can be read alike.
        return {
            "scraper": scraper_name,
            "scenes": 0,
            "scenes_fetched": 0,
            "error": "empty",
        }

    # Signal label -> predicate. Insertion order is the order of the report.
    # Every collection is read as ``x or ()`` so a missing (None) collection
    # counts as "signal absent" instead of raising TypeError.
    signals = {
        "studio": lambda s: s.studio,
        "performer(1+)": lambda s: s.performers,
        "performer(2+)": lambda s: len(s.performers or ()) >= 2,
        "release_date": lambda s: s.release_date,
        "duration": lambda s: s.duration_sec,
        "tags": lambda s: s.tags,
        "thumbnail": lambda s: any(
            p.thumbnail_url for p in (s.playback_sources or ())
        ),
        "stream_url": lambda s: any(
            p.stream_url for p in (s.playback_sources or ())
        ),
        "phash": lambda s: any(
            f.kind == "phash" for f in (s.fingerprints or ())
        ),
    }
    pct = {
        label: _ratio(sum(1 for s in scenes if present(s)), total)
        for label, present in signals.items()
    }

    # Scenes without a (truthy) duration are excluded from the average.
    durations = [s.duration_sec for s in scenes if s.duration_sec]
    avg_dur = sum(durations) // len(durations) if durations else 0

    perf_counts = Counter(len(s.performers or ()) for s in scenes)

    return {
        "scraper": scraper_name,
        "scenes_fetched": total,
        "pct": pct,
        "avg_duration_sec": avg_dur,
        "performer_distribution": dict(perf_counts.most_common()),
    }
def _first_truncated(values, limit: int = 80):
    """Return the first truthy item of ``values`` cut to ``limit`` chars, else ``None``."""
    for value in values:
        if value:
            return value[:limit]
    return None


def _sample(scenes: list, n: int = 3) -> list:
    """Render the first ``n`` scenes as plain dicts for human inspection.

    Args:
        scenes: Scene objects produced by a scraper. Collection attributes
            (``performers``, ``tags``, ``playback_sources``, ``fingerprints``)
            and ``title`` may be ``None``.
        n: Number of leading scenes to include.

    Returns:
        One dict per sampled scene. Titles and URLs are cut to 80 characters
        and at most five tag names are listed. ``stream_url`` and
        ``thumbnail_url`` hold the first non-empty value found across all
        playback sources, so a scene that has the signal on any source shows
        it here as well; ``None`` means no source provides it.
    """
    out = []
    for s in scenes[:n]:
        sources = s.playback_sources or ()
        out.append({
            "title": s.title[:80] if s.title is not None else None,
            "external_id": s.external_id,
            "studio": s.studio.name if s.studio else None,
            "performers": [p.name for p in (s.performers or ())],
            "release_date": s.release_date.isoformat() if s.release_date else None,
            "duration": s.duration_sec,
            "tags": [t.name for t in (s.tags or ())[:5]],
            # Scan every source rather than only the first one.
            "stream_url": _first_truncated(p.stream_url for p in sources),
            "thumbnail_url": _first_truncated(p.thumbnail_url for p in sources),
            "phash": next(
                (f.value for f in (s.fingerprints or ()) if f.kind == "phash"),
                None,
            ),
        })
    return out
def main() -> int:
    """Run the pilot against the selected scrapers and print a report for each.

    Command-line options:
        --limit      Maximum number of scenes collected per scraper (>= 1).
        --only       Restrict the run to a single scraper.
        --max-pages  Value forwarded as ``max_pages`` to ``latest_scenes``
                     (default 1, the previously hard-coded value).

    For each scraper, scenes are pulled from ``latest_scenes`` until the limit
    is reached, then the summary and three samples are printed as JSON. A
    scraper whose fetch raises is logged with its traceback and skipped; the
    remaining scrapers still run.

    Returns:
        ``0`` when every selected scraper was fetched without an exception,
        ``1`` when at least one fetch failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=20, help="Scenes per scraper")
    parser.add_argument("--only", choices=["porn00", "pornxp"], help="Test only one scraper")
    parser.add_argument(
        "--max-pages",
        type=int,
        default=1,
        help="Value passed as max_pages to each scraper (default: 1)",
    )
    args = parser.parse_args()
    # The limit is checked after each append, so a value below 1 would still
    # collect one scene; reject it up front instead.
    if args.limit < 1:
        parser.error("--limit must be >= 1")
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")

    scrapers = []
    if args.only != "pornxp":
        scrapers.append(("porn00", Porn00Scraper()))
    if args.only != "porn00":
        scrapers.append(("pornxp", PornXPScraper()))

    failed = []
    for name, scraper in scrapers:
        log.info("=== Pilot %s ===", name)
        scenes = []
        try:
            for s in scraper.latest_scenes(max_pages=args.max_pages):
                scenes.append(s)
                if len(scenes) >= args.limit:
                    break
        except Exception as e:
            # Top-level boundary: keep going with the other scrapers, but keep
            # the traceback and remember the failure for the exit status.
            log.error("%s fetch failed: %s", name, e, exc_info=True)
            failed.append(name)
            continue
        print(f"\n========== {name.upper()} ==========")
        print(json.dumps(_summarize(scenes, name), indent=2, ensure_ascii=False))
        print(f"\n--- {name} samples ---")
        print(json.dumps(_sample(scenes, n=3), indent=2, ensure_ascii=False))
    return 1 if failed else 0
if __name__ == "__main__":
    # Script entry point: hand main()'s integer result to the interpreter as
    # the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)