Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
116 lines
4.7 KiB
Python
116 lines
4.7 KiB
Python
"""Pilot test dla nowych browse-mode scraperów (porn00.org, pornxp.ph).
|
|
|
|
Fetcha N scen z każdego, agreguje % scen z istotnymi sygnałami (studio,
|
|
performer, year, duration, stream_url) + pokazuje 3 sample scen pełnym detailem.
|
|
|
|
Cel: zdecydować czy włączyć w `ALL_BROWSE_SCRAPERS` — jeśli ≥50% scen ma
|
|
studio+performer+year → cel scoring composite (auto-merge >0.85 możliwy).
|
|
|
|
Uruchomienie:
|
|
docker exec goon-api-1 python -m scripts.pilot_browse_scrapers --limit 20
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
from collections import Counter
|
|
|
|
from app.connectors.direct_scrapers.porn00 import Porn00Scraper
|
|
from app.connectors.direct_scrapers.pornxp import PornXPScraper
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def _format_share(count: int, total: int) -> str:
    """Render ``count`` out of ``total`` as ``"count/total (P%)"``.

    ``P`` is the integer percentage, rounded down. ``total`` must be non-zero.
    """
    return f"{count}/{total} ({100 * count // total}%)"


def _summarize(scenes: list, scraper_name: str) -> dict:
    """Aggregate metadata coverage for the scenes fetched from one scraper.

    Args:
        scenes: Scene objects exposing ``studio``, ``performers``,
            ``release_date``, ``duration_sec``, ``tags``,
            ``playback_sources`` (items with ``thumbnail_url`` and
            ``stream_url``) and ``fingerprints`` (items with ``kind``).
            Collection attributes may be empty or ``None``.
        scraper_name: Label copied into the result under ``"scraper"``.

    Returns:
        ``{"scraper": ..., "scenes": 0, "error": "empty"}`` when ``scenes``
        is empty. Otherwise a dict with ``scraper``, ``scenes_fetched``,
        ``pct`` (per-signal ``"count/total (P%)"`` strings),
        ``avg_duration_sec`` (floor-divided mean of the truthy durations,
        0 when there are none) and ``performer_distribution`` (number of
        performers per scene -> number of scenes).
    """
    total = len(scenes)
    if total == 0:
        return {"scraper": scraper_name, "scenes": 0, "error": "empty"}

    def count_where(predicate) -> int:
        # Number of scenes for which the predicate is truthy.
        return sum(1 for scene in scenes if predicate(scene))

    def any_source(scene, attr: str) -> bool:
        # ``or ()`` keeps a scene whose playback_sources is None from raising.
        return any(getattr(src, attr) for src in scene.playback_sources or ())

    # Computed once; feeds both the "2+" counter and the distribution.
    performer_counts = [len(scene.performers or ()) for scene in scenes]

    counts = {
        "studio": count_where(lambda s: s.studio),
        "performer(1+)": count_where(lambda s: s.performers),
        "performer(2+)": sum(1 for c in performer_counts if c >= 2),
        "release_date": count_where(lambda s: s.release_date),
        "duration": count_where(lambda s: s.duration_sec),
        "tags": count_where(lambda s: s.tags),
        "thumbnail": count_where(lambda s: any_source(s, "thumbnail_url")),
        "stream_url": count_where(lambda s: any_source(s, "stream_url")),
        "phash": count_where(
            lambda s: any(f.kind == "phash" for f in s.fingerprints or ())
        ),
    }

    # Missing or zero durations are excluded from the average.
    durations = [s.duration_sec for s in scenes if s.duration_sec]
    avg_dur = sum(durations) // len(durations) if durations else 0

    return {
        "scraper": scraper_name,
        "scenes_fetched": total,
        "pct": {
            label: _format_share(count, total) for label, count in counts.items()
        },
        "avg_duration_sec": avg_dur,
        "performer_distribution": dict(Counter(performer_counts).most_common()),
    }
|
|
|
|
|
|
def _sample(scenes: list, n: int = 3) -> list:
    """Return a compact, JSON-serializable view of the first ``n`` scenes.

    Args:
        scenes: Scene objects exposing ``title``, ``external_id``, ``studio``,
            ``performers``, ``release_date``, ``duration_sec``, ``tags``,
            ``playback_sources`` and ``fingerprints``. Collection attributes
            may be empty or ``None``.
        n: Number of leading scenes to include.

    Returns:
        One dict per sampled scene. Title and URLs are truncated to 80
        characters and at most 5 tag names are listed. ``stream_url`` and
        ``thumbnail_url`` come from the first playback source that has a
        non-empty value, so a scene counted as having a stream or thumbnail
        when any source provides one never shows ``None`` here.
    """

    def first_url(scene, attr: str):
        # First non-empty URL across all playback sources, truncated.
        urls = (getattr(src, attr) for src in scene.playback_sources or ())
        return next((url[:80] for url in urls if url), None)

    return [
        {
            "title": scene.title[:80] if scene.title is not None else None,
            "external_id": scene.external_id,
            "studio": scene.studio.name if scene.studio else None,
            "performers": [p.name for p in scene.performers or ()],
            "release_date": (
                scene.release_date.isoformat() if scene.release_date else None
            ),
            "duration": scene.duration_sec,
            "tags": [t.name for t in (scene.tags or ())[:5]],
            "stream_url": first_url(scene, "stream_url"),
            "thumbnail_url": first_url(scene, "thumbnail_url"),
            "phash": next(
                (f.value for f in scene.fingerprints or () if f.kind == "phash"),
                None,
            ),
        }
        for scene in scenes[:n]
    ]
|
|
|
|
|
|
def main() -> int:
    """Run the pilot for the selected scrapers and print coverage reports.

    Command-line options:
        --limit N   scenes to collect per scraper (default 20)
        --only X    run only ``porn00`` or only ``pornxp``

    For each selected scraper, scenes are pulled from
    ``latest_scenes(max_pages=1)`` until ``--limit`` is reached. A JSON
    summary and three sample scenes are then printed to stdout. If fetching
    raises, the failure is logged with its traceback, the scenes collected
    so far for that scraper are discarded, and the run moves on to the next
    scraper.

    Returns:
        Always 0; it is used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=20, help="Scenes per scraper")
    parser.add_argument("--only", choices=["porn00", "pornxp"], help="Test only one scraper")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")

    # Without --only (None) both conditions hold, so both scrapers run.
    scrapers = []
    if args.only != "pornxp":
        scrapers.append(("porn00", Porn00Scraper()))
    if args.only != "porn00":
        scrapers.append(("pornxp", PornXPScraper()))

    for name, scraper in scrapers:
        log.info("=== Pilot %s ===", name)
        scenes = []
        try:
            for s in scraper.latest_scenes(max_pages=1):
                scenes.append(s)
                if len(scenes) >= args.limit:
                    break
        except Exception as e:
            # Top-level boundary of a diagnostic script: keep going with the
            # next scraper, but record the traceback so the failure in the
            # new scraper can actually be debugged.
            log.exception("%s fetch failed: %s", name, e)
            continue

        print(f"\n========== {name.upper()} ==========")
        print(json.dumps(_summarize(scenes, name), indent=2, ensure_ascii=False))
        print(f"\n--- {name} samples ---")
        print(json.dumps(_sample(scenes, n=3), indent=2, ensure_ascii=False))

    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|