goon/scripts/probe_browse_scraper.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

113 lines
4.2 KiB
Python

"""Phash distance probe — gate-keeper test przed włączeniem nowego browse scrapera.
Usage: python scripts/probe_browse_scraper.py <scraper_module>:<class>
e.g. scripts/probe_browse_scraper.py freshporno:FreshpornoScraper
Procedure:
1. Scrape 1 page (~20-60 scenes) — without ingesting anything into the DB
2. For each scene: download thumbnail → compute phash
3. Compute the min Hamming distance vs canonical phashes (TPDB/StashDB fingerprints stored in the DB)
4. Report the % of scenes with Hamming ≤5 (the acceptable threshold for auto-merge)
Decision rule:
- ≥30% of scenes with Hamming ≤5 → enable the scraper (it hot-links studio thumbs, works like freshporno)
- <30% → disable it (it takes its own screenshots, an orphan factory like shyfap)
"""
from __future__ import annotations
import argparse
import importlib
import sys
from sqlalchemy import select
from app.db import session_scope
from app.models.scene import SceneExternalRef, SceneFingerprint
from app.models.source import Source, SourceKind
def hamming_hex(a: str, b: str) -> int:
    """Return the Hamming distance between two hex-encoded integers.

    Both arguments are parsed with ``int(value, 16)``; the result is the
    number of bit positions in which the two values differ.

    Raises:
        ValueError: if either argument is not a valid base-16 literal.
    """
    differing_bits = int(a, 16) ^ int(b, 16)
    # The binary rendering has one "1" per differing bit position.
    return format(differing_bits, "b").count("1")
def _min_hamming(target: int, candidates: set[int], no_match: int = 999) -> int:
    """Return the smallest bit distance from *target* to any of *candidates*.

    Returns *no_match* when *candidates* is empty or nothing is closer than it.
    """
    best = no_match
    for cand in candidates:
        d = bin(target ^ cand).count("1")
        if d < best:
            best = d
            if best == 0:
                break  # exact match, cannot improve further
    return best


def main() -> int:
    """Probe one browse scraper and report how well its phashes match canonical ones.

    Scrapes up to ``--pages`` pages with the scraper named by ``scraper_ref``
    (``module:ClassName`` under ``app.connectors.direct_scrapers``), computes
    for every scraped phash the minimum Hamming distance to the TPDB/StashDB
    phashes stored in the DB, and prints the distribution plus a verdict.

    Returns:
        Process exit code: 0 for an ENABLE verdict, 2 for a DISABLE verdict,
        1 when the probe could not be evaluated (malformed ``scraper_ref``,
        no scraped phashes, or no canonical phashes to compare against).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("scraper_ref", help="module:ClassName (np. freshporno:FreshpornoScraper)")
    ap.add_argument("--pages", type=int, default=1, help="max pages do scrap (default 1)")
    args = ap.parse_args()

    # partition() never raises; a missing ":" is reported instead of crashing
    # with an unpacking ValueError.
    module_name, sep, class_name = args.scraper_ref.partition(":")
    if not (sep and module_name and class_name):
        print(f"ERROR: scraper_ref must be module:ClassName, got {args.scraper_ref!r}")
        return 1

    mod = importlib.import_module(f"app.connectors.direct_scrapers.{module_name}")
    cls = getattr(mod, class_name)
    s = cls()
    print(f"Testing scraper {s.sitetag} ({class_name}) on max_pages={args.pages}")

    # Scrape: keep only (title, phash); phash is None when the scene has none.
    scraped: list[tuple[str, str | None]] = []
    for raw in s.latest_scenes(max_pages=args.pages):
        ph = next((f.value for f in raw.fingerprints if f.kind == "phash"), None)
        scraped.append((raw.title, ph))
    n_scraped = len(scraped)
    n_with_phash = sum(1 for _, ph in scraped if ph)
    print(f"Scraped {n_scraped} scenes, {n_with_phash} with phash")
    if n_with_phash == 0:
        print("ERROR: no phashes — scraper / thumbnail issue.")
        return 1

    # Canonical phashes: fingerprints of scenes referenced by TPDB or StashDB.
    with session_scope() as session:
        canon = session.execute(
            select(SceneFingerprint.value)
            .join(SceneExternalRef, SceneExternalRef.scene_id == SceneFingerprint.scene_id)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(
                SceneFingerprint.kind == "phash",
                Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]),
            )
        ).scalars().all()

    # Parse each canonical value once (not once per scraped scene) and
    # de-duplicate: the joins can return the same fingerprint several times.
    canon_ints: set[int] = set()
    for value in canon:
        if len(value) != 16:
            continue  # only 16-hex-digit phashes are comparable
        try:
            canon_ints.add(int(value, 16))
        except ValueError:
            continue  # not valid hex, skip it
    if not canon_ints:
        # Without a reference set every distance would be the sentinel and the
        # verdict would be a meaningless DISABLE.
        print("ERROR: no canonical phashes in DB — nothing to compare against.")
        return 1
    print(f"Comparing against {len(canon_ints)} canonical phashes...")

    # Distances
    no_match = 999  # sentinel distance for "nothing comparable"
    distances: list[tuple[str, int]] = []
    for title, ph in scraped:
        if not ph:
            continue
        try:
            ph_int = int(ph, 16)
        except ValueError:
            # Unparsable scraped phash counts as a non-match.
            distances.append((title, no_match))
            continue
        distances.append((title, _min_hamming(ph_int, canon_ints, no_match)))

    # Stats; total is > 0 here because n_with_phash > 0 was checked above.
    total = len(distances)
    in_0 = sum(1 for _, d in distances if d == 0)
    in_5 = sum(1 for _, d in distances if d <= 5)
    in_10 = sum(1 for _, d in distances if d <= 10)
    in_15 = sum(1 for _, d in distances if d <= 15)
    print(f"\nPhash distance distribution (N={total}):")
    print(f" Hamming = 0: {in_0:3d} ({100*in_0/total:.0f}%) — exact match")
    print(f" Hamming ≤ 5: {in_5:3d} ({100*in_5/total:.0f}%) — auto-merge threshold (DEFAULT)")
    print(f" Hamming ≤10: {in_10:3d} ({100*in_10/total:.0f}%)")
    print(f" Hamming ≤15: {in_15:3d} ({100*in_15/total:.0f}%)")

    pct_5 = 100 * in_5 / total
    if pct_5 >= 30:
        print(f"\nVERDICT: ENABLE — {pct_5:.0f}% scen ≤Hamming 5 (≥30% threshold)")
        return 0
    print(f"\nVERDICT: DISABLE — tylko {pct_5:.0f}% scen ≤Hamming 5 (poniżej 30% threshold)")
    print("Tube robi własne screenshots, nie matchują canonical. Orphan factory ryzyko.")
    return 2
if __name__ == "__main__":
    # Run the probe and hand its return value to the shell as the exit status.
    exit_status = main()
    sys.exit(exit_status)