Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
113 lines
4.2 KiB
Python
113 lines
4.2 KiB
Python
"""Phash distance probe — gate-keeper test przed włączeniem nowego browse scrapera.
|
|
|
|
Usage: python scripts/probe_browse_scraper.py <scraper_module>:<class>
|
|
e.g. scripts/probe_browse_scraper.py freshporno:FreshpornoScraper
|
|
|
|
Procedura:
|
|
1. Scrap'nij 1 stronę (~20-60 scen) — bez ingestu do DB
|
|
2. Dla każdej sceny: download thumbnail → compute phash
|
|
3. Policz min Hamming distance vs canonical phashes (z TPDB/StashDB w DB)
|
|
4. Raportuj % scen z Hamming ≤5 (acceptable threshold dla auto-merge)
|
|
|
|
Decision rule:
|
|
- ≥30% scen z Hamming ≤5 → włącz scraper (hot-link studio thumbs, działa jak freshporno)
|
|
- <30% → wyłącz (own screenshots, orphan factory jak shyfap)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import importlib
|
|
import sys
|
|
|
|
from sqlalchemy import select
|
|
|
|
from app.db import session_scope
|
|
from app.models.scene import SceneExternalRef, SceneFingerprint
|
|
from app.models.source import Source, SourceKind
|
|
|
|
|
|
def hamming_hex(a: str, b: str) -> int:
    """Return the Hamming distance between two hex-encoded integers.

    Both arguments are parsed as base-16 integers; the result is the number
    of bit positions in which the two values differ.

    Raises:
        ValueError: if either argument is not a valid base-16 literal.
    """
    differing_bits = int(a, 16) ^ int(b, 16)
    # Each "1" in the binary rendering of the XOR marks one differing bit.
    return f"{differing_bits:b}".count("1")
|
|
|
|
|
|
def _min_hamming(phash: str, canon_values: list[int], worst: int = 999) -> int:
    """Return the smallest Hamming distance from *phash* to any canonical value.

    Args:
        phash: hex-encoded phash of a scraped scene.
        canon_values: canonical phashes already parsed to integers.
        worst: sentinel returned when *phash* is not valid hex or when no
            canonical value is closer than the sentinel.
    """
    try:
        probe = int(phash, 16)
    except ValueError:
        # An unparsable scraped phash can never match anything.
        return worst
    best = worst
    for canon in canon_values:
        distance = bin(probe ^ canon).count("1")
        if distance < best:
            best = distance
            if best == 0:
                break  # exact match, nothing can be closer
    return best


def main() -> int:
    """Probe one browse scraper and report how well its phashes match canonical ones.

    Parses the command line, instantiates the scraper class named by
    ``module:ClassName`` from ``app.connectors.direct_scrapers``, scrapes up
    to ``--pages`` pages, and for every scraped scene that carries a phash
    computes the minimum Hamming distance to the canonical phashes stored in
    the database (phash fingerprints of scenes that have an external ref to a
    TPDB or StashDB source). It then prints the distance distribution and a
    verdict.

    Returns:
        Process exit code: 0 when at least 30% of the scenes with a phash are
        within Hamming distance 5 of a canonical phash (ENABLE), 2 when fewer
        are (DISABLE), and 1 when the probe cannot be carried out (malformed
        scraper reference, no scraped phashes, or no usable canonical phashes).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("scraper_ref", help="module:ClassName (np. freshporno:FreshpornoScraper)")
    ap.add_argument("--pages", type=int, default=1, help="max pages do scrap (default 1)")
    args = ap.parse_args()

    # Validate the reference up front instead of failing with an unpacking
    # traceback when the ":" separator is missing.
    module_name, sep, class_name = args.scraper_ref.partition(":")
    if not sep or not module_name or not class_name:
        print(f"ERROR: scraper_ref must be module:ClassName, got {args.scraper_ref!r}.")
        return 1

    mod = importlib.import_module(f"app.connectors.direct_scrapers.{module_name}")
    scraper = getattr(mod, class_name)()
    print(f"Testing scraper {scraper.sitetag} ({class_name}) on max_pages={args.pages}")

    # Scrape: keep only (title, phash) per scene; nothing is written to the DB here.
    scraped: list[tuple[str, str | None]] = [
        (raw.title, next((f.value for f in raw.fingerprints if f.kind == "phash"), None))
        for raw in scraper.latest_scenes(max_pages=args.pages)
    ]

    n_scraped = len(scraped)
    n_with_phash = sum(1 for _, ph in scraped if ph)
    print(f"Scraped {n_scraped} scenes, {n_with_phash} with phash")

    if n_with_phash == 0:
        print("ERROR: no phashes — scraper / thumbnail issue.")
        return 1

    # Canonical phashes: fingerprints of scenes linked to a TPDB or StashDB source.
    with session_scope() as session:
        canon = session.execute(
            select(SceneFingerprint.value)
            .join(SceneExternalRef, SceneExternalRef.scene_id == SceneFingerprint.scene_id)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(
                SceneFingerprint.kind == "phash",
                Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]),
            )
        ).scalars().all()
        # Only 16-hex-digit values are compared.
        canon_phashes = [c for c in canon if len(c) == 16]

    print(f"Comparing against {len(canon_phashes)} canonical phashes...")

    # Parse every canonical value once, rather than once per comparison.
    # Values that are not valid hex are skipped.
    canon_values: list[int] = []
    for c in canon_phashes:
        try:
            canon_values.append(int(c, 16))
        except ValueError:
            continue

    if not canon_values:
        # Without a reference set every scene would keep the sentinel distance
        # and the verdict below would be a meaningless DISABLE.
        print("ERROR: no usable canonical phashes in DB — cannot compute distances.")
        return 1

    # Distances: best (smallest) distance per scraped scene that has a phash.
    distances: list[tuple[str, int]] = [
        (title, _min_hamming(ph, canon_values)) for title, ph in scraped if ph
    ]

    # Stats. `total` is at least 1 here because n_with_phash > 0 was checked above.
    total = len(distances)
    in_0 = sum(1 for _, d in distances if d == 0)
    in_5 = sum(1 for _, d in distances if d <= 5)
    in_10 = sum(1 for _, d in distances if d <= 10)
    in_15 = sum(1 for _, d in distances if d <= 15)

    print(f"\nPhash distance distribution (N={total}):")
    print(f" Hamming = 0: {in_0:3d} ({100*in_0/total:.0f}%) — exact match")
    print(f" Hamming ≤ 5: {in_5:3d} ({100*in_5/total:.0f}%) — auto-merge threshold (DEFAULT)")
    print(f" Hamming ≤10: {in_10:3d} ({100*in_10/total:.0f}%)")
    print(f" Hamming ≤15: {in_15:3d} ({100*in_15/total:.0f}%)")

    pct_5 = 100 * in_5 / total
    if pct_5 >= 30:
        print(f"\nVERDICT: ENABLE — {pct_5:.0f}% scen ≤Hamming 5 (≥30% threshold)")
        return 0
    print(f"\nVERDICT: DISABLE — tylko {pct_5:.0f}% scen ≤Hamming 5 (poniżej 30% threshold)")
    print("Tube robi własne screenshots, nie matchują canonical. Orphan factory ryzyko.")
    return 2
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|