goon/scripts/theporndude_coverage_match.py
https://github.com/goon-foss/goon 642f1ab8b8 Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector
Mobile / OTA:
- Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org
- Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version)
- backend.ts: default public backend auto-connect (no manual login)

WebView fallback fix (PlayerScreen INJECTED_JS):
- Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init)
- Context-scoped: only clicks consent buttons inside cookie/gdpr containers
- Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init)

Resolver:
- Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited)
  → composite_score hard-reject / cap; wired into scene_score + bulk_dedup
- aggregator-mode candidate query: LIMIT 500 + title-match ordering

Connectors:
- porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot

landing: APK links → goon-v0.1.9.apk

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 11:20:57 +02:00

118 lines
4.6 KiB
Python

"""Cross-check 166 resolved theporndude domains against our 25 tube origins."""
import json
import re
from pathlib import Path
# Origins taken from the DB (live + dead) plus the extractor REGISTRY in
# app/extractors/__init__.py. Order matters: lookups return the first hit.
OUR_ORIGINS = [
    # DB live + dead
    "tube:0dayxxcom",
    "tube:epornercom",
    "tube:fpoxxx",
    "tube:freshpornoorg",
    "tube:hqpornercom",
    "tube:latestpornvideocom",
    "tube:mypornerleakcom",
    "tube:perverzijacom",
    "tube:porn00org",
    "tube:porndishcom",
    "tube:porndittcom",
    "tube:pornhatcom",
    "tube:pornhubcom",
    "tube:porntrexcom",
    "tube:pornxpph",
    "tube:redtubecom",
    "tube:sxylandcom",
    "tube:sxyprncom",
    "tube:xhamstercom",
    "tube:xnxxcom",
    "tube:xvideoscom",
    "tube:youporncom",
    "tube:latestleaksco",
    "tube:siskavideo",
    "tube:hdporn92com",
    # REGISTRY only (extractor known, no playback in the live DB)
    "tube:xmoviesforyoucom",
    "tube:watchporn",
    "tube:porn4dayspw",
    "tube:paradisehillcc",
]
# Tylko realne TLD-y. NIE "tube"/"porn"/"xxx" bo to często części nazwy (redtube, pornhub, fpoxxx).
_TLD_RE = __import__("re").compile(r"(com|net|org|tv|cc|pw|co|to|ws|me|sx|info|biz)$")
def _strip_tld(s: str) -> str:
"""xvideoscom -> xvideos; pornhubcom -> pornhub; hdporn92com -> hdporn92"""
return _TLD_RE.sub("", s)
# Build sitetag → matching variants for fuzzy match
def origin_to_sitetag(origin: str) -> str:
    """Return the sitetag part of an origin id of the form ``tube:<sitetag>``.

    Only a leading ``tube:`` is removed; the same text appearing later in
    the string is kept. An origin without the prefix is returned unchanged.
    """
    return origin.removeprefix("tube:")
def domain_to_sitetag(domain: str) -> str:
    """Collapse a domain into sitetag form: lowercased, dots and hyphens removed.

    xvideos.com -> xvideoscom, porntrex.com -> porntrexcom
    """
    lowered = domain.lower()
    return "".join(ch for ch in lowered if ch not in ".-")
def match(slug: str, domain: str) -> str | None:
    """Find which of our origins, if any, corresponds to a theporndude entry.

    Matching is attempted on `slug` (from the theporndude review URL) and on
    `domain` (the `real_domain` resolved via pdude.link). The slug is the
    tube's name (e.g. 'xvideos', 'pornhub', 'paradisehill'). Origins have
    the form ``tube:<sitetag>`` where sitetag = domain.replace('.', '').
    Comparing both sides with the TLD stripped gives good recall.

    Args:
        slug: Review-URL slug; may be empty.
        domain: Resolved domain; may be empty.

    Returns:
        The first entry of ``OUR_ORIGINS`` whose TLD-less sitetag equals a
        TLD-less candidate of at least 3 characters, or ``None``.
    """
    candidates = []
    if slug:
        candidates.append(slug.lower().replace("-", ""))
    if domain:
        candidates.append(domain_to_sitetag(domain))

    # Normalise the candidates once instead of once per origin, and discard
    # the ones too short (< 3 chars after TLD stripping) to be a safe match.
    wanted = set()
    for candidate in candidates:
        stem = _strip_tld(candidate)
        if len(stem) >= 3:
            wanted.add(stem)
    if not wanted:
        return None

    for origin in OUR_ORIGINS:
        if _strip_tld(origin_to_sitetag(origin)) in wanted:
            return origin
    return None
def main():
    """Report which theporndude entries we already cover and which are new.

    Reads ``theporndude_resolved.json`` from the current directory, sets
    ``our_origin`` on every usable record (the result of ``match``), prints
    a coverage summary plus the top-ranked entries of each group, and writes
    the full breakdown to ``theporndude_coverage.json``.

    Every record must carry a ``rank``; ``slug`` and ``real_domain`` are
    optional and are reported as ``?`` / ``null`` when absent.
    """
    data = json.loads(Path("theporndude_resolved.json").read_text(encoding="utf-8"))

    have = []
    new = []
    error = []
    for r in data:
        # An entry is an error only if it has an error AND no domain;
        # an entry with an error but a usable domain is still matched.
        if "error" in r and not r.get("real_domain"):
            error.append(r)
            continue
        our = match(r.get("slug", ""), r.get("real_domain", ""))
        r["our_origin"] = our
        if our:
            have.append(r)
        else:
            new.append(r)

    # Sort each group once; the report and the JSON summary share the order.
    have.sort(key=lambda x: x["rank"])
    new.sort(key=lambda x: x["rank"])

    print("=== Coverage ===")
    print(f"Total theporndude top-porn-tubes: {len(data)}")
    print(f" Already in our DB: {len(have)}")
    print(f" NEW (potential candidates): {len(new)}")
    print(f" Errors: {len(error)}")
    print()
    print("=== Already have (matched) — top 30 by theporndude rank ===")
    for r in have[:30]:
        # A slug-only match may have no real_domain, so fall back to '?'
        # exactly like the NEW listing below does.
        print(
            f" #{r['rank']:>3} score={r.get('theporndude_score') or '?':>4} "
            f"{r.get('real_domain') or '?':<28} -> {r['our_origin']}"
        )
    print()
    print("=== NEW candidates (not in DB) — top 60 by theporndude rank ===")
    for r in new[:60]:
        print(
            f" #{r['rank']:>3} score={r.get('theporndude_score') or '?':>4} "
            f"{r.get('real_domain') or '?':<30} ({r.get('slug') or '?'})"
        )

    # Detailed output
    summary = {
        "total": len(data),
        "already_have": [
            {
                "rank": r["rank"],
                "slug": r.get("slug"),
                "domain": r.get("real_domain"),
                "score": r.get("theporndude_score"),
                "our_origin": r["our_origin"],
            }
            for r in have
        ],
        "new_candidates": [
            {
                "rank": r["rank"],
                "slug": r.get("slug"),
                "domain": r.get("real_domain"),
                "score": r.get("theporndude_score"),
                "final_url": r.get("final_url", ""),
            }
            for r in new
        ],
        "errors": [
            {"rank": r["rank"], "slug": r.get("slug"), "error": r.get("error")}
            for r in error
        ],
    }
    Path("theporndude_coverage.json").write_text(
        json.dumps(summary, indent=2), encoding="utf-8"
    )
    print("\n-> theporndude_coverage.json")
# Run the coverage report only when executed as a script, not on import.
if __name__ == "__main__":
    main()