goon/scripts/theporndude_resolve_domains.py

"""Per 166 review slugs z top-porn-tube-sites:
1. Fetch review page → extract pdude.link Visit URL + rating + score badges
2. Follow pdude.link → real tube domain
3. Cross-check vs nasze 25 tube origins
4. Output JSON: { slug, name, theporndude_rank, theporndude_score, real_domain, in_our_db, our_origin }
"""
import asyncio
import json
import re
from pathlib import Path
from urllib.parse import urlparse

import httpx

# Input: JSON document whose "reviews" list is processed by main().
REVIEWS_FILE = Path("theporndude_free_tubes.json")
# Output: JSON array with one result dict per review.
OUT_FILE = Path("theporndude_resolved.json")

# Desktop Chrome user-agent string sent with every request.
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/121.0 Safari/537.36"
)


def _host_without_www(url: str) -> str:
    """Return the hostname of *url* with a single leading ``www.`` removed."""
    host = urlparse(url).hostname or ""
    # Strip only the leading label: str.replace() would also remove "www."
    # found inside the name (e.g. "awww.example.com" -> "aexample.com").
    if host.startswith("www."):
        host = host[len("www."):]
    return host


async def fetch_review(cli: "httpx.AsyncClient", review: dict, rank: int) -> dict:
    """Fetch one review page and resolve its pdude.link to the real domain.

    Args:
        cli: Shared async HTTP client. The resolved domain is taken from the
            URL of the final response, so the client must be configured to
            follow redirects.
        review: Review record; its ``id`` and ``slug`` build the page URL and
            all of its keys are copied into the result.
        rank: Rank assigned by the caller, stored under ``rank``.

    Returns:
        A dict holding the review's own keys plus ``rank``. Once the review
        page was fetched it also has ``theporndude_score`` (float or None),
        ``page_title`` (max 120 chars) and ``page_desc`` (max 200 chars);
        once the pdude.link was followed, ``real_domain`` and ``final_url``.
        Request failures and a missing pdude.link are reported under
        ``error`` rather than raised.
    """
    url = f"https://theporndude.com/{review['id']}/{review['slug']}"
    try:
        r = await cli.get(url, headers={"User-Agent": UA})
        html = r.text
    except Exception as e:
        # Best effort: one failing page must not abort the whole batch.
        return {**review, "rank": rank, "error": f"fetch_review: {e}"}

    # Extract the score badge.
    score_m = re.search(r'class="rate__num">\s*(\d+(?:\.\d+)?)\s*<', html)
    # Extract the pdude.link "Visit" URL.
    pdude_m = re.search(r'href="(https://pdude\.link/[\w\-\.]+)"', html)
    # Extract <title> and the meta description.
    title_m = re.search(r"<title>([^<]+)</title>", html)
    desc_m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html)

    out = {
        **review,
        "rank": rank,
        "theporndude_score": float(score_m.group(1)) if score_m else None,
        "page_title": (title_m.group(1) if title_m else "")[:120],
        "page_desc": (desc_m.group(1) if desc_m else "")[:200],
    }
    if not pdude_m:
        out["error"] = "no_pdude_link"
        return out
    pdude_url = pdude_m.group(1)

    # Follow pdude.link to its destination.
    try:
        r2 = await cli.get(pdude_url, headers={"User-Agent": UA})
        # Final URL after all redirects.
        final_url = str(r2.url)
        out["real_domain"] = _host_without_www(final_url)
        out["final_url"] = final_url[:200]
    except Exception as e:
        out["error"] = f"pdude_follow: {e}"
    return out


async def main():
    """Resolve every review listed in REVIEWS_FILE and write OUT_FILE.

    Reads the ``reviews`` list from REVIEWS_FILE, runs fetch_review() for
    each entry (ranked 1..N in list order) with at most 8 requests in flight,
    writes the list of result dicts to OUT_FILE as indented JSON, and prints
    how many entries ended up with a ``real_domain``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    reviews = json.loads(REVIEWS_FILE.read_text(encoding="utf-8"))["reviews"]

    timeout = httpx.Timeout(20.0, connect=10.0)
    limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
    async with httpx.AsyncClient(
        timeout=timeout, limits=limits, follow_redirects=True, http2=False
    ) as cli:
        # Caps the number of reviews being processed concurrently.
        sem = asyncio.Semaphore(8)

        async def worker(rev, rank):
            async with sem:
                return await fetch_review(cli, rev, rank)

        results = await asyncio.gather(
            *(worker(rev, rank) for rank, rev in enumerate(reviews, start=1))
        )

    OUT_FILE.write_text(json.dumps(results, indent=2), encoding="utf-8")
    ok = sum(1 for r in results if r.get("real_domain"))
    # Guard the percentage: an empty review list must not raise
    # ZeroDivisionError after the output file has been written.
    pct = ok * 100 / len(results) if results else 0
    print(f"resolved {ok}/{len(results)} ({pct:.0f}%)")
    print(f"out -> {OUT_FILE}")


# Script entry point: run the async pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())