"""Per 166 review slugs z top-porn-tube-sites: 1. Fetch review page → extract pdude.link Visit URL + rating + score badges 2. Follow pdude.link → real tube domain 3. Cross-check vs nasze 25 tube origins 4. Output JSON: { slug, name, theporndude_rank, theporndude_score, real_domain, in_our_db, our_origin } """ import asyncio import json import re from pathlib import Path from urllib.parse import urlparse import httpx REVIEWS_FILE = Path("theporndude_free_tubes.json") OUT_FILE = Path("theporndude_resolved.json") UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36" async def fetch_review(cli: httpx.AsyncClient, review: dict, rank: int) -> dict: url = f"https://theporndude.com/{review['id']}/{review['slug']}" try: r = await cli.get(url, headers={"User-Agent": UA}) html = r.text except Exception as e: return {**review, "rank": rank, "error": f"fetch_review: {e}"} # Wyciągnij score score_m = re.search(r'class="rate__num">\s*(\d+(?:\.\d+)?)\s*<', html) # Wyciągnij pdude.link visit URL pdude_m = re.search(r'href="(https://pdude\.link/[\w\-\.]+)"', html) # Wyciągnij