"""Ad-hoc survey of tube-site candidates for a deep crawl (SSR-richness).

Throwaway research script (``scripts/_tube_survey.py``). For every entry in
``CANDIDATES`` it downloads the listing page, picks the first scene-detail
link matching the candidate's pattern, downloads that detail page and prints
a one-line report saying whether the raw HTML exposes JSON-LD, a duration,
performer links and tag/category links.
"""
import json
import re

# Browser-like User-Agent header sent with every request.
UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/140 Safari/537.36"}

# Body of a <script ... application/ld+json ...> element. The closing tag is
# required as a terminator: a lazy group with nothing after it would always
# capture the empty string.
JL = re.compile(r"<script[^>]+application/ld\+json[^>]*>(.*?)</script>", re.S | re.I)
# A quoted "duration": "PT..." value anywhere in the page.
DUR = re.compile(r'"duration"\s*:\s*"(PT[^"]+)"')
# Fallback duration hints: a video:duration meta content, a numeric
# "duration": NN value, or an M:SS / MM:SS text node.
DUR2 = re.compile(r'(video:duration"\s+content="\d+|"duration"\s*:\s*\d{2,}|>\d{1,2}:\d{2}<)')
# href whose path contains a model / pornstar / actress / porn-star segment.
PERF = re.compile(r'href="[^"]*/(models?|pornstars?|actress|porn-star)/[a-z0-9_-]+', re.I)
# href whose path contains a tag(s) / categor* segment.
TAGRX = re.compile(r'href="[^"]*/(tags?|categor)[a-z]*/[a-z0-9_-]+', re.I)
# Text markers suggesting an anti-bot challenge or a JS-only page.
CF = re.compile(r"cloudflare|just a moment|cf-chl|captcha|enable javascript", re.I)


def _is_video(type_value):
    """Return True when a JSON-LD ``@type`` value names ``VideoObject``.

    ``@type`` may be a single string or a list of type names.
    """
    if isinstance(type_value, list):
        return "VideoObject" in type_value
    return type_value == "VideoObject"


def vobj(h):
    """Return the first JSON-LD ``VideoObject`` node found in HTML, or ``None``.

    Args:
        h: Raw HTML text of a page.

    Returns:
        The decoded dict of the first node whose ``@type`` is (or includes)
        ``"VideoObject"``, searching each ld+json script block in document
        order; ``None`` when no such node exists.
    """
    for m in JL.finditer(h):
        try:
            d = json.loads(m.group(1).strip())
        except ValueError:
            # Malformed JSON-LD: skip this script block and try the next one.
            continue
        if isinstance(d, list):
            items = d
        elif isinstance(d, dict):
            items = d.get("@graph", [d])
            if isinstance(items, dict):
                # "@graph" holding a single node object instead of a list.
                items = [items]
            elif not isinstance(items, list):
                items = []
        else:
            items = []
        for o in items:
            if isinstance(o, dict) and _is_video(o.get("@type")):
                return o
    return None


# (name, listing URL, regex matching a scene-detail path, base URL that is
# prefixed to matches starting with "/").
CANDIDATES = [
    ("xnxx", "https://www.xnxx.com/", r"/video\.?[a-z0-9]+/[a-z0-9_]+", "https://www.xnxx.com"),
    ("redtube", "https://www.redtube.com/newest", r"/[0-9]{6,}", "https://www.redtube.com"),
    ("drtuber", "https://www.drtuber.com/videos/recent", r"/video/[0-9]+/[a-z0-9-]+", "https://www.drtuber.com"),
    ("tube8", "https://www.tube8.com/", r"/[a-z0-9-]+/[0-9]{5,}", "https://www.tube8.com"),
    ("nuvid", "https://www.nuvid.com/", r"/video/[0-9]+/[a-z0-9-]+", "https://www.nuvid.com"),
    ("porntube", "https://www.porntube.com/videos", r"/videos/[a-z0-9-]+-[0-9]+", "https://www.porntube.com"),
    ("anyporn", "https://anyporn.com/latest-updates/", r"/[0-9]+/", "https://anyporn.com"),
    ("motherless", "https://motherless.com/new/videos", r"/[0-9A-F]{6,}", "https://motherless.com"),
    ("sexvid", "https://www.sexvid.xxx/latest-updates/", r"/v/[a-z0-9-]+", "https://www.sexvid.xxx"),
    ("pornone", "https://pornone.com/recent/", r"/[a-z0-9-]+/[0-9]+/", "https://pornone.com"),
]


def yn(b):
    """Return ``"Y"`` for a truthy value and ``"-"`` otherwise."""
    return "Y" if b else "-"


def _probe(client, name, listing, rgx, base):
    """Survey one candidate and return its one-line report.

    Args:
        client: An open HTTP client exposing ``get(url)`` whose responses have
            ``status_code`` and ``text`` (an ``httpx.Client`` in ``main``).
        name: Short candidate label used as the report prefix.
        listing: URL of the listing page to fetch first.
        rgx: Regex source matching a scene-detail path in the listing HTML.
        base: URL prefix applied when the matched path starts with ``"/"``.

    Returns:
        The report line; it never raises for network failures.
    """
    try:
        r = client.get(listing)
    except Exception as e:
        # Deliberate best-effort: one unreachable site must not abort the survey.
        return f"{name:10} LISTING-ERR {str(e)[:40]}"
    if r.status_code != 200:
        return f"{name:10} HTTP {r.status_code} (block/redirect)"
    m = re.search(rgx, r.text)
    if not m:
        return f"{name:10} no-scene-link {'(CF/JS)' if CF.search(r.text) else ''}"
    su = m.group(0)
    su = base + su if su.startswith("/") else su
    try:
        detail = client.get(su)
    except Exception as e:
        # Same best-effort policy as for the listing request.
        return f"{name:10} DETAIL-ERR {str(e)[:30]}"
    if detail.status_code != 200:
        # Mirror the listing check: an error/challenge page is not a usable
        # detail page and must not be reported as OK.
        return f"{name:10} DETAIL-HTTP {detail.status_code}"
    h = detail.text
    o = vobj(h)
    has_dur = bool((o and o.get("duration")) or DUR.search(h) or DUR2.search(h))
    return f"{name:10} OK jsonld={yn(o)} dur={yn(has_dur)} perf={yn(PERF.search(h))} tags={yn(TAGRX.search(h))} {su[:55]}"


def main():
    """Probe every entry of ``CANDIDATES`` and print one report line each."""
    # Imported here so the parsing helpers above stay importable without the
    # third-party HTTP client installed.
    import httpx

    with httpx.Client(timeout=15, headers=UA, follow_redirects=True) as c:
        for name, listing, rgx, base in CANDIDATES:
            print(_probe(c, name, listing, rgx, base))


if __name__ == "__main__":
    main()