goon/scripts/debug_pornxp_listing.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

34 lines
1.6 KiB
Python

"""One-off: probe pornxp listing regex parts. DELETE after fixing scraper."""
from app.extractors._fetch import browser_get
import re
# Fetch the front page once; every probe below runs against this same text.
response = browser_get("https://pornxp.ph/", timeout=20.0, follow_redirects=True)
html = response.text
print(f"len: {len(html)}")

# Label -> regex, ordered from the loosest probe to the complete card pattern.
# Comparing the per-probe counts is meant to show which piece of the full
# listing regex loses matches.
# NOTE(review): the "...+dur" and "...+dur+title" probes only anchor on the
# card's opening div and then look for the item_dur / item_title div within a
# bounded window; they do not re-check the url/img parts their labels mention.
probes = {
    "data-id alone":
        r'<div class="item preview"\s+data-id="(\d+)"',
    "data-id+url":
        r'<div class="item preview"\s+data-id="\d+".{0,500}?<a href="(/videos/\d+)"',
    "data-id+url+img":
        r'<div class="item preview"\s+data-id="\d+".{0,800}?<a href="/videos/\d+"[^>]*>.*?<img class="item_img"\s*src="([^"]+)"',
    "data-id+url+img+dur":
        r'<div class="item preview"\s+data-id="\d+".{0,1000}?<div class="item_dur">([^<]+)</div>',
    "data-id+url+img+dur+title":
        r'<div class="item preview"\s+data-id="\d+".{0,1500}?<div class="item_title">([^<]+)</div>',
    "full original":
        r'<div class="item preview"\s+data-id="(?P<id>\d+)"(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>\s*<a href="(?P<url>/videos/\d+)"[^>]*>.*?<img class="item_img"\s+src="(?P<thumb>[^"]+)".*?<div class="item_dur">(?P<dur>[^<]+)</div>.*?<div class="item_title">(?P<title>[^<]+)</div>',
}

# DOTALL lets "." cross newlines inside a card; IGNORECASE tolerates tag casing.
probe_flags = re.DOTALL | re.IGNORECASE
for label, regex in probes.items():
    hits = sum(1 for _ in re.finditer(regex, html, probe_flags))
    print(f"{label}: {hits}")

# Bonus: show the first few card ids (matched case-sensitively, no flags).
all_ids = [
    found.group(1)
    for found in re.finditer(r'<div class="item preview"\s+data-id="(\d+)"', html)
]
print(f"\nall card ids: {all_ids[:8]}...")

# Dump the raw markup of the fifth card, located by position of its opening
# div, to eyeball anything quirky about it. Skipped when fewer than 5 cards.
card_starts = [found.start() for found in re.finditer(r'<div class="item preview"', html)]
if len(card_starts) > 4:
    fifth = card_starts[4]
    print(f"\nCard #5 raw (first 500 chars):\n{html[fifth:fifth + 500]!r}")