Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
34 lines
1.6 KiB
Python
34 lines
1.6 KiB
Python
"""One-off: probe pornxp listing regex parts. DELETE after fixing scraper."""
|
|
from app.extractors._fetch import browser_get
|
|
import re
|
|
|
|
# Fetch the listing page once; every probe below runs against this one snapshot.
response = browser_get("https://pornxp.ph/", timeout=20.0, follow_redirects=True)
html = response.text
print(f"len: {len(html)}")

# (label, regex) pairs. Each probe anchors on the card's opening <div> and
# reaches for one more piece of the card; the last entry is the complete
# listing pattern. Comparing the per-probe counts shows which piece of the
# full pattern stops matching.
patterns = [
    (
        "data-id alone",
        r'<div class="item preview"\s+data-id="(\d+)"',
    ),
    (
        "data-id+url",
        r'<div class="item preview"\s+data-id="\d+".{0,500}?<a href="(/videos/\d+)"',
    ),
    (
        "data-id+url+img",
        r'<div class="item preview"\s+data-id="\d+".{0,800}?<a href="/videos/\d+"[^>]*>.*?<img class="item_img"\s*src="([^"]+)"',
    ),
    (
        "data-id+url+img+dur",
        r'<div class="item preview"\s+data-id="\d+".{0,1000}?<div class="item_dur">([^<]+)</div>',
    ),
    (
        "data-id+url+img+dur+title",
        r'<div class="item preview"\s+data-id="\d+".{0,1500}?<div class="item_title">([^<]+)</div>',
    ),
    (
        "full original",
        r'<div class="item preview"\s+data-id="(?P<id>\d+)"(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>\s*<a href="(?P<url>/videos/\d+)"[^>]*>.*?<img class="item_img"\s+src="(?P<thumb>[^"]+)".*?<div class="item_dur">(?P<dur>[^<]+)</div>.*?<div class="item_title">(?P<title>[^<]+)</div>',
    ),
]

# All probes share the same flags: "." spans newlines and matching ignores case.
probe_flags = re.DOTALL | re.IGNORECASE
for label, regex in patterns:
    hits = sum(1 for _ in re.finditer(regex, html, probe_flags))
    print(f"{label}: {hits}")

# Bonus: list the first few card ids (note: this search uses no regex flags).
all_ids = re.findall(r'<div class="item preview"\s+data-id="(\d+)"', html)
print(f"\nall card ids: {all_ids[:8]}...")

# Dump the raw markup of the fifth card (index 4) to spot anything quirky.
positions = [match.start() for match in re.finditer(r'<div class="item preview"', html)]
if len(positions) > 4:
    fifth_start = positions[4]
    snippet = html[fifth_start:fifth_start + 500]
    print(f"\nCard #5 raw (first 500 chars):\n{snippet!r}")
|