"""One-off: probe pornxp listing regex parts. DELETE after fixing scraper.""" from app.extractors._fetch import browser_get import re r = browser_get("https://pornxp.ph/", timeout=20.0, follow_redirects=True) html = r.text print(f"len: {len(html)}") patterns = [ ("data-id alone", r'
]*>.*?([^<]+)
'), ("data-id+url+img+dur+title", r'
([^<]+)
'), ("full original", r'
]*>\s*]*>.*?(?P[^<]+)
.*?
(?P[^<]+)</div>'), ] for name, p in patterns: n = len(list(re.finditer(p, html, re.DOTALL | re.IGNORECASE))) print(f"{name}: {n}") # Bonus — peek at 5th card to see if anything quirky all_ids = re.findall(r'<div class="item preview"\s+data-id="(\d+)"', html) print(f"\nall card ids: {all_ids[:8]}...") # Find card 5 raw — by index positions = [m.start() for m in re.finditer(r'<div class="item preview"', html)] if len(positions) >= 5: print(f"\nCard #5 raw (first 500 chars):\n{html[positions[4]:positions[4]+500]!r}")