"""hdporn.gg — latest-vids browse scraper. Engine podobny do freshporno: `/videos//` URL, `/networks//` = studio, `/models//` = performer, `/tags//` = tag. Quirk: og:image to internal CDN `img.hdporn.gg/...` — przed merging do prod sprawdzamy phash distance (gate-keeper: jeśli Hamming >5 dla >70% scen → orphan factory, wyłącz; analogia do shyfap). """ from __future__ import annotations import re from urllib.parse import urljoin from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, meta_content, ) from app.normalize.text import slugify _BASE = "https://www.hdporn.gg" _SCENE_URL_RE = re.compile(r'href="(https://www\.hdporn\.gg/videos/[a-z0-9\-]+/)"', re.IGNORECASE) _NETWORK_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _MODEL_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _TAG_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) class HDPornGGScraper(BaseBrowseScraper): sitetag = "hdporngg" def _listing_url(self, page: int) -> str: if page <= 1: return f"{_BASE}/latest-updates/" return f"{_BASE}/latest-updates/{page}/" def _extract_scene_urls(self, listing_html: str) -> list[str]: seen: set[str] = set() out: list[str] = [] for m in _SCENE_URL_RE.finditer(listing_html): url = m.group(1) if url in seen: continue seen.add(url) out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: title = meta_content(detail_html, property="og:title") if not title: return None # og:title typowo zawiera ": Free HD Porn" suffix — usuń title = re.sub(r":\s*Free HD Porn\s*$", "", title, flags=re.IGNORECASE).strip() # I "Brazzers - " prefix często też w title — zostaw, bo studio name w title # to silny sygnał dla fuzzy match. description = meta_content(detail_html, property="og:description") thumbnail_url = meta_content(detail_html, property="og:image") duration_sec: int | None = None dur_meta = meta_content(detail_html, property="video:duration") if dur_meta and dur_meta.isdigit(): duration_sec = int(dur_meta) # Studio z PREFIKSU tytułu ("Studio - Scene Title"), nie z sidebara /networks/. # Sidebar listuje WSZYSTKIE sieci → pierwszy match zawsze ten sam (Brazzers) dla # każdej sceny. Tytuł po oczyszczeniu ma format "Studio - Opis" (np. "Dad Crush - ..."). studio: RawStudio | None = None if " - " in title: studio_name = title.split(" - ", 1)[0].strip() if studio_name and len(studio_name) <= 50: studio = RawStudio( external_id=f"hdporngg:studio:{slugify(studio_name)}", name=studio_name, slug=slugify(studio_name), ) performers: list[RawPerformer] = [] seen_perf: set[str] = set() for m in _MODEL_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if not name or slug in seen_perf or name.lower() in ("pornstars", "models"): continue seen_perf.add(slug) performers.append( RawPerformer(external_id=f"hdporngg:model:{slug}", name=name) ) tags: list[RawTag] = [] seen_tag: set[str] = set() for m in _TAG_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_tag: continue seen_tag.add(slug) tags.append( RawTag(external_id=f"hdporngg:tag:{slug}", name=name, slug=slug) ) # Phash WYŁĄCZONY (pilot 2026-06-01: 0% trafień ≤5, mediana Hamming 14 do # canonical — auto-screenshoty img.hdporn.gg, nie hot-linkowane studio thumbnaile). # Matching trzyma się na title+performer+duration (seed: 92% tagged), więc download # thumbnaila pod phash to czysty narzut. thumbnail_url zostaje (display). fingerprints: list[RawFingerprint] = [] playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )