"""hdporn.gg — latest-vids browse scraper. Engine podobny do freshporno: `/videos//` URL, `/networks//` = studio, `/models//` = performer, `/tags//` = tag. Quirk: og:image to internal CDN `img.hdporn.gg/...` — przed merging do prod sprawdzamy phash distance (gate-keeper: jeśli Hamming >5 dla >70% scen → orphan factory, wyłącz; analogia do shyfap). """ from __future__ import annotations import re from urllib.parse import urljoin from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, meta_content, ) _BASE = "https://www.hdporn.gg" _SCENE_URL_RE = re.compile(r'href="(https://www\.hdporn\.gg/videos/[a-z0-9\-]+/)"', re.IGNORECASE) _NETWORK_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _MODEL_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _TAG_LINK_RE = re.compile( r'href="https://www\.hdporn\.gg/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) class HDPornGGScraper(BaseBrowseScraper): sitetag = "hdporngg" def _listing_url(self, page: int) -> str: if page <= 1: return f"{_BASE}/latest-updates/" return f"{_BASE}/latest-updates/{page}/" def _extract_scene_urls(self, listing_html: str) -> list[str]: seen: set[str] = set() out: list[str] = [] for m in _SCENE_URL_RE.finditer(listing_html): url = m.group(1) if url in seen: continue seen.add(url) out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: title = meta_content(detail_html, property="og:title") if not title: return None # og:title typowo zawiera ": Free HD Porn" suffix — usuń title = re.sub(r":\s*Free HD Porn\s*$", "", title, flags=re.IGNORECASE).strip() # I "Brazzers - " prefix często też w title — zostaw, bo studio name w title # to silny sygnał dla fuzzy match. description = meta_content(detail_html, property="og:description") thumbnail_url = meta_content(detail_html, property="og:image") duration_sec: int | None = None dur_meta = meta_content(detail_html, property="video:duration") if dur_meta and dur_meta.isdigit(): duration_sec = int(dur_meta) # Studio z /networks/. Skip nav anchors typu "Networks" / "Pornstars". studio: RawStudio | None = None for m in _NETWORK_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if name.lower() in ("networks", ""): continue # Pierwszy NETWORK link w body to studio sceny (nav sidebar też ma networks # listę — bierzemy gdy `class="btn_sponsor_group"` lub po prostu pierwszy # NIE z sidebara). hdporn.gg pokazuje btn_sponsor_group w main scene area. studio = RawStudio( external_id=f"hdporngg:network:{slug}", name=name, slug=slug, ) break performers: list[RawPerformer] = [] seen_perf: set[str] = set() for m in _MODEL_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_perf or name.lower() in ("pornstars", "models"): continue seen_perf.add(slug) performers.append( RawPerformer(external_id=f"hdporngg:model:{slug}", name=name) ) tags: list[RawTag] = [] seen_tag: set[str] = set() for m in _TAG_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_tag: continue seen_tag.add(slug) tags.append( RawTag(external_id=f"hdporngg:tag:{slug}", name=name, slug=slug) ) fingerprints: list[RawFingerprint] = [] if thumbnail_url: ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )