"""fullmovies.xxx — latest-vids browse scraper. Identyczny engine co hdporn.gg (KVS sponsor_groups stack): `/videos//`, `/networks//`, `/models//`, `/tags//`. og:image to `img.fullmovies.xxx/...` — **prawdopodobnie auto-screenshot** (jak hdporn.gg → 8% match). Probe potwierdzi. """ from __future__ import annotations import re from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, meta_content, ) _BASE = "https://www.fullmovies.xxx" _SCENE_URL_RE = re.compile(r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"', re.IGNORECASE) _NETWORK_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _MODEL_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _TAG_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) class FullmoviesScraper(BaseBrowseScraper): sitetag = "fullmoviesxxx" def _listing_url(self, page: int) -> str: if page <= 1: return f"{_BASE}/latest-updates/" return f"{_BASE}/latest-updates/{page}/" def _extract_scene_urls(self, listing_html: str) -> list[str]: seen: set[str] = set() out: list[str] = [] for m in _SCENE_URL_RE.finditer(listing_html): url = m.group(1) if url in seen: continue seen.add(url) out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: title = meta_content(detail_html, property="og:title") if not title: return None title = re.sub(r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$", "", title, flags=re.IGNORECASE).strip() description = meta_content(detail_html, property="og:description") thumbnail_url = meta_content(detail_html, property="og:image") duration_sec: int | None = None dur_meta = meta_content(detail_html, property="video:duration") if dur_meta and dur_meta.isdigit(): duration_sec = int(dur_meta) studio: RawStudio | None = None for m in _NETWORK_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if name.lower() in ("networks", ""): continue studio = RawStudio( external_id=f"fullmoviesxxx:network:{slug}", name=name, slug=slug, ) break performers: list[RawPerformer] = [] seen_perf: set[str] = set() for m in _MODEL_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_perf or name.lower() in ("pornstars", "models"): continue seen_perf.add(slug) performers.append( RawPerformer(external_id=f"fullmoviesxxx:model:{slug}", name=name) ) tags: list[RawTag] = [] seen_tag: set[str] = set() for m in _TAG_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_tag: continue seen_tag.add(slug) tags.append(RawTag(external_id=f"fullmoviesxxx:tag:{slug}", name=name, slug=slug)) fingerprints: list[RawFingerprint] = [] if thumbnail_url: ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/") if ph: fingerprints.append(RawFingerprint(kind="phash", value=ph)) playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )