"""fullmovies.xxx — latest-vids browse scraper. Identyczny engine co hdporn.gg (KVS sponsor_groups stack): `/videos//`, `/networks//`, `/models//`, `/tags//`. og:image to `img.fullmovies.xxx/...` — **prawdopodobnie auto-screenshot** (jak hdporn.gg → 8% match). Probe potwierdzi. """ from __future__ import annotations import re from app.connectors.base import ( RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) from app.connectors.direct_scrapers._browse_base import ( BaseBrowseScraper, compute_thumbnail_phash, meta_content, ) from app.normalize.text import slugify _BASE = "https://www.fullmovies.xxx" _SCENE_URL_RE = re.compile(r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"', re.IGNORECASE) _NETWORK_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _MODEL_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) _TAG_LINK_RE = re.compile( r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE ) class FullmoviesScraper(BaseBrowseScraper): sitetag = "fullmoviesxxx" def _listing_url(self, page: int) -> str: if page <= 1: return f"{_BASE}/latest-updates/" return f"{_BASE}/latest-updates/{page}/" def _extract_scene_urls(self, listing_html: str) -> list[str]: seen: set[str] = set() out: list[str] = [] for m in _SCENE_URL_RE.finditer(listing_html): url = m.group(1) if url in seen: continue seen.add(url) out.append(url) return out def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: title = meta_content(detail_html, property="og:title") if not title: return None title = re.sub(r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$", "", title, flags=re.IGNORECASE).strip() description = meta_content(detail_html, property="og:description") thumbnail_url = meta_content(detail_html, property="og:image") duration_sec: int | None = None dur_meta = meta_content(detail_html, property="video:duration") if dur_meta and dur_meta.isdigit(): duration_sec = int(dur_meta) # Studio z PREFIKSU tytułu ("Studio - Scene Title"), nie z sidebara /networks/. # Sidebar listuje WSZYSTKIE sieci → `_NETWORK_LINK_RE.finditer().first()` zawsze # zwracał pierwszą z listy (Brazzers) dla każdej sceny — mis-attribution. Tytuł # po oczyszczeniu ma format "Studio - Opis" (np. "Fake Hostel - ..."). studio: RawStudio | None = None if " - " in title: studio_name = title.split(" - ", 1)[0].strip() if studio_name and len(studio_name) <= 50: studio = RawStudio( external_id=f"fullmoviesxxx:studio:{slugify(studio_name)}", name=studio_name, slug=slugify(studio_name), ) performers: list[RawPerformer] = [] seen_perf: set[str] = set() for m in _MODEL_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if not name or slug in seen_perf or name.lower() in ("pornstars", "models"): continue seen_perf.add(slug) performers.append( RawPerformer(external_id=f"fullmoviesxxx:model:{slug}", name=name) ) tags: list[RawTag] = [] seen_tag: set[str] = set() for m in _TAG_LINK_RE.finditer(detail_html): slug, name = m.group(1), m.group(2).strip() if slug in seen_tag: continue seen_tag.add(slug) tags.append(RawTag(external_id=f"fullmoviesxxx:tag:{slug}", name=name, slug=slug)) # Phash WYŁĄCZONY (pilot 2026-06-01: 0% trafień ≤5, mediana Hamming 14 do # canonical — auto-screenshoty img.fullmovies.xxx, nie hot-linkowane studio # thumbnaile). Matching trzyma się na title+performer+duration (seed: 92% tagged), # więc download thumbnaila pod phash to czysty narzut. thumbnail_url zostaje (display). fingerprints: list[RawFingerprint] = [] playback_sources = [ RawPlaybackSource( origin=f"tube:{self.sitetag}", page_url=scene_url, duration_sec=duration_sec, thumbnail_url=thumbnail_url, ) ] return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, url=scene_url, studio=studio, performers=performers, tags=tags, fingerprints=fingerprints, playback_sources=playback_sources, )