goon/app/connectors/direct_scrapers/fullmovies.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

129 lines
4.4 KiB
Python

"""fullmovies.xxx — latest-vids browse scraper.
Same engine as hdporn.gg (KVS sponsor_groups stack): `/videos/<slug>/`,
`/networks/<slug>/`, `/models/<slug>/`, `/tags/<slug>/`. og:image points to `img.fullmovies.xxx/...`
— **probably an auto-screenshot** (as with hdporn.gg → 8% match). A probe will confirm.
"""
from __future__ import annotations
import re
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
# Site origin; used to build listing URLs and the Referer sent with thumbnail requests.
_BASE = "https://www.fullmovies.xxx"

# All patterns below match only absolute, double-quoted hrefs on the www host.

# Scene detail links: captures the full `/videos/<slug>/` URL.
_SCENE_URL_RE = re.compile(
    r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"',
    flags=re.IGNORECASE,
)

# Network (studio) links: group 1 is the slug, group 2 the raw anchor text up to the next tag.
_NETWORK_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)

# Model (performer) links: same capture layout as the network pattern.
_MODEL_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)

# Tag links: same capture layout as the network pattern.
_TAG_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)
class FullmoviesScraper(BaseBrowseScraper):
    """Browse scraper for the fullmovies.xxx "latest updates" listing.

    Listing pages are scanned for scene detail URLs; each detail page is then
    turned into a ``RawScene`` from its Open Graph meta tags plus the
    network / model / tag links found in the markup.
    """

    sitetag = "fullmoviesxxx"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*.

        Page 1 (and anything below it) maps to the unnumbered first page.
        """
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in *listing_html*, in first-seen order."""
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Args:
            scene_url: URL of the detail page; used as the scene URL, the
                playback page URL and as part of the external id.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or ``None`` when the page has no ``og:title`` or
            the title is empty once the site's boilerplate is stripped.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # Drop the site's boilerplate prefix/suffix around the actual title.
        title = re.sub(
            r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$",
            "",
            title,
            flags=re.IGNORECASE,
        ).strip()
        if not title:
            # Nothing but boilerplate: treat the same as a missing title.
            return None

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta:
            dur_meta = dur_meta.strip()
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, which make int() raise.
            if dur_meta.isdecimal():
                duration_sec = int(dur_meta)

        # The first network link with real anchor text is taken as the studio.
        studio: RawStudio | None = None
        for m in _NETWORK_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if name.lower() in ("networks", ""):
                continue
            studio = RawStudio(
                external_id=f"fullmoviesxxx:network:{slug}",
                name=name,
                slug=slug,
            )
            break

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Skip whitespace-only anchors (e.g. a link wrapping an image)
            # *before* marking the slug as seen, so a later text link for the
            # same model is still picked up.
            if not name or slug in seen_perf or name.lower() in ("pornstars", "models"):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(external_id=f"fullmoviesxxx:model:{slug}", name=name)
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Same empty-anchor guard as for performers.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"fullmoviesxxx:tag:{slug}", name=name, slug=slug)
            )

        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            # The site root is passed as Referer; a falsy result means no hash
            # was produced and the scene is emitted without a fingerprint.
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )