Resolver/perf: - find_by_phash_within: nearest match via Postgres bit_count over bit(64) XOR instead of Python scan of all phash fingerprints (~20x faster per scene; unblocks long delta runs that were killed mid-run before since advanced). Scheduler/reliability: - reap ingest_runs stuck in 'running' on worker startup (killed_by_restart). - smoke_test: per-source ingest health, stuck-run and browse-freshness checks -> Sentry; exclude killed_by_restart from the failed-run alarm. Tags (ingest with tags + fill blanks): - wire infer_tag_slugs into normalize_scene so tube scenes get title-inferred tags (was dead code); union with connector tags. - scripts/backfill_inferred_tags.py: keyset/batched/idempotent backfill for existing tagless scenes (playable tag coverage 16% -> ~52%). Clip-store: - skip ManyVids/IWantClips/Clips4Sale/... from canonical sources at ingest (GOON_SKIP_CLIP_STORE, default on) — permanent orphans, ~56% of canonical ingest, never have a free-tube playback source. Browse tubes: - enable fullmovies + hdporn.gg: studio parsed from title prefix instead of the /networks/ sidebar (which always yielded the first listed network); drop phash compute (pilot: 0% canonical hit within Hamming 5 — auto-screenshots), matching relies on title/performer/duration. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
132 lines
4.9 KiB
Python
132 lines
4.9 KiB
Python
"""fullmovies.xxx — latest-vids browse scraper.
|
|
|
|
Identyczny engine co hdporn.gg (KVS sponsor_groups stack): `/videos/<slug>/`,
|
|
`/networks/<slug>/`, `/models/<slug>/`, `/tags/<slug>/`. og:image to `img.fullmovies.xxx/...`
|
|
— **prawdopodobnie auto-screenshot** (jak hdporn.gg → 8% match). Probe potwierdzi.
|
|
"""
|
|
from __future__ import annotations

import html
import re

from app.connectors.base import (
    RawFingerprint,
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
    meta_content,
)
from app.normalize.text import slugify
|
|
|
|
# Site origin; the listing URLs are built on top of it.
_BASE = "https://www.fullmovies.xxx"

# Absolute scene-detail links (`/videos/<slug>/`). Group 1 is the full URL.
_SCENE_URL_RE = re.compile(
    r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"',
    flags=re.IGNORECASE,
)

# The three link patterns below share one shape: group 1 is the slug taken
# from the href, group 2 is the text that follows the opening tag up to the
# next "<".

# `/networks/<slug>/` links. Not referenced by FullmoviesScraper in this file
# (the studio is derived from the title prefix instead); kept defined in case
# other modules import it.
_NETWORK_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)

# `/models/<slug>/` links (performers).
_MODEL_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)

# `/tags/<slug>/` links.
_TAG_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)',
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
class FullmoviesScraper(BaseBrowseScraper):
    """Browse scraper for the fullmovies.xxx "latest updates" listing.

    Provides the listing-URL builder, the scene-URL extractor and the detail
    page parser (presumably the hooks BaseBrowseScraper drives — confirm
    against the base class).
    """

    sitetag = "fullmoviesxxx"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``.

        Any page number <= 1 maps to the unnumbered first page.
        """
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the unique scene URLs found in ``listing_html``, in first-seen order."""
        # dict.fromkeys drops duplicates while preserving insertion order.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse one scene detail page into a ``RawScene``.

        Args:
            scene_url: URL of the detail page; used as the scene URL, the
                playback page URL and as part of the external id.
            detail_html: Raw HTML of the detail page.

        Returns:
            The parsed scene, or ``None`` when the page has no ``og:title``
            or the title is empty once the site boilerplate is stripped.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # Strip the site's boilerplate prefix/suffixes from the og:title.
        title = re.sub(
            r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$",
            "",
            title,
            flags=re.IGNORECASE,
        ).strip()
        if not title:
            # Nothing but boilerplate: treat it like a missing title.
            return None

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript digits, which int() rejects with ValueError.
        if dur_meta and dur_meta.isdecimal():
            duration_sec = int(dur_meta)

        # Studio comes from the title PREFIX ("Studio - Scene Title"), not from
        # the /networks/ sidebar. The sidebar lists ALL networks, so taking the
        # first _NETWORK_LINK_RE match always returned the first listed one
        # (Brazzers) for every scene — mis-attribution. After cleanup the title
        # has the form "Studio - Description" (e.g. "Fake Hostel - ...").
        studio: RawStudio | None = None
        prefix, separator, _ = title.partition(" - ")
        studio_name = prefix.strip()
        if separator and studio_name and len(studio_name) <= 50:
            studio_slug = slugify(studio_name)
            # Assumes slugify may return "" for names with no slug-able
            # characters; skip the studio rather than emit an empty id/slug.
            if studio_slug:
                studio = RawStudio(
                    external_id=f"fullmoviesxxx:studio:{studio_slug}",
                    name=studio_name,
                    slug=studio_slug,
                )

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            # Link text is raw HTML: decode entities (&amp;, &#039;, ...)
            # before trimming so names are stored as plain text.
            slug, name = m.group(1), html.unescape(m.group(2)).strip()
            # Skip blank link text, repeats, and the generic section links.
            if not name or slug in seen_perf or name.lower() in ("pornstars", "models"):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(external_id=f"fullmoviesxxx:model:{slug}", name=name)
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), html.unescape(m.group(2)).strip()
            # Same blank-name guard as for performers: a whitespace-only
            # anchor must not claim the slug, otherwise a later properly
            # labelled link for the same tag would be dropped.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"fullmoviesxxx:tag:{slug}", name=name, slug=slug)
            )

        # Phash DISABLED (pilot 2026-06-01: 0% hits at Hamming <= 5, median
        # Hamming 14 to canonical — img.fullmovies.xxx auto-screenshots, not
        # hot-linked studio thumbnails). Matching relies on
        # title+performer+duration (seed: 92% tagged), so downloading the
        # thumbnail just for a phash is pure overhead. thumbnail_url is kept
        # (display).
        fingerprints: list[RawFingerprint] = []

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|