goon/app/connectors/direct_scrapers/latestpornvideo.py
jtrzupek f34a75f4c6 feat(ingest): disable hqfap/4k69 (broken playback), latestpornvideo → browse
- hqfap + 4k69: both ingested fresh but playback is dead (hqfap serves a fixed
  ~3MB "server down" stub for every scene; 4k69 resolves no playable URL).
  Removed from ALL_BROWSE_SCRAPERS so no new dead sources get ingested; existing
  live playback_sources marked dead in prod (scenes drop out of has_playback /
  Sites). Extractors kept in registry for easy re-enable if the hosts recover.
- latestpornvideo: was a performer-search scraper, so it never picked up the
  site's "latest" feed — users saw a stale set. Converted to a browse scraper
  reading /page/N/ (studio+date from title/thumb, category tags; performers via
  canonical merge). Moved DIRECT → BROWSE list.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-22 09:34:47 +02:00

180 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""latestpornvideo.com — latest-vids browse scraper.
Historia: dawniej performer-driven search scraper (`/actor/<slug>/`). Problem
(user-report 1da0375e): search-scraper ingestuje TYLKO sceny performerów, których
akurat szukamy → feed strony "latest" nigdy nie wpada, w apce widać stary zestaw,
a na stronie jest świeży. Przerobione na BROWSE (latest chronologicznie z
`/page/<n>/`, page 1 = `/`), 2026-06-22.
Listing card (zero detail-fetchy — detail page nie ma performerów ani duration):
<article class="... post-<id> ... category-<cat> tag-<x> tag-<y> ...">
<a href="https://latestpornvideo.com/<id>/" title="<Tytuł>">
data-main-thumb="<Studio>-YYYY-MM-DD-...-cover.jpg"
→ tytuł, miniatura, studio+release_date (z nazwy thumba albo z tytułu
"<Studio> YY MM DD ..."). Performerzy: listing ICH NIE MA czysto
(homepage karty bez `actors-*`, jak na stronach /actor/), a `tag-*` miesza
fragmenty imion z gatunkami → NIE ufamy tagom jako performerom; performera
dorabia canonical-merge po tytule+duration. Tagi bierzemy ostrożnie.
Playback: luluvid (filemoon family) iframe → extractor `latestpornvideocom`
(_embed_iframe → type='hoster'), telefon resolwuje phone-side. page_url = /<id>/.
"""
from __future__ import annotations
import html
import logging
import re
from datetime import date
from app.connectors.base import (
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.extractors import browser_get
from app.normalize.text import slugify
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Site root; listing URLs are built from it and scene URLs are validated against it.
_BASE = "https://latestpornvideo.com"
# Opening <article> tag of a listing card; group 1 is its full class attribute.
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
# Anchor whose href is directly followed by a title attribute; groups: (href, title).
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
# Value of the card's data-main-thumb attribute (the thumbnail reference).
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
# `tag-<slug>` tokens in the article class attribute (not referenced in the
# part of the file visible here).
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
# `category-<slug>` tokens in the article class attribute.
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
# Thumbnail name: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg`.
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
# Title: `<Studio> YY MM DD <rest>` (e.g. "MySexMobile 20 10 23 Abella Danger").
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
# Homepage cards always sit in the "latest-porn-videos" category — that is not a tag.
_CAT_SKIP = {"latest-porn-videos", "uncategorized", ""}
def _name_from_slug(slug: str) -> str:
return " ".join(w.capitalize() for w in slug.split("-") if w)
class LatestPornVideoScraper(BaseBrowseScraper):
    """Browse scraper for the chronological listing of latestpornvideo.com.

    Every listing page is parsed card by card (one ``<article>`` element per
    scene). ``crawl_page`` fetches only the listing itself and hands each
    thumbnail reference to ``compute_thumbnail_phash``; detail pages are never
    requested. Scenes are emitted with an empty performer list.
    """

    sitetag = "latestpornvideocom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*; any page <= 1 maps to the site root."""
        return _BASE + "/" if page <= 1 else f"{_BASE}/page/{page}/"

    # crawl_page is overridden, so the two hooks below are not used by this
    # class; per the original note they are only needed so that the class can
    # be instantiated (presumably they are abstract in BaseBrowseScraper).
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the href of every ``<a href=... title=...>`` anchor in the HTML."""
        return [m.group(1) for m in _LINK_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Detail pages are not parsed by this scraper; always returns None."""
        return None

    def _make_studio(self, name: str) -> RawStudio:
        """Build a RawStudio for *name*, namespacing its external id by sitetag."""
        slug = slugify(name)
        return RawStudio(
            external_id=f"{self.sitetag}:studio:{slug}",
            name=name,
            slug=slug,
        )

    def _card_studio_and_date(
        self, thumb: str | None, title: str
    ) -> tuple[RawStudio | None, date | None]:
        """Derive studio and release date for one card.

        The thumbnail name (``<Studio>-YYYY-MM-DD-``) is tried first. Whatever
        it did not yield is then looked up in the title
        (``<Studio> YY MM DD ...``, with the two-digit year read as 20YY).

        Args:
            thumb: Value of the card's ``data-main-thumb`` attribute, or None.
            title: Unescaped, stripped card title.

        Returns:
            ``(studio, release_date)``; either element may be None.
        """
        studio: RawStudio | None = None
        release_date: date | None = None

        if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
            studio_raw = tn.group(1).replace("-", " ").strip()
            if studio_raw:
                studio = self._make_studio(studio_raw)
            try:
                release_date = date(int(tn.group(2)), int(tn.group(3)), int(tn.group(4)))
            except ValueError:
                # Digits that do not form a real calendar date.
                release_date = None

        # Fallback from the title: `<Studio> YY MM DD ...`.
        if studio is None or release_date is None:
            if tm := _TITLE_DATE_RE.search(title):
                if release_date is None:
                    try:
                        release_date = date(
                            2000 + int(tm.group(2)), int(tm.group(3)), int(tm.group(4))
                        )
                    except ValueError:
                        release_date = None
                studio_raw = tm.group(1).strip(" -")
                # Title prefixes outside 2..30 characters are not accepted
                # as a studio name.
                if studio is None and 2 <= len(studio_raw) <= 30:
                    studio = self._make_studio(studio_raw)

        return studio, release_date

    def _card_category_tags(self, cls: str) -> list[RawTag]:
        """Return one RawTag per distinct ``category-*`` class, minus ``_CAT_SKIP``.

        ``tag-*`` classes are deliberately ignored: they mix fragments of
        performer names with genres, and without ``actors-*`` classes (as on
        the /actor/ pages) the two cannot be told apart, so they would only
        add noise.
        """
        tags: list[RawTag] = []
        seen_slugs: set[str] = set()
        for cm in _CLASS_CAT_RE.finditer(cls):
            sl = cm.group(1)
            if sl in _CAT_SKIP or sl in seen_slugs:
                continue
            seen_slugs.add(sl)
            tags.append(
                RawTag(
                    external_id=f"{self.sitetag}:tag:{sl}",
                    name=_name_from_slug(sl),
                    slug=sl,
                )
            )
        return tags

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and convert every scene card into a RawScene.

        Args:
            page: Listing page number (see ``_listing_url``).

        Returns:
            The scenes parsed from the page (possibly an empty list), or None
            when the listing could not be fetched.
        """
        # Function-scope import as in the original, but executed once per page
        # instead of once per card.
        from app.connectors.base import RawFingerprint

        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("latestpornvideo browse listing fetch failed (page %d): %s", page, e)
            return None
        # NOTE(review): browser_get's return type is not visible from this
        # file. Anything that is not a str cannot be scanned by the str
        # patterns below, so it is reported like a failed fetch rather than
        # raising TypeError further down.
        if not isinstance(text, str):
            log.warning("latestpornvideo browse listing returned non-text body (page %d)", page)
            return None

        # Only real scene posts (/<digits>/) are accepted — no nav/category links.
        scene_url_re = re.compile(rf"{re.escape(_BASE)}/\d+/")

        cards = list(_ARTICLE_RE.finditer(text))
        out: list[RawScene] = []
        seen: set[str] = set()
        for idx, card in enumerate(cards):
            # A card's markup is taken to run up to the next <article>; the
            # last card has no successor, so it gets a fixed 1500-character
            # window after its opening tag instead.
            if idx + 1 < len(cards):
                win_end = cards[idx + 1].start()
            else:
                win_end = card.end() + 1500
            window = text[card.start():win_end]

            link_m = _LINK_RE.search(window)
            if not link_m:
                continue
            scene_url = link_m.group(1).rstrip("/") + "/"
            if not scene_url_re.fullmatch(scene_url) or scene_url in seen:
                continue
            title = html.unescape(link_m.group(2)).strip()
            if not title:
                continue
            # Mark the URL only once the card is known to produce a scene.
            seen.add(scene_url)

            thumb_m = _THUMB_RE.search(window)
            thumb = thumb_m.group(1) if thumb_m else None

            studio, release_date = self._card_studio_and_date(thumb, title)
            tags = self._card_category_tags(card.group(1))

            fingerprints = []
            if thumb:
                ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
                if ph:
                    fingerprints.append(RawFingerprint(kind="phash", value=ph))

            out.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    release_date=release_date,
                    url=scene_url,
                    studio=studio,
                    # The listing carries no reliable performer data.
                    performers=[],
                    tags=tags,
                    fingerprints=fingerprints,
                    playback_sources=[
                        RawPlaybackSource(
                            origin=f"tube:{self.sitetag}",
                            page_url=scene_url,
                            thumbnail_url=thumb,
                        )
                    ],
                )
            )
        log.info("latestpornvideo browse page %d: %d scenes", page, len(out))
        return out