- hqfap + 4k69: both ingested fresh but playback is dead (hqfap serves a fixed ~3MB "server down" stub for every scene; 4k69 resolves no playable URL). Removed from ALL_BROWSE_SCRAPERS so no new dead sources get ingested; existing live playback_sources marked dead in prod (scenes drop out of has_playback / Sites). Extractors kept in registry for easy re-enable if the hosts recover. - latestpornvideo: was a performer-search scraper, so it never picked up the site's "latest" feed — users saw a stale set. Converted to a browse scraper reading /page/N/ (studio+date from title/thumb, category tags; performers via canonical merge). Moved DIRECT → BROWSE list. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
180 lines
7.6 KiB
Python
180 lines
7.6 KiB
Python
"""latestpornvideo.com — latest-vids browse scraper.
|
||
|
||
History: formerly a performer-driven search scraper (`/actor/<slug>/`). Problem
(user-report 1da0375e): a search scraper ingests ONLY the scenes of performers we
happen to be searching for → the site's "latest" feed never gets picked up, so the
app shows a stale set while the site itself has fresh content. Converted to BROWSE
(latest, chronologically, from `/page/<n>/`; page 1 = `/`), 2026-06-22.

Listing card (zero detail fetches — the detail page has neither performers nor duration):
    <article class="... post-<id> ... category-<cat> tag-<x> tag-<y> ...">
      <a href="https://latestpornvideo.com/<id>/" title="<Title>">
      data-main-thumb="<Studio>-YYYY-MM-DD-...-cover.jpg"
  → title, thumbnail, studio+release_date (from the thumb file name or from the
    title "<Studio> YY MM DD ..."). Performers: the listing does NOT carry them
    cleanly (homepage cards have no `actors-*` classes, unlike /actor/ pages), and
    `tag-*` mixes performer-name fragments with genres → we do NOT trust tags as
    performers; the performer is filled in by canonical-merge on title+duration.
    Tags are taken cautiously.

Playback: luluvid (filemoon family) iframe → extractor `latestpornvideocom`
(_embed_iframe → type='hoster'); the phone resolves it phone-side. page_url = /<id>/.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import html
|
||
import logging
|
||
import re
|
||
from datetime import date
|
||
|
||
from app.connectors.base import (
|
||
RawPlaybackSource,
|
||
RawScene,
|
||
RawStudio,
|
||
RawTag,
|
||
)
|
||
from app.connectors.direct_scrapers._browse_base import (
|
||
BaseBrowseScraper,
|
||
compute_thumbnail_phash,
|
||
)
|
||
from app.extractors import browser_get
|
||
from app.normalize.text import slugify
|
||
|
||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Site root without a trailing slash; listing URLs and the scene-URL check are built from it.
_BASE = "https://latestpornvideo.com"
# Captures the whole `class` attribute value of an <article> opening tag.
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
# Captures (href, title) of an <a> tag whose `href` attribute is directly followed by `title`.
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
# Captures the value of a `data-main-thumb` attribute.
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
# `tag-<slug>` tokens of a class attribute (not referenced by the code visible in this file).
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
# `category-<slug>` tokens of a class attribute.
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
# Thumb file name: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg`.
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
# Title: `<Studio> YY MM DD <rest>` (e.g. "MySexMobile 20 10 23 Abella Danger").
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
# Homepage cards always sit in the "latest-porn-videos" category — that is not a tag.
_CAT_SKIP = {"latest-porn-videos", "uncategorized", ""}
|
||
|
||
|
||
def _name_from_slug(slug: str) -> str:
|
||
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
||
|
||
|
||
class LatestPornVideoScraper(BaseBrowseScraper):
    """Browse scraper for the chronological "latest" listing of latestpornvideo.com.

    Scenes are assembled entirely from listing cards (``<article>`` elements):
    ``crawl_page`` is overridden and never requests a detail page.
    """

    sitetag = "latestpornvideocom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; any page <= 1 maps to the site root."""
        if page <= 1:
            return _BASE + "/"
        return f"{_BASE}/page/{page}/"

    # crawl_page is overridden, so the two hooks below are unused; they are
    # only present because they are required to instantiate the class.
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the ``href`` of every ``<a href=... title=...>`` anchor in the listing."""
        return [m.group(1) for m in _LINK_RE.finditer(listing_html)]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Detail pages are never parsed by this scraper; always return ``None``."""
        return None

    @staticmethod
    def _lpv_date(year: int, month: int, day: int) -> date | None:
        """Build a ``date``; return ``None`` when the values are not a valid calendar date."""
        try:
            return date(year, month, day)
        except ValueError:
            return None

    def _lpv_studio(self, name: str) -> RawStudio:
        """Wrap a studio display name in a ``RawStudio`` keyed by its slug."""
        slug = slugify(name)
        return RawStudio(
            external_id=f"{self.sitetag}:studio:{slug}",
            name=name,
            slug=slug,
        )

    def _lpv_studio_and_date(
        self, title: str, thumb: str | None
    ) -> tuple[RawStudio | None, date | None]:
        """Derive ``(studio, release_date)`` for one card; either may be ``None``.

        The thumb file name (``<Studio>-YYYY-MM-DD-``) is the primary source.
        Whatever it does not yield is filled in from a title of the form
        ``<Studio> YY MM DD ...``.
        """
        studio: RawStudio | None = None
        release_date: date | None = None

        if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
            studio_raw = tn.group(1).replace("-", " ").strip()
            if studio_raw:
                studio = self._lpv_studio(studio_raw)
            release_date = self._lpv_date(
                int(tn.group(2)), int(tn.group(3)), int(tn.group(4))
            )

        # Fallback from the title: `<Studio> YY MM DD ...`.
        if (studio is None or release_date is None) and (
            tm := _TITLE_DATE_RE.search(title)
        ):
            if release_date is None:
                # Two-digit years are read as 20YY.
                release_date = self._lpv_date(
                    2000 + int(tm.group(2)), int(tm.group(3)), int(tm.group(4))
                )
            studio_raw = tm.group(1).strip(" -–")
            # Only accept a title prefix of 2..30 characters as a studio name.
            if studio is None and 2 <= len(studio_raw) <= 30:
                studio = self._lpv_studio(studio_raw)

        return studio, release_date

    def _lpv_category_tags(self, css_classes: str) -> list[RawTag]:
        """Build tags from the ``category-*`` tokens of an article's class attribute.

        Only real categories are used, minus the ``_CAT_SKIP`` entries.
        ``tag-*`` tokens are deliberately IGNORED: they mix performer-name
        fragments with genres, and without ``actors-*`` classes (as on /actor/
        pages) the two cannot be told apart, so they would only add noise.
        """
        tags: list[RawTag] = []
        seen_slugs: set[str] = set()
        for cm in _CLASS_CAT_RE.finditer(css_classes):
            sl = cm.group(1)
            if sl in _CAT_SKIP or sl in seen_slugs:
                continue
            seen_slugs.add(sl)
            tags.append(
                RawTag(
                    external_id=f"{self.sitetag}:tag:{sl}",
                    name=_name_from_slug(sl),
                    slug=sl,
                )
            )
        return tags

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and build a ``RawScene`` per scene card.

        Args:
            page: Listing page number; values <= 1 fetch the site root.

        Returns:
            The scenes parsed from the page (possibly an empty list), or
            ``None`` when the listing could not be fetched or the fetch did
            not produce text.
        """
        # Function-local import (as in the original), but resolved once per
        # page instead of once per card inside the loop.
        from app.connectors.base import RawFingerprint

        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
            # browser_get may hand back a response-like object or the body itself.
            text = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("latestpornvideo browse listing fetch failed (page %d): %s", page, e)
            return None
        if not isinstance(text, str):
            # A non-text body (e.g. None) would otherwise raise TypeError in the
            # regex scan below; treat it like a failed fetch.
            log.warning("latestpornvideo browse listing fetch returned no text (page %d)", page)
            return None

        # Loop invariants: only real scene posts (/<digits>/) are accepted.
        scene_url_re = re.compile(rf"{re.escape(_BASE)}/\d+/")
        referer = _BASE + "/"

        anchors = list(_ARTICLE_RE.finditer(text))
        out: list[RawScene] = []
        seen: set[str] = set()
        for idx, m in enumerate(anchors):
            # A card's window runs up to the next <article>; the last card gets
            # a fixed 1500-character window after its opening tag match.
            win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500
            window = text[m.start():win_end]

            link_m = _LINK_RE.search(window)
            if not link_m:
                continue
            scene_url = link_m.group(1).rstrip("/") + "/"
            # Only real scene posts (/<digits>/), no nav/category links, no duplicates.
            if not scene_url_re.fullmatch(scene_url) or scene_url in seen:
                continue
            seen.add(scene_url)
            title = html.unescape(link_m.group(2)).strip()
            if not title:
                continue

            thumb_m = _THUMB_RE.search(window)
            # Attribute values are HTML-escaped in the markup; decode entities so
            # the URL handed to the pHash fetch and stored as thumbnail is usable.
            thumb = html.unescape(thumb_m.group(1)) if thumb_m else None

            studio, release_date = self._lpv_studio_and_date(title, thumb)
            tags = self._lpv_category_tags(m.group(1))

            fingerprints = []
            if thumb:
                ph = compute_thumbnail_phash(thumb, referer=referer)
                if ph:
                    fingerprints.append(RawFingerprint(kind="phash", value=ph))

            out.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    release_date=release_date,
                    url=scene_url,
                    studio=studio,
                    # The listing exposes no reliable performer data.
                    performers=[],
                    tags=tags,
                    fingerprints=fingerprints,
                    playback_sources=[
                        RawPlaybackSource(
                            origin=f"tube:{self.sitetag}",
                            page_url=scene_url,
                            thumbnail_url=thumb,
                        )
                    ],
                )
            )

        log.info("latestpornvideo browse page %d: %d scenes", page, len(out))
        return out
|