goon/app/connectors/direct_scrapers/latestpornvideo.py
jtrzupek b1a530611f fix(latestpornvideo): revive search via /actor/ listing + metadata
Old regex matched junk (/wp-json etc.), not scenes (scenes are /<post_id>/).
Frozen since 06-13. Rewrote search() to scrape the /actor/<slug>/ listing
and parse <article> cards: scene URL, title, performers + tags from the
class (actors-*/tag-*/category-*, dropping performer-name fragment tags),
thumbnail. Studio + release date parsed from the "<Studio>-YYYY-MM-DD"
thumbnail filename, with a title-prefix "<Studio> YY MM DD" fallback.
Multi-performer works; no duration in listing; playback unchanged (hoster).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 23:20:02 +02:00

180 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven).
2026-06-16 fix (frozen since 06-13): the old regex matched junk (`/wp-json` etc.)
instead of scenes. Scenes live at `/<post_id>/` (numeric). We read the performer
listing `/actor/<slug>/` and parse its `<article>` cards.

Metadata taken from the card (listing only, no detail-page fetch):
- `<article>` class: `actors-<slug>` (multiple) → performers; `tag-<slug>` (multiple) +
  `category-<slug>` → tags (fragments of the performer's name are filtered out)
- `<a href title="...">` → scene URL (/<id>/) + title
- `data-main-thumb` → thumbnail; its file name encodes `<Studio>-YYYY-MM-DD-...`,
  from which studio + release_date are extracted (when the pattern matches);
  otherwise a `<Studio> YY MM DD` title prefix is used as a fallback

Duration is NOT available in the listing (empty span). Playback: extractor
`latestpornvideocom` (_embed_iframe → luluvid/hoster, phone-side).
"""
from __future__ import annotations
import html
import logging
import re
from collections.abc import Iterator
from datetime import date
from app.connectors.base import (
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get
from app.normalize.text import slugify
log = logging.getLogger(__name__)

_BASE = "https://latestpornvideo.com"

# Opening <article ...> tag of a listing card; group 1 is the raw ``class`` value.
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
# Card link `<a href="..." title="...">`: group 1 = scene URL, group 2 = title.
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
# Thumbnail URL carried by the card's ``data-main-thumb`` attribute.
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)

# Class tokens `actors-<slug>`, `tag-<slug>`, `category-<slug>`.
# The lookbehind pins the prefix to the start of a class token. A plain `\b`
# also matches right after a hyphen, so e.g. `actors-tag-team` would leak a
# bogus tag "team" and `tag-category-x` a bogus category "x".
_CLASS_ACTOR_RE = re.compile(r"(?<![\w-])actors-([a-z0-9-]+)")
_CLASS_TAG_RE = re.compile(r"(?<![\w-])tag-([a-z0-9-]+)")
_CLASS_CAT_RE = re.compile(r"(?<![\w-])category-([a-z0-9-]+)")

# Thumbnail file name: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg`
# (e.g. Analized-2021-01-09-Amirah-...).
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
# Title: `<Studio> YY MM DD <rest>` (e.g. "MySexMobile 20 10 23 Abella Danger").
# The studio (group 1) may be empty when the date comes first
# ("21 01 26 Abella Danger").
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
def _name_from_slug(slug: str) -> str:
    """Turn a hyphenated slug into a display name.

    Every non-empty hyphen-separated part is capitalized and the parts are
    joined with single spaces, e.g. ``"abella-danger"`` -> ``"Abella Danger"``.
    Empty parts (leading, trailing or doubled hyphens) are dropped, so an
    empty slug yields an empty string.
    """
    return " ".join(part.capitalize() for part in slug.split("-") if part)
class LatestPornVideoScraper(BaseSearchScraper):
    """Performer-driven search scraper for latestpornvideo.com.

    ``search`` fetches the ``/actor/<slug>/`` listing page and builds one
    ``RawScene`` per ``<article>`` card from listing data only (no
    detail-page fetch): scene URL, title, thumbnail, performers and tags
    from the card's CSS classes, and studio / release date from the
    thumbnail file name with a title-prefix fallback.
    """

    sitetag = "latestpornvideocom"

    # Characters of HTML scanned after the last <article> tag, which has no
    # following card to delimit its window.
    _LAST_CARD_WINDOW = 1500
    # Noise suffixes stripped from tag/category slugs before they are used.
    _TAG_SUFFIX_RE = re.compile(r"-(porn|leaks?|videos?)$")

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page matching ``query``.

        Args:
            query: Performer name; it is slugified to build the listing URL.
            page: 1-based listing page; values above 1 append ``page/<n>/``.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
                A value of zero or less yields nothing.

        Yields:
            One ``RawScene`` per unique scene card found on the page.

        Fetch errors and non-200 responses are logged and produce no results
        rather than raising.
        """
        # Without this guard the first scene would be yielded before the
        # limit is ever checked.
        if limit is not None and limit <= 0:
            return
        actor_slug = slugify(query)
        if not actor_slug:
            return

        url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:  # best-effort: a failed fetch means "no results"
            log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e)
            return
        if r.status_code != 200:
            # 404 is logged quietly; any other status is logged as a warning.
            level = logging.DEBUG if r.status_code == 404 else logging.WARNING
            log.log(level, "latestpornvideo actor-page returned HTTP %s (%s)", r.status_code, url)
            return

        text = r.text
        cards = list(_ARTICLE_RE.finditer(text))
        seen: set[str] = set()
        yielded = 0
        for idx, m in enumerate(cards):
            cls = m.group(1)
            # A card's HTML runs from its <article> tag up to the next one.
            if idx + 1 < len(cards):
                win_end = cards[idx + 1].start()
            else:
                win_end = m.end() + self._LAST_CARD_WINDOW
            window = text[m.start():win_end]

            link_m = _LINK_RE.search(window)
            if not link_m:
                continue
            # Attribute values are HTML-escaped in the markup; decode them.
            scene_url = html.unescape(link_m.group(1)).rstrip("/") + "/"
            if not scene_url.startswith(_BASE) or scene_url in seen:
                continue
            seen.add(scene_url)
            title = html.unescape(link_m.group(2)).strip()
            if not title:
                continue

            thumb_m = _THUMB_RE.search(window)
            thumb = html.unescape(thumb_m.group(1)) if thumb_m else None

            performers, performer_slugs, name_tokens = self._parse_performers(
                cls, actor_slug, query
            )
            tags = self._parse_tags(cls, name_tokens)
            studio, release_date = self._parse_studio_and_date(
                thumb, title, performer_slugs
            )

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                release_date=release_date,
                url=scene_url,
                studio=studio,
                performers=performers,
                tags=tags,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

    def _parse_performers(
        self, cls: str, actor_slug: str, query: str
    ) -> tuple[list[RawPerformer], set[str], set[str]]:
        """Return (performers, performer slugs, name tokens) for one card.

        Performers come from the ``actors-<slug>`` class tokens; when the card
        has none, the searched performer is used as the single fallback.
        """
        performers: list[RawPerformer] = []
        slugs: set[str] = set()
        name_tokens: set[str] = set()
        for am in _CLASS_ACTOR_RE.finditer(cls):
            sl = am.group(1)
            if sl in slugs:
                continue
            slugs.add(sl)
            name_tokens.update(sl.split("-"))
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{sl}",
                    name=_name_from_slug(sl),
                )
            )
        if not performers:
            slugs.add(actor_slug)
            name_tokens.update(actor_slug.split("-"))
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{actor_slug}",
                    name=query.strip(),
                )
            )
        return performers, slugs, name_tokens

    def _parse_tags(self, cls: str, name_tokens: set[str]) -> list[RawTag]:
        """Return tags from ``tag-*`` then ``category-*`` class tokens.

        Slugs that equal a fragment of a performer's name are skipped, and
        each remaining slug is emitted once.
        """
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for pattern in (_CLASS_TAG_RE, _CLASS_CAT_RE):
            for tm in pattern.finditer(cls):
                sl = self._TAG_SUFFIX_RE.sub("", tm.group(1))
                if not sl or sl in seen_tag or sl in name_tokens:
                    continue
                seen_tag.add(sl)
                tags.append(
                    RawTag(
                        external_id=f"{self.sitetag}:tag:{sl}",
                        name=_name_from_slug(sl),
                        slug=sl,
                    )
                )
        return tags

    def _parse_studio_and_date(
        self, thumb: str | None, title: str, performer_slugs: set[str]
    ) -> tuple[RawStudio | None, date | None]:
        """Return (studio, release date), each ``None`` when not derivable.

        The thumbnail file name (``<Studio>-YYYY-MM-DD-``) is tried first; a
        ``<Studio> YY MM DD`` title prefix fills in whatever is still missing.
        """
        studio: RawStudio | None = None
        release_date: date | None = None

        if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
            studio = self._make_studio(
                tn.group(1).replace("-", " ").strip(), performer_slugs
            )
            release_date = self._safe_date(
                int(tn.group(2)), int(tn.group(3)), int(tn.group(4))
            )

        if (studio is None or release_date is None) and (
            tm := _TITLE_DATE_RE.search(title)
        ):
            if release_date is None:
                # Titles carry a two-digit year; it is read as 20YY.
                release_date = self._safe_date(
                    2000 + int(tm.group(2)), int(tm.group(3)), int(tm.group(4))
                )
            prefix = tm.group(1).strip(" -")
            if studio is None and 2 <= len(prefix) <= 30:
                studio = self._make_studio(prefix, performer_slugs)

        return studio, release_date

    def _make_studio(self, name: str, performer_slugs: set[str]) -> RawStudio | None:
        """Build a ``RawStudio`` from ``name``, or ``None`` if it is unusable.

        A name is rejected when it is empty, slugifies to nothing, or is
        really a performer's name (its slug matches a performer slug).
        """
        if not name:
            return None
        studio_slug = slugify(name)
        if not studio_slug or studio_slug in performer_slugs:
            return None
        return RawStudio(
            external_id=f"{self.sitetag}:studio:{studio_slug}",
            name=name,
            slug=studio_slug,
        )

    @staticmethod
    def _safe_date(year: int, month: int, day: int) -> date | None:
        """Return ``date(year, month, day)``, or ``None`` for an invalid date."""
        try:
            return date(year, month, day)
        except ValueError:
            return None