fix(latestpornvideo): revive search via /actor/ listing + metadata
Old regex matched junk (/wp-json etc.), not scenes (scenes are /<post_id>/). Frozen since 06-13. Rewrote search() to scrape the /actor/<slug>/ listing and parse <article> cards: scene URL, title, performers + tags from the class (actors-*/tag-*/category-*, dropping performer-name fragment tags), thumbnail. Studio + release date parsed from the "<Studio>-YYYY-MM-DD" thumbnail filename, with a title-prefix "<Studio> YY MM DD" fallback. Multi-performer works; no duration in listing; playback unchanged (hoster). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e77deef667
commit
b1a530611f
1 changed files with 169 additions and 8 deletions
|
|
@ -1,19 +1,180 @@
|
||||||
"""latestpornvideo.com — direct HTML scrape.
|
"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven).
|
||||||
|
|
||||||
Search: `https://latestpornvideo.com/page/<n>/?s=<q>`.
|
2026-06-16 fix (zamrożony od 06-13): stary regex łapał śmieci (`/wp-json` itp.),
|
||||||
Scene URL: `https://latestpornvideo.com/<slug>/`.
|
nie sceny. Sceny to `/<post_id>/` (numeryczne). Czytamy listing performera
|
||||||
|
`/actor/<slug>/` i parsujemy karty `<article>`.
|
||||||
|
|
||||||
|
Metadane z karty (listing, bez detail-fetcha):
|
||||||
|
- klasa `<article>`: `actors-<slug>` (multi) → performerzy; `tag-<slug>` (multi) +
|
||||||
|
`category-<slug>` → tagi (filtrujemy fragmenty imienia performera)
|
||||||
|
- `<a href title="...">` → URL sceny (/<id>/) + tytuł
|
||||||
|
- `data-main-thumb` → thumbnail; jego nazwa pliku koduje `<Studio>-YYYY-MM-DD-...`
|
||||||
|
→ wyłuskujemy studio + release_date (gdy pasuje wzorzec)
|
||||||
|
|
||||||
|
Duration NIE ma w listingu (pusty span). Playback: extractor `latestpornvideocom`
|
||||||
|
(_embed_iframe → luluvid/hoster, phone-side).
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://latestpornvideo.com"
|
||||||
|
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
|
||||||
|
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
|
||||||
|
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
|
||||||
|
_CLASS_ACTOR_RE = re.compile(r"\bactors-([a-z0-9-]+)")
|
||||||
|
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
|
||||||
|
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
|
||||||
|
# Nazwa thumba: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg` (np. Analized-2021-01-09-Amirah-...).
|
||||||
|
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
|
||||||
|
# Tytuł: `<Studio> YY MM DD <rest>` (np. "MySexMobile 20 10 23 Abella Danger").
|
||||||
|
# Studio (grupa 1) bywa puste, gdy data jest na początku ("21 01 26 Abella Danger").
|
||||||
|
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
|
||||||
|
|
||||||
|
|
||||||
|
def _name_from_slug(slug: str) -> str:
|
||||||
|
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
||||||
|
|
||||||
|
|
||||||
class LatestPornVideoScraper(BaseSearchScraper):
    """Performer-driven scraper for latestpornvideo.com.

    ``search()`` treats the query as a performer name, fetches the
    ``/actor/<slug>/`` listing page and builds one ``RawScene`` per
    ``<article>`` card from the listing markup alone (no per-scene fetch).
    """

    sitetag = "latestpornvideocom"

    # Generic suffixes trimmed from tag/category slugs before they become tags.
    _TAG_SUFFIX_RE = re.compile(r"-(porn|leaks?|videos?)$")
    # The last card has no following <article> to bound it, so look this many
    # characters past the end of its opening tag for the link and thumbnail.
    _LAST_CARD_WINDOW = 1500

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page matching ``query``.

        Args:
            query: Performer name; slugified into the ``/actor/<slug>/`` path.
            page: Listing page; values above 1 append ``page/<n>/`` to the URL.
            limit: Maximum number of scenes to yield; ``None`` means no cap and
                a value of zero or less yields nothing.

        Yields:
            One ``RawScene`` per unique card that has a same-site link and a
            non-empty title. Nothing is yielded when the slug is empty, the
            fetch raises, or the response status is not 200.
        """
        # The cap used to be checked only after a yield, so limit=0 still
        # produced one scene.
        if limit is not None and limit <= 0:
            return
        actor_slug = slugify(query)
        if not actor_slug:
            return
        url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            # NOTE(review): `_timeout` is presumably set by BaseSearchScraper — confirm.
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            # Best-effort connector: a failed fetch means "no results", not a crash.
            log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e)
            return
        if r.status_code != 200:
            log.debug("latestpornvideo actor-page returned HTTP %s (%s)", r.status_code, url)
            return

        text = r.text
        cards = list(_ARTICLE_RE.finditer(text))
        seen: set[str] = set()
        yielded = 0
        for idx, m in enumerate(cards):
            cls = m.group(1)
            # A card's markup runs up to the next <article> tag.
            if idx + 1 < len(cards):
                win_end = cards[idx + 1].start()
            else:
                win_end = m.end() + self._LAST_CARD_WINDOW
            window = text[m.start():win_end]

            link_m = _LINK_RE.search(window)
            if not link_m:
                continue
            # Attribute values are HTML-escaped in the markup (e.g. `&#038;`).
            scene_url = html.unescape(link_m.group(1)).rstrip("/") + "/"
            if not scene_url.startswith(_BASE) or scene_url in seen:
                continue
            seen.add(scene_url)
            title = html.unescape(link_m.group(2)).strip()
            if not title:
                continue

            thumb_m = _THUMB_RE.search(window)
            thumb = html.unescape(thumb_m.group(1)) if thumb_m else None

            # Performers from the `actors-<slug>` class tokens.
            performers: list[RawPerformer] = []
            perf_tokens: set[str] = set()  # single name words, used to filter tags
            perf_slugs: set[str] = set()  # full slugs, used to dedupe and vet studios
            for am in _CLASS_ACTOR_RE.finditer(cls):
                sl = am.group(1)
                if sl in perf_slugs:
                    continue
                perf_slugs.add(sl)
                perf_tokens.update(sl.split("-"))
                performers.append(
                    RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl))
                )
            if not performers:
                # The card carries no actor class: fall back to the queried performer.
                perf_slugs.add(actor_slug)
                perf_tokens.update(actor_slug.split("-"))
                performers.append(
                    RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip())
                )

            # Tags from `tag-*` then `category-*`, skipping performer-name fragments.
            tags: list[RawTag] = []
            seen_tag: set[str] = set()
            for pattern in (_CLASS_TAG_RE, _CLASS_CAT_RE):
                for tm in pattern.finditer(cls):
                    sl = self._TAG_SUFFIX_RE.sub("", tm.group(1))
                    if not sl or sl in seen_tag or sl in perf_tokens:
                        continue
                    seen_tag.add(sl)
                    tags.append(
                        RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl)
                    )

            # Studio + release date from the thumbnail file name (`<Studio>-YYYY-MM-DD-`).
            studio: RawStudio | None = None
            release_date: date | None = None
            if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
                studio_raw = tn.group(1).replace("-", " ").strip()
                studio_slug = slugify(studio_raw) if studio_raw else ""
                # Skip when the "studio" is really a performer's name.
                if studio_slug and studio_slug not in perf_slugs:
                    studio = RawStudio(
                        external_id=f"{self.sitetag}:studio:{studio_slug}",
                        name=studio_raw, slug=studio_slug,
                    )
                try:
                    release_date = date(int(tn.group(2)), int(tn.group(3)), int(tn.group(4)))
                except ValueError:
                    release_date = None

            # Title fallback (`<Studio> YY MM DD ...`) for whatever the thumbnail did not give.
            if (studio is None or release_date is None) and (tm2 := _TITLE_DATE_RE.search(title)):
                if release_date is None:
                    try:
                        # Two-digit years are read as 20YY.
                        release_date = date(
                            2000 + int(tm2.group(2)), int(tm2.group(3)), int(tm2.group(4))
                        )
                    except ValueError:
                        release_date = None
                studio_raw = tm2.group(1).strip(" -–")
                if studio is None and 2 <= len(studio_raw) <= 30:
                    studio_slug = slugify(studio_raw)
                    if studio_slug and studio_slug not in perf_slugs:
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_raw, slug=studio_slug,
                        )

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                release_date=release_date,
                url=scene_url,
                studio=studio,
                performers=performers,
                tags=tags,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue