diff --git a/app/connectors/direct_scrapers/latestpornvideo.py b/app/connectors/direct_scrapers/latestpornvideo.py index 2e6704c..e498e05 100644 --- a/app/connectors/direct_scrapers/latestpornvideo.py +++ b/app/connectors/direct_scrapers/latestpornvideo.py @@ -1,19 +1,180 @@ -"""latestpornvideo.com — direct HTML scrape. +"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven). -Search: `https://latestpornvideo.com/page//?s=`. -Scene URL: `https://latestpornvideo.com//`. +2026-06-16 fix (zamrożony od 06-13): stary regex łapał śmieci (`/wp-json` itp.), +nie sceny. Sceny to `//` (numeryczne). Czytamy listing performera +`/actor//` i parsujemy karty `
`. + +Metadane z karty (listing, bez detail-fetcha): + - klasa `
`: `actors-` (multi) → performerzy; `tag-` (multi) + + `category-` → tagi (filtrujemy fragmenty imienia performera) + - `` → URL sceny (//) + tytuł + - `data-main-thumb` → thumbnail; jego nazwa pliku koduje `-YYYY-MM-DD-...` + → wyłuskujemy studio + release_date (gdy pasuje wzorzec) + +Duration NIE ma w listingu (pusty span). Playback: extractor `latestpornvideocom` +(_embed_iframe → luluvid/hoster, phone-side). """ from __future__ import annotations +import html +import logging import re +from collections.abc import Iterator +from datetime import date +from app.connectors.base import ( + RawPerformer, + RawPlaybackSource, + RawScene, + RawStudio, + RawTag, +) from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.extractors import browser_get +from app.normalize.text import slugify + +log = logging.getLogger(__name__) + +_BASE = "https://latestpornvideo.com" +_ARTICLE_RE = re.compile(r']*\bclass="([^"]+)"', re.IGNORECASE) +_LINK_RE = re.compile(r'-YYYY-MM-DD--cover.jpg` (np. Analized-2021-01-09-Amirah-...). +_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE) +# Tytuł: ` YY MM DD ` (np. "MySexMobile 20 10 23 Abella Danger"). +# Studio (grupa 1) bywa puste, gdy data jest na początku ("21 01 26 Abella Danger"). 
# Title layout "<Studio> YY MM DD <rest>"; group 1 (studio) may be empty when the
# title starts with the date.
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")


def _name_from_slug(slug: str) -> str:
    """Turn a hyphenated slug into a display name, e.g. "abella-danger" -> "Abella Danger"."""
    return " ".join(w.capitalize() for w in slug.split("-") if w)


class LatestPornVideoScraper(BaseSearchScraper):
    """Scraper that reads a performer's `/actor/<slug>/` listing page.

    All metadata is taken from the listing cards; no per-scene detail page
    is fetched by this class.
    """

    sitetag = "latestpornvideocom"

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page matching *query*.

        Args:
            query: Performer name; it is slugified to build the listing URL.
            page: 1-based listing page. Values of 1 or lower request the
                first page.
            limit: Maximum number of scenes to yield. ``None`` means no cap;
                zero or a negative value yields nothing and skips the fetch.

        Yields:
            One ``RawScene`` per unique listing card that has a scene link
            under the site base URL and a non-empty title.

        Fetch errors are logged as warnings and end the generator; a non-200
        response ends it silently.
        """
        # The cap used to be checked only after a yield, so limit=0 still
        # produced one scene (and a network round-trip). Reject it up front.
        if limit is not None and limit <= 0:
            return

        actor_slug = slugify(query)
        if not actor_slug:
            return
        url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            # Best-effort connector: any fetch failure just yields no results.
            log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e)
            return
        if r.status_code != 200:
            return

        text = r.text
        anchors = list(_ARTICLE_RE.finditer(text))
        seen: set[str] = set()
        yielded = 0
        for idx, m in enumerate(anchors):
            cls = m.group(1)
            # A card spans from its own match to the next card's match; the
            # last card gets a fixed 1500-character look-ahead instead.
            win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500
            window = text[m.start():win_end]

            link_m = _LINK_RE.search(window)
            if not link_m:
                continue
            # Normalise to exactly one trailing slash so duplicates compare equal.
            scene_url = link_m.group(1).rstrip("/") + "/"
            if not scene_url.startswith(_BASE) or scene_url in seen:
                continue
            seen.add(scene_url)
            title = html.unescape(link_m.group(2)).strip()
            if not title:
                continue

            thumb_m = _THUMB_RE.search(window)
            thumb = thumb_m.group(1) if thumb_m else None

            # Performers come from the card's class attribute.
            performers: list[RawPerformer] = []
            perf_tokens: set[str] = set()
            seen_perf: set[str] = set()
            for am in _CLASS_ACTOR_RE.finditer(cls):
                sl = am.group(1)
                if sl in seen_perf:
                    continue
                seen_perf.add(sl)
                perf_tokens.update(sl.split("-"))
                performers.append(
                    RawPerformer(
                        external_id=f"{self.sitetag}:performer:{sl}",
                        name=_name_from_slug(sl),
                    )
                )
            if not performers:
                # No performer in the class: fall back to the queried one.
                perf_tokens.update(actor_slug.split("-"))
                performers.append(
                    RawPerformer(
                        external_id=f"{self.sitetag}:performer:{actor_slug}",
                        name=query.strip(),
                    )
                )
            # Built once; used below to reject a "studio" that is really a performer.
            performer_slugs = {p.external_id.rsplit(":", 1)[1] for p in performers}

            # Tags come from the class too (tag patterns first, then category),
            # skipping anything that is just a fragment of a performer name.
            tags: list[RawTag] = []
            seen_tag: set[str] = set()
            for pattern in (_CLASS_TAG_RE, _CLASS_CAT_RE):
                for tm in pattern.finditer(cls):
                    sl = re.sub(r"-(porn|leaks?|videos?)$", "", tm.group(1))
                    if not sl or sl in seen_tag or sl in perf_tokens:
                        continue
                    seen_tag.add(sl)
                    tags.append(
                        RawTag(
                            external_id=f"{self.sitetag}:tag:{sl}",
                            name=_name_from_slug(sl),
                            slug=sl,
                        )
                    )

            # Studio and release date from the thumbnail file name
            # ("<Studio>-YYYY-MM-DD-...").
            studio: RawStudio | None = None
            release_date: date | None = None
            if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
                studio_raw = tn.group(1).replace("-", " ").strip()
                if studio_raw:
                    studio_slug = slugify(studio_raw)
                    if studio_slug not in performer_slugs:
                        studio = RawStudio(
                            external_id=f"{self.sitetag}:studio:{studio_slug}",
                            name=studio_raw,
                            slug=studio_slug,
                        )
                try:
                    release_date = date(int(tn.group(2)), int(tn.group(3)), int(tn.group(4)))
                except ValueError:
                    # Digits matched the pattern but are not a real calendar date.
                    release_date = None

            # Fallback on the title ("<Studio> YY MM DD ...") for whatever the
            # thumbnail did not provide.
            if studio is None or release_date is None:
                if tm2 := _TITLE_DATE_RE.search(title):
                    if release_date is None:
                        try:
                            release_date = date(
                                2000 + int(tm2.group(2)), int(tm2.group(3)), int(tm2.group(4))
                            )
                        except ValueError:
                            release_date = None
                    studio_raw = tm2.group(1).strip(" -–")
                    if studio is None and 2 <= len(studio_raw) <= 30:
                        studio_slug = slugify(studio_raw)
                        if studio_slug not in performer_slugs:
                            studio = RawStudio(
                                external_id=f"{self.sitetag}:studio:{studio_slug}",
                                name=studio_raw,
                                slug=studio_slug,
                            )

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                release_date=release_date,
                url=scene_url,
                studio=studio,
                performers=performers,
                tags=tags,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return