diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 09669f2..bbacf88 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -101,7 +101,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan. # Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek # match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled. - LatestPornVideoScraper, + # LatestPornVideoScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22, + # user 1da0375e: search-driven nie brał feedu "latest" → stary zestaw w apce). # LatestLeaksScraper — wyłączony 2026-05-12 (source quality report): 16,438 scen, 0.0% # canonical match. Slug-concat tytuły, brak studio/duration/date signali. Solo orphany # usunięte (~15k scen). @@ -148,8 +149,8 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402 -from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402 -from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402 +from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable +from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402 from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402 from app.connectors.direct_scrapers.eporner_api 
import EpornerApiScraper # noqa: E402 @@ -157,6 +158,12 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ FreshpornoScraper, + # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven + # nie brał feedu "latest"). Listing card: tytuł (z embedded " YY MM DD"), + # thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma + # czysto (brak `actors-*`) → puste, dorabia canonical-merge. Playback: luluvid + # iframe → extractor latestpornvideocom (_embed_iframe) → telefon resolwuje. + LatestPornVideoScraper, # SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski # zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page//`. # Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/ @@ -207,18 +214,17 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie # full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.) XVideosBrowseScraper, - # HQFapScraper — dołączony 2026-06-10 (user request). PlayTube CMS, ~120k scen - # (re-upload pornhd.pet). JSON-LD VideoObject (title+uploadDate+duration+thumb+ - # contentUrl) + pille Pornstars/Categories na detail page. Listing nie paginuje - # się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4 - # (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`. - HQFapScraper, - # FourK69Scraper — dołączony 2026-06-10 (user request). Probe 2026-06-01 odrzucił - # po homepage "JS-rendered" — błędnie: scene pages mają pełny SSR + JSON-LD. Ta sama - # platforma PlayTube co hqfap (wspólna baza _playtube.py), ~65k scen, content głównie - # studyjny (4K paysite re-upload). Studio z kategorii matchowanych do listy /studios. - # Stream get_file (www.4kporno.xxx) jak fullmovies → mobile_direct, skip 2160p. 
- FourK69Scraper, + # HQFapScraper / FourK69Scraper — WYŁĄCZONE 2026-06-22 (user request, na razie). + # Oba na PlayTube CMS, ingestowały świeżo i wyglądały żywo, ALE playback w obu padł: + # - hqfap: hosting migrował na `/upload/videos/video_down.mp4` = STAŁY ~3MB stub + # "server down" dla KAŻDEJ sceny (extractor go odrzuca → None), + # - 4k69: get_file nie zwraca już grywalnego URL (extractor resolves nothing → None). + # Scena bez grywalnego źródła = śmieciowy wpis, więc nie ingestujemy nowych. Istniejące + # live playback_sources oznaczone dead na prodzie (znikają z /sources + has_playback). + # Reversible: odkomentuj + odżyw sources gdy hosting wróci. Extractory zostają w + # _REGISTRY (hqfapcom/4k69com) — gotowe gdyby content wrócił. + # HQFapScraper, + # FourK69Scraper, # NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/ # porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration # meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa diff --git a/app/connectors/direct_scrapers/latestpornvideo.py b/app/connectors/direct_scrapers/latestpornvideo.py index e498e05..0983d81 100644 --- a/app/connectors/direct_scrapers/latestpornvideo.py +++ b/app/connectors/direct_scrapers/latestpornvideo.py @@ -1,35 +1,41 @@ -"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven). +"""latestpornvideo.com — latest-vids browse scraper. -2026-06-16 fix (zamrożony od 06-13): stary regex łapał śmieci (`/wp-json` itp.), -nie sceny. Sceny to `//` (numeryczne). Czytamy listing performera -`/actor//` i parsujemy karty `
`. +Historia: dawniej performer-driven search scraper (`/actor/<slug>/`). Problem +(user-report 1da0375e): search-scraper ingestuje TYLKO sceny performerów, których +akurat szukamy → feed strony "latest" nigdy nie wpada, w apce widać stary zestaw, +a na stronie jest świeży. Przerobione na BROWSE (latest chronologicznie z +`/page/<n>/`, page 1 = `/`), 2026-06-22. -Metadane z karty (listing, bez detail-fetcha): - klasa `<article class="...">
`: `actors-<slug>` (multi) → performerzy; `tag-<slug>` (multi) + - `category-<slug>` → tagi (filtrujemy fragmenty imienia performera) - - `<a href="...">` → URL sceny (/<id>/) + tytuł - - `data-main-thumb` → thumbnail; jego nazwa pliku koduje `<Studio>-YYYY-MM-DD-...` - → wyłuskujemy studio + release_date (gdy pasuje wzorzec) +Listing card (zero detail-fetchy — detail page nie ma performerów ani duration):
+ + data-main-thumb="-YYYY-MM-DD-...-cover.jpg" +→ tytuł, miniatura, studio+release_date (z nazwy thumba albo z tytułu + " YY MM DD ..."). Performerzy: listing ICH NIE MA czysto + (homepage karty bez `actors-*`, jak na stronach /actor/), a `tag-*` miesza + fragmenty imion z gatunkami → NIE ufamy tagom jako performerom; performera + dorabia canonical-merge po tytule+duration. Tagi bierzemy ostrożnie. -Duration NIE ma w listingu (pusty span). Playback: extractor `latestpornvideocom` -(_embed_iframe → luluvid/hoster, phone-side). +Playback: luluvid (filemoon family) iframe → extractor `latestpornvideocom` +(_embed_iframe → type='hoster'), telefon resolwuje phone-side. page_url = //. """ from __future__ import annotations import html import logging import re -from collections.abc import Iterator from datetime import date from app.connectors.base import ( - RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag, ) -from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, +) from app.extractors import browser_get from app.normalize.text import slugify @@ -39,42 +45,45 @@ _BASE = "https://latestpornvideo.com" _ARTICLE_RE = re.compile(r']*\bclass="([^"]+)"', re.IGNORECASE) _LINK_RE = re.compile(r'-YYYY-MM-DD--cover.jpg` (np. Analized-2021-01-09-Amirah-...). +# Nazwa thumba: `-YYYY-MM-DD--cover.jpg`. _THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE) # Tytuł: ` YY MM DD ` (np. "MySexMobile 20 10 23 Abella Danger"). -# Studio (grupa 1) bywa puste, gdy data jest na początku ("21 01 26 Abella Danger"). _TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b") +# Karty homepage zawsze siedzą w kategorii "latest-porn-videos" — to nie jest tag. 
+_CAT_SKIP = {"latest-porn-videos", "uncategorized", ""} def _name_from_slug(slug: str) -> str: return " ".join(w.capitalize() for w in slug.split("-") if w) -class LatestPornVideoScraper(BaseSearchScraper): +class LatestPornVideoScraper(BaseBrowseScraper): sitetag = "latestpornvideocom" - def search( - self, query: str, *, page: int = 1, limit: int | None = None - ) -> Iterator[RawScene]: - actor_slug = slugify(query) - if not actor_slug: - return - url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "") - try: - r = browser_get(url, timeout=self._timeout) - except Exception as e: - log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e) - return - if r.status_code != 200: - return + def _listing_url(self, page: int) -> str: + return _BASE + "/" if page <= 1 else f"{_BASE}/page/{page}/" + + # crawl_page nadpisany → poniższe abstrakcje nieużywane, ale wymagane do instancji. + def _extract_scene_urls(self, listing_html: str) -> list[str]: + return [m.group(1) for m in _LINK_RE.finditer(listing_html)] + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + return None + + def crawl_page(self, page: int) -> list[RawScene] | None: + url = self._listing_url(page) + try: + res = browser_get(url, timeout=self._timeout) + text = res.text if hasattr(res, "text") else res + except Exception as e: + log.warning("latestpornvideo browse listing fetch failed (page %d): %s", page, e) + return None - text = r.text anchors = list(_ARTICLE_RE.finditer(text)) + out: list[RawScene] = [] seen: set[str] = set() - yielded = 0 for idx, m in enumerate(anchors): cls = m.group(1) win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500 @@ -84,7 +93,8 @@ class LatestPornVideoScraper(BaseSearchScraper): if not link_m: continue scene_url = link_m.group(1).rstrip("/") + "/" - if not scene_url.startswith(_BASE) or scene_url in seen: + # tylko właściwe posty scen (//), bez nav/kategorii + if not 
re.fullmatch(rf"{re.escape(_BASE)}/\d+/", scene_url) or scene_url in seen: continue seen.add(scene_url) title = html.unescape(link_m.group(2)).strip() @@ -94,42 +104,12 @@ class LatestPornVideoScraper(BaseSearchScraper): thumb_m = _THUMB_RE.search(window) thumb = thumb_m.group(1) if thumb_m else None - # Performerzy z klasy. - performers: list[RawPerformer] = [] - perf_tokens: set[str] = set() - seen_perf: set[str] = set() - for am in _CLASS_ACTOR_RE.finditer(cls): - sl = am.group(1) - if sl in seen_perf: - continue - seen_perf.add(sl) - perf_tokens.update(sl.split("-")) - performers.append( - RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl)) - ) - if not performers: - perf_tokens.update(actor_slug.split("-")) - performers.append( - RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip()) - ) - - # Tagi z klasy: tag-* + category-*; pomijamy fragmenty imienia performera. - tags: list[RawTag] = [] - seen_tag: set[str] = set() - for tm in list(_CLASS_TAG_RE.finditer(cls)) + list(_CLASS_CAT_RE.finditer(cls)): - sl = re.sub(r"-(porn|leaks?|videos?)$", "", tm.group(1)) - if not sl or sl in seen_tag or sl in perf_tokens: - continue - seen_tag.add(sl) - tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl)) - # Studio + release_date z nazwy thumba (`-YYYY-MM-DD-`). studio: RawStudio | None = None release_date: date | None = None if thumb and (tn := _THUMB_NAME_RE.search(thumb)): studio_raw = tn.group(1).replace("-", " ").strip() - # Pomiń gdy "studio" to w istocie imię performera. 
- if studio_raw and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers}: + if studio_raw: studio = RawStudio( external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}", name=studio_raw, slug=slugify(studio_raw), @@ -139,7 +119,7 @@ class LatestPornVideoScraper(BaseSearchScraper): except ValueError: release_date = None - # Fallback z tytułu: ` YY MM DD ...` gdy thumb nie dał studio/daty. + # Fallback z tytułu: ` YY MM DD ...`. if studio is None or release_date is None: if tm2 := _TITLE_DATE_RE.search(title): if release_date is None: @@ -150,31 +130,51 @@ class LatestPornVideoScraper(BaseSearchScraper): except ValueError: release_date = None studio_raw = tm2.group(1).strip(" -–") - if ( - studio is None and 2 <= len(studio_raw) <= 30 - and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers} - ): + if studio is None and 2 <= len(studio_raw) <= 30: studio = RawStudio( external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}", name=studio_raw, slug=slugify(studio_raw), ) - yield RawScene( - external_id=f"{self.sitetag}:{scene_url}", - title=title, - release_date=release_date, - url=scene_url, - studio=studio, - performers=performers, - tags=tags, - playback_sources=[ - RawPlaybackSource( - origin=f"tube:{self.sitetag}", - page_url=scene_url, - thumbnail_url=thumb, - ) - ], + # Tagi: tylko prawdziwe kategorie (category-*), bez "latest-porn-videos". + # `tag-*` POMIJAMY — to mieszanka fragmentów imion performerów i gatunków, + # bez `actors-*` (jak na /actor/) nie da się ich rozdzielić → byłby szum. 
+ tags: list[RawTag] = [] + seen_tag: set[str] = set() + for cm in _CLASS_CAT_RE.finditer(cls): + sl = cm.group(1) + if sl in _CAT_SKIP or sl in seen_tag: + continue + seen_tag.add(sl) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl)) + + fingerprints = [] + if thumb: + ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") + if ph: + from app.connectors.base import RawFingerprint + + fingerprints.append(RawFingerprint(kind="phash", value=ph)) + + out.append( + RawScene( + external_id=f"{self.sitetag}:{scene_url}", + title=title, + release_date=release_date, + url=scene_url, + studio=studio, + performers=[], + tags=tags, + fingerprints=fingerprints, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + thumbnail_url=thumb, + ) + ], + ) ) - yielded += 1 - if limit is not None and yielded >= limit: - return + + log.info("latestpornvideo browse page %d: %d scenes", page, len(out)) + return out