diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index 1a52701..12319bf 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -103,7 +103,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # lepszymi źródłami. Dane/pliki scraperów/extractory skasowane. # Special SxyPrnScraper, - PerverzijaScraper, + # PerverzijaScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22, + # user request). Search `?s=` → 429, homepage JS-renderowane; browse przez WP REST API + # (/wp-json/wp/v2/posts) daje tytuł/datę/thumb/studio(category)/tagi. Playback embed-iframe. # FpoxxxScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22, # user request). fpo.xxx to KVS, nie WordPress → search `?s=` zwracał 0; browse z # `/new-/` daje listing tile (tytuł/thumb/duration). Playback i tak phone-side (KVS). @@ -136,6 +138,7 @@ from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402 ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ + PerverzijaScraper, FreshpornoScraper, FpoxxxScraper, # LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven diff --git a/app/connectors/direct_scrapers/perverzija.py b/app/connectors/direct_scrapers/perverzija.py index 5411a5e..e611cb5 100644 --- a/app/connectors/direct_scrapers/perverzija.py +++ b/app/connectors/direct_scrapers/perverzija.py @@ -1,21 +1,148 @@ -"""perverzija.com — direct HTML scrape search results. +"""perverzija.com — latest browse scraper via WordPress REST API. -Search: `https://www.perverzija.com/page//?s=` (WordPress + Cloudflare). -Scene URL: `https://www.perverzija.com//`. 
+Historia: dawniej search scraper (`?s=`), ale 2026-06 perverzija rate-limituje search +(429) a homepage jest JS-renderowane (brak linków postów w surowym HTML) → search +zwracał 0. To WordPress, więc czysty kanał to REST API: `/wp-json/wp/v2/posts` daje +ustrukturyzowany JSON (link, date, title, featured thumb, taksonomie) jednym requestem +na stronę. VPS dociera (curl_cffi bypassuje JA3; 200 nie 403). Przerobione na browse +2026-06-22 (user request). -CF-protected: `browser_get` (curl_cffi) bypassuje JA3 fingerprint blocks. +Z REST `?_embed=1` bierzemy: tytuł, datę, miniaturę (featured_media), STUDIO +(taksonomia `category` — np. "DadCrush"/"TeamSkeet", to studyjny re-up) i tagi +(`post_tag`). Performerów REST nie wystawia (custom taksonomia `stars` bez show_in_rest) +→ puste, dorabia canonical-merge (content studyjny dobrze matchuje TPDB/StashDB; tytuł +i tak ma nazwiska). + +Playback: post page (tube.perverzija.com/{slug}/) embeduje xtremestream iframe → +extractor `perverzijacom` → `_embed_iframe` → hoster resolwowany phone-side. 
""" from __future__ import annotations -import re +import html +import json +import logging +from datetime import date, datetime -from app.connectors.direct_scrapers._search_base import BaseSearchScraper +from app.connectors.base import ( + RawFingerprint, + RawPlaybackSource, + RawScene, + RawStudio, + RawTag, +) +from app.connectors.direct_scrapers._browse_base import ( + BaseBrowseScraper, + compute_thumbnail_phash, +) +from app.extractors import browser_get +from app.normalize.text import slugify + +log = logging.getLogger(__name__) + +_BASE = "https://www.perverzija.com" +_PER_PAGE = 20 -class PerverzijaScraper(BaseSearchScraper): +def _parse_date(value: str | None) -> date | None: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).date() + except ValueError: + return None + + +class PerverzijaScraper(BaseBrowseScraper): sitetag = "perverzijacom" - _search_url_template = "https://www.perverzija.com/page/{page}/?s={query}" - _scene_url_re = re.compile( - r'href="(?Phttps://www\.perverzija\.com/(?P[a-z0-9][a-z0-9\-]+))/"', - re.IGNORECASE, - ) + + def _listing_url(self, page: int) -> str: + return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1" + + # crawl_page nadpisany (REST JSON, nie HTML) → abstrakcje nieużywane. + def _extract_scene_urls(self, listing_html: str) -> list[str]: + return [] + + def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None: + return None + + def crawl_page(self, page: int) -> list[RawScene] | None: + url = self._listing_url(page) + try: + res = browser_get(url, timeout=self._timeout) + except Exception as e: + log.warning("perverzija REST fetch failed (page %d): %s", page, e) + return None + # WP zwraca 400 (rest_post_invalid_page_number) za ostatnią stroną → exhausted. 
+ if res.status_code != 200: + return [] + try: + posts = json.loads(res.text) + except (json.JSONDecodeError, ValueError): + log.warning("perverzija REST: bad JSON page %d", page) + return None + if not isinstance(posts, list) or not posts: + return [] + + out: list[RawScene] = [] + for p in posts: + link = (p.get("link") or "").strip() + title = html.unescape((p.get("title") or {}).get("rendered", "")).strip() + if not link or not title: + continue + release_date = _parse_date(p.get("date")) + + emb = p.get("_embedded") or {} + fm = emb.get("wp:featuredmedia") or [] + thumb = (fm[0].get("source_url") if fm and isinstance(fm[0], dict) else None) or None + + studio: RawStudio | None = None + tags: list[RawTag] = [] + seen_tag: set[str] = set() + for group in emb.get("wp:term") or []: + if not group: + continue + tax = group[0].get("taxonomy") + if tax == "category" and studio is None: + sname = (group[0].get("name") or "").strip() + if sname: + studio = RawStudio( + external_id=f"{self.sitetag}:studio:{slugify(sname)}", + name=sname, slug=slugify(sname), + ) + elif tax == "post_tag": + for g in group: + name = (g.get("name") or "").strip() + sl = (g.get("slug") or slugify(name)).strip() + if not name or sl in seen_tag: + continue + seen_tag.add(sl) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl)) + + fingerprints: list[RawFingerprint] = [] + if thumb: + ph = compute_thumbnail_phash(thumb, referer=_BASE + "/") + if ph: + fingerprints.append(RawFingerprint(kind="phash", value=ph)) + + out.append( + RawScene( + external_id=f"{self.sitetag}:{link}", + title=title, + release_date=release_date, + url=link, + studio=studio, + performers=[], + tags=tags, + fingerprints=fingerprints, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=link, + thumbnail_url=thumb, + ) + ], + ) + ) + + log.info("perverzija REST page %d: %d scenes", page, len(out)) + return out