"""sxyland.com performer-page scraper (search-based, performer-driven).

2026-06-16 fix (the scraper had been frozen since 06-07): sxyland dropped the
scene-URL shape that carried a numeric path segment in favour of a single
slug segment directly under the site root, so the old regex (which required
a digit in the path) matched nothing. The WordPress `?s=` search filters but
mixes results; performer pages under `/actor/` are cleaner, so the query is
treated as a performer name: query -> slugify -> `/actor/` + slug.

Metadata comes from a per-scene detail fetch (sxyland is a WordPress tube and
the taxonomies are rendered on the scene page):

- performers: every `/actor/` link on the page, co-stars included
  (`title="Name"`)
- tags: `/tag/` and `/category/` links (`title="Name"`) inside the scene's
  tag block
- studio: heuristic, taken from a "Studio - ..." title prefix (see
  `_studio_from_title`); no usable prefix -> no studio
- duration: `itemprop="duration"`, ISO 8601 with a day part (P0DT0H41M12S)
- release date: `itemprop="uploadDate"`
- title: `og:title`

Playback is resolved by the `sxylandcom` extractor.
"""
from __future__ import annotations

import html
import logging
import re
from collections.abc import Iterator
from datetime import date, datetime

from app.connectors.base import (
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
from app.normalize.text import slugify

log = logging.getLogger(__name__)

_BASE = "https://sxyland.com"

# Scene links on a performer page: one slug segment directly under the site
# root. Taxonomy / navigation / legal slugs are filtered out via _NAV_SLUGS.
_SCENE_URL_RE = re.compile(r'href="https://sxyland\.com/([a-z0-9][a-z0-9-]+)/"')
_NAV_SLUGS = frozenset({
    "actor", "actors", "category", "categories", "tag", "tags", "page",
    "author", "models", "studios", "search", "home", "login", "register",
    "18-u-s-c-2257", "privacy-policy", "cookie-policy", "dmca", "dmca-notice",
    "contact", "contact-us", "terms", "terms-of-use", "about", "about-us",
    "2257",
})

# Scene tags live in the scene's own tag block, NOT in the sidebar /
# popular-tags widget. Without this scoping a site-wide tag from the widget
# was picked up on every scene.
# NOTE(review): the delimiters of this pattern were lost in the copy of the
# file under review (the literal was split across raw newlines, which is a
# syntax error). `<div class="tags-list">...</div>` is a reconstruction and
# must be confirmed against the live page markup / the pristine file.
_TAGS_BLOCK_RE = re.compile(
    r'<div class="tags-list">(.*?)</div>', re.IGNORECASE | re.DOTALL
)
_ACTOR_LINK_RE = re.compile(
    r'href="https://sxyland\.com/actor/[^"/]+/"\s+title="([^"]+)"', re.IGNORECASE
)
_TAG_LINK_RE = re.compile(
    r'href="https://sxyland\.com/(?:tag|category)/[^"/]+/"[^>]*title="([^"]+)"',
    re.IGNORECASE,
)
_DURATION_RE = re.compile(r'itemprop="duration"\s+content="([^"]+)"', re.IGNORECASE)
_UPLOADDATE_RE = re.compile(r'itemprop="uploadDate"\s+content="([^"]+)"', re.IGNORECASE)
_OGTITLE_RE = re.compile(r'property="og:title"\s+content="([^"]+)"', re.IGNORECASE)
_ISO_DUR_RE = re.compile(
    r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE
)


def _studio_from_title(title: str, performers: list[RawPerformer]) -> RawStudio | None:
    """Derive a studio from a "Studio - ..." title prefix.

    Returns None when the title has no " - " separator, when the prefix is
    shorter than 2 or longer than 30 characters, or when the prefix is (part
    of) a performer's name -- a "Performer Name - ..." title carries a person
    in front, not a studio.
    """
    if " - " not in title:
        return None
    prefix = title.split(" - ", 1)[0].strip()
    if not (2 <= len(prefix) <= 30):
        return None
    lowered = prefix.lower()
    # Substring containment also covers exact equality with a performer name.
    if any(lowered in p.name.lower() for p in performers):
        return None
    slug = slugify(prefix)
    return RawStudio(external_id=f"sxylandcom:studio:{slug}", name=prefix, slug=slug)


def _parse_iso_duration(value: str | None) -> int | None:
    """Convert an ISO 8601 duration such as `P0DT0H41M12S` to seconds (2472).

    Returns None for empty input, for input the pattern does not match, and
    when the parsed total is zero.
    """
    if not value:
        return None
    m = _ISO_DUR_RE.match(value.strip())
    if not m:
        return None
    days, hours, minutes, seconds = (int(g or 0) for g in m.groups())
    total = days * 86400 + hours * 3600 + minutes * 60 + seconds
    return total or None


def _parse_date(value: str | None) -> date | None:
    """Parse an ISO-like timestamp into a date; None when it cannot be parsed.

    The full value is tried first (a trailing "Z" is rewritten to "+00:00").
    If that fails, a leading YYYY-MM-DD prefix is tried on its own. A prefix
    that looks like a date but is not a valid calendar date yields None
    instead of raising.
    """
    if not value:
        return None
    value = value.strip()
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
    except ValueError:
        pass
    m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
    if not m:
        return None
    try:
        return date.fromisoformat(m.group(1))
    except ValueError:
        # e.g. "2024-02-30": matches the digit pattern but is not a real date.
        return None


class SxyLandScraper(BaseDirectTubeScraper):
    """Scrapes sxyland.com performer pages and the scene pages they link to."""

    sitetag = "sxylandcom"
    _timeout: float = 30.0

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page derived from `query`.

        `query` is slugified into an `/actor/` page URL; `page` > 1 appends
        `page/<n>/`. Each candidate scene link is fetched and parsed in turn;
        pages that do not parse as a scene are skipped. At most `limit`
        scenes are yielded when `limit` is given (a non-positive limit yields
        nothing). Fetch failures and non-200 listings end the iteration
        quietly after logging.
        """
        if limit is not None and limit <= 0:
            return
        actor_slug = slugify(query)
        if not actor_slug:
            return
        listing = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            r = browser_get(listing, timeout=self._timeout)
        except Exception as e:
            log.warning("sxyland actor-page fetch failed (%s): %s", listing, e)
            return
        if r.status_code != 200:
            log.debug("sxyland %s status=%d", listing, r.status_code)
            return

        # Collect unique scene URLs in page order, dropping nav/legal slugs.
        scene_urls: list[str] = []
        seen: set[str] = set()
        for m in _SCENE_URL_RE.finditer(r.text):
            slug = m.group(1)
            if slug in _NAV_SLUGS or slug in seen:
                continue
            seen.add(slug)
            scene_urls.append(f"{_BASE}/{slug}/")

        yielded = 0
        for scene_url in scene_urls:
            scene = self._parse_scene(scene_url, query)
            if scene is None:
                continue
            yield scene
            yielded += 1
            if limit is not None and yielded >= limit:
                return

    def _collect_performers(self, detail: str, query: str) -> list[RawPerformer]:
        """Return de-duplicated performers from every /actor/ link in `detail`."""
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _ACTOR_LINK_RE.finditer(detail):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_perf:
                continue
            seen_perf.add(sl)
            performers.append(
                RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
            )
        if not performers:
            # Fallback: the scene was reached from this performer's own page,
            # so attribute it to the query.
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{slugify(query)}",
                    name=query.strip(),
                )
            )
        return performers

    def _collect_tags(self, detail: str) -> list[RawTag]:
        """Return de-duplicated tags found inside the scene's tag block only."""
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        block_m = _TAGS_BLOCK_RE.search(detail)
        tags_html = block_m.group(1) if block_m else ""
        for m in _TAG_LINK_RE.finditer(tags_html):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_tag:
                continue
            seen_tag.add(sl)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))
        return tags

    def _parse_scene(self, scene_url: str, query: str) -> RawScene | None:
        """Fetch one scene page and build a RawScene from it.

        Returns None when the fetch fails or is not a 200, when the page has
        no `og:title`, or when the page has neither a duration nor any scene
        tags (which is how non-video pages are recognised).
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                return None
            detail = r.text
        except Exception as e:
            log.info("sxyland scene fetch failed %s: %s", scene_url, e)
            return None

        title = _OGTITLE_RE.search(detail)
        title_s = html.unescape(title.group(1)).strip() if title else ""
        if not title_s:
            return None

        dm = _DURATION_RE.search(detail)
        duration_sec = _parse_iso_duration(dm.group(1)) if dm else None
        um = _UPLOADDATE_RE.search(detail)
        release_date = _parse_date(um.group(1)) if um else None

        performers = self._collect_performers(detail, query)
        tags = self._collect_tags(detail)

        # "Is this a real video scene" guard: nav/legal pages have a sidebar
        # with actor links (false performers) but no duration and no tags.
        if duration_sec is None and not tags:
            return None

        studio = _studio_from_title(title_s, performers)
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title_s,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                )
            ],
        )