From 5b67aeeeaf084e1590f944991050e325fc6e8cae Mon Sep 17 00:00:00 2001 From: jtrzupek Date: Tue, 16 Jun 2026 23:11:44 +0200 Subject: [PATCH] fix(sxyland): revive search via /actor/ pages + rich metadata sxyland dropped the /<id>/<slug>/ scene URL format for /<slug>/, so the old regex matched nothing (frozen since 06-07). Rewrote search() to use the performer page /actor/<slug>/ and fetch each scene for full metadata: all performers (with co-stars, from /actor/ links), tags (scoped to the scene's tags-list, not the sidebar), duration + upload date (itemprop), studio from the title prefix (BraZZers/MilfCoach/..., guarded so a performer-name prefix isn't mistaken for a studio). Junk nav pages (Terms of Use etc.) are dropped via a no-duration-and-no-tags guard. Verified: clean studio/performers/tags in DB, 0 errors. Co-Authored-By: Claude Opus 4.8 --- app/connectors/direct_scrapers/sxyland.py | 228 ++++++++++++++++++---- 1 file changed, 185 insertions(+), 43 deletions(-) diff --git a/app/connectors/direct_scrapers/sxyland.py b/app/connectors/direct_scrapers/sxyland.py index 0a602a2..1f50766 100644 --- a/app/connectors/direct_scrapers/sxyland.py +++ b/app/connectors/direct_scrapers/sxyland.py @@ -1,78 +1,220 @@ -"""SxyLandScraper — direct HTML scrape sxyland.com search. +"""sxyland.com — performer-page scrape (search-based, performer-driven). -Search: `https://sxyland.com/?s=` zwraca wyniki w formacie -`https://sxyland.com/<id>/<slug>/`. Filtrujemy linki bez numeric ID -(legal pages typu /18-u-s-c-2257/). +2026-06-16 fix (zamrożony od 06-07): sxyland porzucił URL scen `/<id>/<slug>/` +na rzecz `/<slug>/`, więc stary regex (wymagał cyfry w ścieżce) dawał 0. WordPress `?s=` +filtruje, ale miesza — czystsze są **strony performera** `/actor/<slug>/` +(performer-driven query = nazwa performera → slugify → /actor/<slug>/). 
+ +Bogate metadane (per-scene detail fetch — sxyland to WP tube, taksonomie na scenie): + - performerzy: WSZYSTKIE `/actor/<slug>/` linki (z co-starami; `title="Name"`) + - tagi: `/tag/` + `/category/` (`title="Name"`); część to studia (BangBros/BLACKED/...) + - studio: z prefiksu tytułu "Studio - ..." (`_studio_from_title`); prefiks = performer lub brak " - " → bez studio + - duration: `itemprop="duration"` ISO 8601 z dniami (P0DT0H41M12S) + - release date: `itemprop="uploadDate"` + - title: `og:title` (brak tytułu → scena pomijana) + +Playback przez extractor `sxylandcom` (_embed_iframe → playmogo/dood, phone-side). """ from __future__ import annotations +import html import logging import re -import urllib.parse from collections.abc import Iterator +from datetime import date, datetime -from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene +from app.connectors.base import ( + RawPerformer, + RawPlaybackSource, + RawScene, + RawStudio, + RawTag, +) from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.extractors import browser_get +from app.normalize.text import slugify log = logging.getLogger(__name__) +_BASE = "https://sxyland.com" -_SCENE_URL_RE = re.compile(r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"') +# Linki scen na stronie performera: /<slug>/ (multi-word). Wykluczamy taksonomie/nav. +_SCENE_URL_RE = re.compile(r'href="https://sxyland\.com/([a-z0-9][a-z0-9-]+)/"') +_NAV_SLUGS = frozenset({ + "actor", "actors", "category", "categories", "tag", "tags", "page", "author", + "models", "studios", "search", "home", "login", "register", "18-u-s-c-2257", + "privacy-policy", "cookie-policy", "dmca", "dmca-notice", "contact", "contact-us", + "terms", "terms-of-use", "about", "about-us", "2257", +}) +# Scena-tagi siedzą w pierwszym
...
(NIE w sidebarze/ +# popular-tags widgetcie). Bez scope'u studio łapało globalny "bangbros" na każdej scenie. +_TAGS_BLOCK_RE = re.compile(r'
(.*?)
', re.IGNORECASE | re.DOTALL) + +_ACTOR_LINK_RE = re.compile( + r'href="https://sxyland\.com/actor/[^"/]+/"\s+title="([^"]+)"', re.IGNORECASE +) +_TAG_LINK_RE = re.compile( + r'href="https://sxyland\.com/(?:tag|category)/[^"/]+/"[^>]*title="([^"]+)"', re.IGNORECASE +) +_DURATION_RE = re.compile(r'itemprop="duration"\s+content="([^"]+)"', re.IGNORECASE) +_UPLOADDATE_RE = re.compile(r'itemprop="uploadDate"\s+content="([^"]+)"', re.IGNORECASE) +_OGTITLE_RE = re.compile(r'property="og:title"\s+content="([^"]+)"', re.IGNORECASE) +_ISO_DUR_RE = re.compile( + r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE +) + +def _studio_from_title(title: str, performers: list[RawPerformer]) -> RawStudio | None: + """Studio z prefiksu "Studio - ..." tytułu (jak hdporngg: paysite reposty mają + "BraZZers - ...", "MilfCoach - ..."). Guard: prefiks NIE może być performerem + (tytuł "Amirah Adara - X" → prefiks to imię, nie studio). Brak " - " → brak studio.""" + if " - " not in title: + return None + prefix = title.split(" - ", 1)[0].strip() + if not (2 <= len(prefix) <= 30): + return None + pl = prefix.lower() + for p in performers: + if pl == p.name.lower() or pl in p.name.lower(): + return None + return RawStudio(external_id=f"sxylandcom:studio:{slugify(prefix)}", name=prefix, slug=slugify(prefix)) + + +def _parse_iso_duration(value: str | None) -> int | None: + """`P0DT0H41M12S` → 2472. 
None gdy zero/parse fail.""" + if not value: + return None + m = _ISO_DUR_RE.match(value.strip()) + if not m: + return None + d, h, mn, s = (int(g or 0) for g in m.groups()) + total = d * 86400 + h * 3600 + mn * 60 + s + return total or None + + +def _parse_date(value: str | None) -> date | None: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).date() + except ValueError: + m = re.match(r"(\d{4}-\d{2}-\d{2})", value) + return date.fromisoformat(m.group(1)) if m else None class SxyLandScraper(BaseDirectTubeScraper): sitetag = "sxylandcom" + _timeout: float = 30.0 def search( - self, - query: str, - *, - page: int = 1, - limit: int | None = None, + self, query: str, *, page: int = 1, limit: int | None = None ) -> Iterator[RawScene]: - q = urllib.parse.quote_plus(query.strip()) - url = f"https://sxyland.com/page/{page}/?s={q}" + actor_slug = slugify(query) + if not actor_slug: + return + listing = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "") try: - r = browser_get(url, timeout=30) + r = browser_get(listing, timeout=self._timeout) except Exception as e: - log.warning("sxyland search fetch failed: %s", e) + log.warning("sxyland actor-page fetch failed (%s): %s", listing, e) return if r.status_code != 200: + log.debug("sxyland %s status=%d", listing, r.status_code) return - query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3} - + scene_urls: list[str] = [] seen: set[str] = set() - yielded = 0 for m in _SCENE_URL_RE.finditer(r.text): - scene_url = m.group(1) + "/" - slug = m.group(3) - if scene_url in seen: + slug = m.group(1) + if slug in _NAV_SLUGS or slug in seen: continue - seen.add(scene_url) + seen.add(slug) + scene_urls.append(f"{_BASE}/{slug}/") - slug_lower = slug.lower() - if query_tokens and not any(tok in slug_lower for tok in query_tokens): + yielded = 0 + for scene_url in scene_urls: + scene = self._parse_scene(scene_url, query) + if scene is None: continue - - 
title = slug.replace("-", " ").strip() - - yield RawScene( - external_id=f"sxylandcom:{scene_url}", - title=title, - url=scene_url, - playback_sources=[ - RawPlaybackSource(origin="tube:sxylandcom", page_url=scene_url) - ], - performers=[RawPerformer(name=query.strip())], - raw={ - "source": "direct_scraper:sxyland", - "query": query, - "page": page, - "url": scene_url, - }, - ) + yield scene yielded += 1 if limit is not None and yielded >= limit: return + + def _parse_scene(self, scene_url: str, query: str) -> RawScene | None: + try: + r = browser_get(scene_url, timeout=self._timeout) + if r.status_code != 200: + return None + detail = r.text + except Exception as e: + log.info("sxyland scene fetch failed %s: %s", scene_url, e) + return None + + title = _OGTITLE_RE.search(detail) + title_s = html.unescape(title.group(1)).strip() if title else "" + if not title_s: + return None + + dm = _DURATION_RE.search(detail) + duration_sec = _parse_iso_duration(dm.group(1)) if dm else None + um = _UPLOADDATE_RE.search(detail) + release_date = _parse_date(um.group(1)) if um else None + + # Performerzy: wszystkie /actor/ linki (z co-starami). + performers: list[RawPerformer] = [] + seen_perf: set[str] = set() + for m in _ACTOR_LINK_RE.finditer(detail): + name = html.unescape(m.group(1)).strip() + sl = slugify(name) + if not sl or sl in seen_perf: + continue + seen_perf.add(sl) + performers.append( + RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name) + ) + if not performers: + # Fallback: query (jesteśmy na /actor//, więc to na pewno ona). + performers.append( + RawPerformer( + external_id=f"{self.sitetag}:performer:{slugify(query)}", + name=query.strip(), + ) + ) + + # Tagi — TYLKO z bloku tagów sceny (nie z sidebara/popular widgetu). 
+ tags: list[RawTag] = [] + seen_tag: set[str] = set() + block_m = _TAGS_BLOCK_RE.search(detail) + tags_html = block_m.group(1) if block_m else "" + for m in _TAG_LINK_RE.finditer(tags_html): + name = html.unescape(m.group(1)).strip() + sl = slugify(name) + if not sl or sl in seen_tag: + continue + seen_tag.add(sl) + tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl)) + + # Guard "to realna scena wideo": nav/legal pages (Terms of Use itp.) mają + # sidebar z aktorami (fałszywi performerzy) ale ZERO duration i ZERO tagów. + if duration_sec is None and not tags: + return None + + studio = _studio_from_title(title_s, performers) + + return RawScene( + external_id=f"{self.sitetag}:{scene_url}", + title=title_s, + duration_sec=duration_sec, + release_date=release_date, + url=scene_url, + studio=studio, + performers=performers, + tags=tags, + playback_sources=[ + RawPlaybackSource( + origin=f"tube:{self.sitetag}", + page_url=scene_url, + duration_sec=duration_sec, + ) + ], + )