fix(sxyland): revive search via /actor/ pages + rich metadata

sxyland dropped the /<numeric_id>/<slug>/ scene URL format for /<slug>/,
so the old regex matched nothing (frozen since 06-07). Rewrote search()
to use the performer page /actor/<slug>/ and fetch each scene for full
metadata: all performers (with co-stars, from /actor/ links), tags
(scoped to the scene's tags-list, not the sidebar), duration + upload
date (itemprop), studio from the title prefix (BraZZers/MilfCoach/... ,
guarded so a performer-name prefix isn't mistaken for a studio). Junk
nav pages (Terms of Use etc.) are dropped via a no-duration-and-no-tags
guard. Verified: clean studio/performers/tags in DB, 0 errors.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-16 23:11:44 +02:00
parent e0e69189a8
commit 5b67aeeeaf

View file

@ -1,78 +1,220 @@
"""SxyLandScraper — direct HTML scrape sxyland.com search. """sxyland.com — performer-page scrape (search-based, performer-driven).
Previous approach (returned nothing since 06-07): search `https://sxyland.com/?s=<query>`,
which listed results as `https://sxyland.com/<numeric_id>/<slug>/`; links without a numeric
ID (legal pages such as /18-u-s-c-2257/) were filtered out.
2026-06-16 fix: sxyland dropped the `/<numeric_id>/<slug>/` scene URL format in favour of
`/<slug>/`, so the old regex (which required a digit in the path) matched nothing. The
WordPress `?s=` search does filter, but mixes results; the **performer pages**
`/actor/<slug>/` are cleaner (performer-driven: query = performer name -> slugify ->
/actor/<slug>/).
Rich metadata (one detail fetch per scene; sxyland is a WP tube with taxonomies on the scene):
- performers: ALL `/actor/<slug>/` links (co-stars included; `title="Name"`)
- tags: `/tag/` + `/category/` (`title="Name"`); some of them are studios (BangBros/BLACKED/...)
- studio: heuristically from the "Studio - ..." title prefix (`_studio_from_title`); no match -> no studio
- duration: `itemprop="duration"`, ISO 8601 with days (P0DT0H41M12S)
- release date: `itemprop="uploadDate"`
- title: `og:title`
Playback goes through the `sxylandcom` extractor (_embed_iframe -> playmogo/dood, phone-side).
""" """
from __future__ import annotations from __future__ import annotations
import html
import logging import logging
import re import re
import urllib.parse
from collections.abc import Iterator from collections.abc import Iterator
from datetime import date, datetime
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene from app.connectors.base import (
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get from app.extractors import browser_get
from app.normalize.text import slugify
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Site root; listing and scene URLs are built from it.
_BASE = "https://sxyland.com"
_SCENE_URL_RE = re.compile(r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"')  # legacy /<numeric_id>/<slug>/ pattern; rebound by the next assignment
# Scene links on a performer page: a single-segment /<slug>/ path (multi-word slug).
# Taxonomy and navigation slugs are excluded separately via _NAV_SLUGS.
_SCENE_URL_RE = re.compile(r'href="https://sxyland\.com/([a-z0-9][a-z0-9-]+)/"')
# Single-segment slugs that are navigation/legal/taxonomy pages rather than scenes.
_NAV_SLUGS = frozenset({
"actor", "actors", "category", "categories", "tag", "tags", "page", "author",
"models", "studios", "search", "home", "login", "register", "18-u-s-c-2257",
"privacy-policy", "cookie-policy", "dmca", "dmca-notice", "contact", "contact-us",
"terms", "terms-of-use", "about", "about-us", "2257",
})
# Scene tags live in the first <div class="tags-list">...</div> (NOT in the sidebar /
# popular-tags widget). Without this scoping the studio heuristic picked up the global
# "bangbros" entry on every scene.
_TAGS_BLOCK_RE = re.compile(r'<div class="tags-list">(.*?)</div>', re.IGNORECASE | re.DOTALL)
# Captures the title attribute of an /actor/<slug>/ link; only whitespace may sit
# between the href and the title attribute.
_ACTOR_LINK_RE = re.compile(
r'href="https://sxyland\.com/actor/[^"/]+/"\s+title="([^"]+)"', re.IGNORECASE
)
# Captures the title attribute of a /tag/ or /category/ link; other attributes may
# appear between the href and the title.
_TAG_LINK_RE = re.compile(
r'href="https://sxyland\.com/(?:tag|category)/[^"/]+/"[^>]*title="([^"]+)"', re.IGNORECASE
)
# Each of the three patterns below captures the content attribute of one metadata element.
_DURATION_RE = re.compile(r'itemprop="duration"\s+content="([^"]+)"', re.IGNORECASE)
_UPLOADDATE_RE = re.compile(r'itemprop="uploadDate"\s+content="([^"]+)"', re.IGNORECASE)
_OGTITLE_RE = re.compile(r'property="og:title"\s+content="([^"]+)"', re.IGNORECASE)
# ISO 8601 duration with optional day/hour/minute/second components (groups 1-4);
# every component, and the "T" separator, is optional.
_ISO_DUR_RE = re.compile(
r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE
)
def _studio_from_title(title: str, performers: list[RawPerformer]) -> RawStudio | None:
"""Studio z prefiksu "Studio - ..." tytułu (jak hdporngg: paysite reposty mają
"BraZZers - ...", "MilfCoach - ..."). Guard: prefiks NIE może być performerem
(tytuł "Amirah Adara - X" prefiks to imię, nie studio). Brak " - " brak studio."""
if " - " not in title:
return None
prefix = title.split(" - ", 1)[0].strip()
if not (2 <= len(prefix) <= 30):
return None
pl = prefix.lower()
for p in performers:
if pl == p.name.lower() or pl in p.name.lower():
return None
return RawStudio(external_id=f"sxylandcom:studio:{slugify(prefix)}", name=prefix, slug=slugify(prefix))
def _parse_iso_duration(value: str | None) -> int | None:
    """Convert an ISO 8601 duration such as ``P0DT0H41M12S`` (-> 2472) to seconds.

    Returns ``None`` when ``value`` is empty, does not match ``_ISO_DUR_RE``,
    or adds up to zero seconds.
    """
    if not value:
        return None
    match = _ISO_DUR_RE.match(value.strip())
    if match is None:
        return None
    # Components absent from the input come back as None; count them as 0.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    total = seconds + 60 * (minutes + 60 * (hours + 24 * days))
    return total if total else None
def _parse_date(value: str | None) -> date | None:
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
except ValueError:
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
return date.fromisoformat(m.group(1)) if m else None
class SxyLandScraper(BaseDirectTubeScraper):
    """Performer-driven scraper for sxyland.com.

    ``search`` loads the performer listing ``/actor/<slug>/`` and then fetches
    every linked scene page to build a fully populated ``RawScene``.
    """

    sitetag = "sxylandcom"
    # Timeout passed to every browser_get call made by this scraper.
    _timeout: float = 30.0

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield the scenes listed on the performer page matching ``query``.

        Args:
            query: Performer name; slugified into ``/actor/<slug>/``.
            page: Listing page number; values above 1 append ``page/<n>/``.
            limit: Maximum number of scenes to yield; ``None`` means no cap.
                A value of zero or less yields nothing.

        Yields:
            One ``RawScene`` for every linked scene page that ``_parse_scene``
            accepts. Fetch failures and non-200 responses end the search
            quietly (logged, not raised).
        """
        # A non-positive limit can never be met, so skip all network work.
        if limit is not None and limit <= 0:
            return
        actor_slug = slugify(query)
        if not actor_slug:
            return
        listing = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            r = browser_get(listing, timeout=self._timeout)
        except Exception as e:
            log.warning("sxyland actor-page fetch failed (%s): %s", listing, e)
            return
        if r.status_code != 200:
            log.debug("sxyland %s status=%d", listing, r.status_code)
            return

        # Collect unique scene slugs in page order, skipping nav/taxonomy pages.
        scene_urls: list[str] = []
        seen: set[str] = set()
        for m in _SCENE_URL_RE.finditer(r.text):
            slug = m.group(1)
            if slug in _NAV_SLUGS or slug in seen:
                continue
            seen.add(slug)
            scene_urls.append(f"{_BASE}/{slug}/")

        yielded = 0
        for scene_url in scene_urls:
            scene = self._parse_scene(scene_url, query)
            if scene is None:
                continue
            yield scene
            yielded += 1
            if limit is not None and yielded >= limit:
                return

    def _parse_scene(self, scene_url: str, query: str) -> RawScene | None:
        """Fetch one scene page and turn it into a ``RawScene``.

        Args:
            scene_url: Absolute URL of the scene page.
            query: Performer name the search was run for; used as the
                performer fallback when the page links no ``/actor/`` pages.

        Returns:
            The parsed scene, or ``None`` when the fetch fails, the response
            is not 200, the page has no ``og:title``, or the page has neither
            a duration nor scene tags (i.e. it is not a real video page).
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                log.debug("sxyland %s status=%d", scene_url, r.status_code)
                return None
            detail = r.text
        except Exception as e:
            log.info("sxyland scene fetch failed %s: %s", scene_url, e)
            return None

        title_m = _OGTITLE_RE.search(detail)
        title_s = html.unescape(title_m.group(1)).strip() if title_m else ""
        if not title_s:
            return None

        dm = _DURATION_RE.search(detail)
        duration_sec = _parse_iso_duration(dm.group(1)) if dm else None
        um = _UPLOADDATE_RE.search(detail)
        release_date = _parse_date(um.group(1)) if um else None

        tags = self._scene_tags(detail)
        # Guard "is this a real video scene": nav/legal pages (Terms of Use etc.)
        # have a sidebar with actors (false performers) but NO duration and NO tags.
        if duration_sec is None and not tags:
            return None

        performers = self._scene_performers(detail, query)
        studio = _studio_from_title(title_s, performers)
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title_s,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                )
            ],
        )

    def _scene_performers(self, detail: str, query: str) -> list[RawPerformer]:
        """Return every performer linked via ``/actor/`` on the page, deduplicated by slug."""
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _ACTOR_LINK_RE.finditer(detail):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_perf:
                continue
            seen_perf.add(sl)
            performers.append(
                RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
            )
        if not performers:
            # Fallback: the scene was reached through /actor/<query>/, so the
            # queried performer is certainly in it.
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{slugify(query)}",
                    name=query.strip(),
                )
            )
        return performers

    def _scene_tags(self, detail: str) -> list[RawTag]:
        """Return the tags from the scene's own tags block, deduplicated by slug."""
        # Only the scene's tags block is scanned, never the sidebar/popular widget.
        block_m = _TAGS_BLOCK_RE.search(detail)
        tags_html = block_m.group(1) if block_m else ""
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(tags_html):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_tag:
                continue
            seen_tag.add(sl)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))
        return tags