fix(sxyland): revive search via /actor/ pages + rich metadata
sxyland dropped the /<numeric_id>/<slug>/ scene URL format for /<slug>/, so the old regex matched nothing (frozen since 06-07). Rewrote search() to use the performer page /actor/<slug>/ and fetch each scene for full metadata: all performers (with co-stars, from /actor/ links), tags (scoped to the scene's tags-list, not the sidebar), duration + upload date (itemprop), and studio from the title prefix (BraZZers/MilfCoach/..., guarded so a performer-name prefix isn't mistaken for a studio). Junk nav pages (Terms of Use etc.) are dropped via a no-duration-and-no-tags guard. Verified: clean studio/performers/tags in DB, 0 errors. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e0e69189a8
commit
5b67aeeeaf
1 changed files with 185 additions and 43 deletions
|
|
@ -1,78 +1,220 @@
|
|||
"""SxyLandScraper — direct HTML scrape sxyland.com search.
|
||||
"""sxyland.com — performer-page scrape (search-based, performer-driven).
|
||||
|
||||
Search: `https://sxyland.com/?s=<query>` zwraca wyniki w formacie
|
||||
`https://sxyland.com/<numeric_id>/<slug>/`. Filtrujemy linki bez numeric ID
|
||||
(legal pages typu /18-u-s-c-2257/).
|
||||
2026-06-16 fix (zamrożony od 06-07): sxyland porzucił URL scen `/<numeric_id>/<slug>/`
|
||||
na rzecz `/<slug>/`, więc stary regex (wymagał cyfry w ścieżce) dawał 0. WordPress `?s=`
|
||||
filtruje, ale miesza — czystsze są **strony performera** `/actor/<slug>/`
|
||||
(performer-driven query = nazwa performera → slugify → /actor/<slug>/).
|
||||
|
||||
Bogate metadane (per-scene detail fetch — sxyland to WP tube, taksonomie na scenie):
|
||||
- performerzy: WSZYSTKIE `/actor/<slug>/` linki (z co-starami; `title="Name"`)
|
||||
- tagi: `/tag/` + `/category/` (`title="Name"`); część to studia (BangBros/BLACKED/...)
|
||||
- studio: heuristically from the "Studio - ..." title prefix (`_studio_from_title`); no match → no studio
|
||||
- duration: `itemprop="duration"` ISO 8601 z dniami (P0DT0H41M12S)
|
||||
- release date: `itemprop="uploadDate"`
|
||||
- title: `og:title` / `itemprop="name"`
|
||||
|
||||
Playback przez extractor `sxylandcom` (_embed_iframe → playmogo/dood, phone-side).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
import urllib.parse
|
||||
from collections.abc import Iterator
|
||||
from datetime import date, datetime
|
||||
|
||||
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
||||
from app.connectors.base import (
|
||||
RawPerformer,
|
||||
RawPlaybackSource,
|
||||
RawScene,
|
||||
RawStudio,
|
||||
RawTag,
|
||||
)
|
||||
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||
from app.extractors import browser_get
|
||||
from app.normalize.text import slugify
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_BASE = "https://sxyland.com"
|
||||
|
||||
_SCENE_URL_RE = re.compile(r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"')
|
||||
# Linki scen na stronie performera: /<slug>/ (multi-word). Wykluczamy taksonomie/nav.
|
||||
_SCENE_URL_RE = re.compile(r'href="https://sxyland\.com/([a-z0-9][a-z0-9-]+)/"')
|
||||
_NAV_SLUGS = frozenset({
|
||||
"actor", "actors", "category", "categories", "tag", "tags", "page", "author",
|
||||
"models", "studios", "search", "home", "login", "register", "18-u-s-c-2257",
|
||||
"privacy-policy", "cookie-policy", "dmca", "dmca-notice", "contact", "contact-us",
|
||||
"terms", "terms-of-use", "about", "about-us", "2257",
|
||||
})
|
||||
# Scena-tagi siedzą w pierwszym <div class="tags-list">...</div> (NIE w sidebarze/
|
||||
# popular-tags widgetcie). Bez scope'u studio łapało globalny "bangbros" na każdej scenie.
|
||||
_TAGS_BLOCK_RE = re.compile(r'<div class="tags-list">(.*?)</div>', re.IGNORECASE | re.DOTALL)
|
||||
|
||||
_ACTOR_LINK_RE = re.compile(
|
||||
r'href="https://sxyland\.com/actor/[^"/]+/"\s+title="([^"]+)"', re.IGNORECASE
|
||||
)
|
||||
_TAG_LINK_RE = re.compile(
|
||||
r'href="https://sxyland\.com/(?:tag|category)/[^"/]+/"[^>]*title="([^"]+)"', re.IGNORECASE
|
||||
)
|
||||
_DURATION_RE = re.compile(r'itemprop="duration"\s+content="([^"]+)"', re.IGNORECASE)
|
||||
_UPLOADDATE_RE = re.compile(r'itemprop="uploadDate"\s+content="([^"]+)"', re.IGNORECASE)
|
||||
_OGTITLE_RE = re.compile(r'property="og:title"\s+content="([^"]+)"', re.IGNORECASE)
|
||||
_ISO_DUR_RE = re.compile(
|
||||
r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE
|
||||
)
|
||||
|
||||
def _studio_from_title(title: str, performers: list[RawPerformer]) -> RawStudio | None:
|
||||
"""Studio z prefiksu "Studio - ..." tytułu (jak hdporngg: paysite reposty mają
|
||||
"BraZZers - ...", "MilfCoach - ..."). Guard: prefiks NIE może być performerem
|
||||
(tytuł "Amirah Adara - X" → prefiks to imię, nie studio). Brak " - " → brak studio."""
|
||||
if " - " not in title:
|
||||
return None
|
||||
prefix = title.split(" - ", 1)[0].strip()
|
||||
if not (2 <= len(prefix) <= 30):
|
||||
return None
|
||||
pl = prefix.lower()
|
||||
for p in performers:
|
||||
if pl == p.name.lower() or pl in p.name.lower():
|
||||
return None
|
||||
return RawStudio(external_id=f"sxylandcom:studio:{slugify(prefix)}", name=prefix, slug=slugify(prefix))
|
||||
|
||||
|
||||
def _parse_iso_duration(value: str | None) -> int | None:
    """Convert an ISO 8601 duration such as ``P0DT0H41M12S`` to seconds (2472).

    Returns ``None`` for an empty value, a value that does not match
    ``_ISO_DUR_RE``, or a duration that adds up to zero seconds.
    """
    if value:
        parsed = _ISO_DUR_RE.match(value.strip())
        if parsed is not None:
            # Groups are (days, hours, minutes, seconds); absent ones count as 0.
            seconds = sum(
                int(component or 0) * weight
                for component, weight in zip(parsed.groups(), (86400, 3600, 60, 1))
            )
            if seconds:
                return seconds
    return None
|
||||
|
||||
|
||||
def _parse_date(value: str | None) -> date | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||
except ValueError:
|
||||
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
||||
return date.fromisoformat(m.group(1)) if m else None
|
||||
|
||||
|
||||
class SxyLandScraper(BaseDirectTubeScraper):
    """Direct scraper for sxyland.com driven by performer pages.

    ``search`` treats the query as a performer name, loads the performer
    listing ``/actor/<slug>/`` and fetches every linked scene page to build a
    ``RawScene`` with title, duration, release date, studio, performers and
    tags.
    """

    sitetag = "sxylandcom"
    # Timeout (seconds) passed to browser_get for listing and scene fetches.
    _timeout: float = 30.0

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page matching ``query``.

        Args:
            query: Performer name; slugified into ``/actor/<slug>/``.
            page: 1-based listing page; values above 1 append ``page/<n>/``.
            limit: Maximum number of scenes to yield; ``None`` means no cap,
                a value of 0 or less yields nothing.

        Yields:
            One ``RawScene`` per scene link that parses as a real video page.
            Fetch failures and non-200 responses are logged and end the
            iteration without raising.
        """
        if limit is not None and limit <= 0:
            return
        actor_slug = slugify(query)
        if not actor_slug:
            return
        listing = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
        try:
            r = browser_get(listing, timeout=self._timeout)
        except Exception as e:  # best-effort scrape: any fetch error ends the search
            log.warning("sxyland actor-page fetch failed (%s): %s", listing, e)
            return
        if r.status_code != 200:
            log.debug("sxyland %s status=%d", listing, r.status_code)
            return

        # Collect unique scene URLs in page order, skipping navigation slugs.
        scene_urls: list[str] = []
        seen: set[str] = set()
        for m in _SCENE_URL_RE.finditer(r.text):
            slug = m.group(1)
            if slug in _NAV_SLUGS or slug in seen:
                continue
            seen.add(slug)
            scene_urls.append(f"{_BASE}/{slug}/")

        # Scene pages are fetched lazily so that `limit` also caps the number
        # of detail requests.
        yielded = 0
        for scene_url in scene_urls:
            scene = self._parse_scene(scene_url, query)
            if scene is None:
                continue
            yield scene
            yielded += 1
            if limit is not None and yielded >= limit:
                return

    def _parse_scene(self, scene_url: str, query: str) -> RawScene | None:
        """Fetch one scene page and build a ``RawScene`` from its metadata.

        Args:
            scene_url: Absolute URL of the scene page.
            query: Original search query; used as the performer when the page
                exposes no ``/actor/`` links.

        Returns:
            The parsed scene, or ``None`` when the fetch fails, the response is
            not 200, the page has no ``og:title``, or the page has neither a
            duration nor scene tags (navigation / legal pages).
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
        except Exception as e:  # best-effort: a failed scene is skipped
            log.info("sxyland scene fetch failed %s: %s", scene_url, e)
            return None
        if r.status_code != 200:
            return None
        detail = r.text

        title_m = _OGTITLE_RE.search(detail)
        title_s = html.unescape(title_m.group(1)).strip() if title_m else ""
        if not title_s:
            return None

        dm = _DURATION_RE.search(detail)
        duration_sec = _parse_iso_duration(dm.group(1)) if dm else None
        um = _UPLOADDATE_RE.search(detail)
        release_date = _parse_date(um.group(1)) if um else None

        # Performers: every /actor/ link on the page (co-stars included),
        # de-duplicated by slug.
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _ACTOR_LINK_RE.finditer(detail):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_perf:
                continue
            seen_perf.add(sl)
            performers.append(
                RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
            )
        if not performers:
            # Fallback: the scene was listed on /actor/<query>/, so the queried
            # performer is credited.
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{slugify(query)}",
                    name=query.strip(),
                )
            )

        # Tags: ONLY from the scene's own tags block (not the sidebar /
        # popular-tags widget), de-duplicated by slug.
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        block_m = _TAGS_BLOCK_RE.search(detail)
        tags_html = block_m.group(1) if block_m else ""
        for m in _TAG_LINK_RE.finditer(tags_html):
            name = html.unescape(m.group(1)).strip()
            sl = slugify(name)
            if not sl or sl in seen_tag:
                continue
            seen_tag.add(sl)
            tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))

        # "Is this a real video scene" guard: nav/legal pages (Terms of Use
        # etc.) have a sidebar with actors (false performers) but NO duration
        # and NO tags.
        if duration_sec is None and not tags:
            return None

        studio = _studio_from_title(title_s, performers)

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title_s,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                )
            ],
        )
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue