From 5b67aeeeaf084e1590f944991050e325fc6e8cae Mon Sep 17 00:00:00 2001
From: jtrzupek <jtrzupek@gmail.com>
Date: Tue, 16 Jun 2026 23:11:44 +0200
Subject: [PATCH] fix(sxyland): revive search via /actor/ pages + rich metadata

sxyland dropped the /<numeric_id>/<slug>/ scene URL format for /<slug>/,
so the old regex matched nothing (frozen since 06-07). Rewrote search()
to use the performer page /actor/<slug>/ and fetch each scene for full
metadata: all performers (with co-stars, from /actor/ links), tags
(scoped to the scene's tags-list, not the sidebar), duration + upload
date (itemprop), studio from the title prefix (BraZZers/MilfCoach/... ,
guarded so a performer-name prefix isn't mistaken for a studio). Junk
nav pages (Terms of Use etc.) are dropped via a no-duration-and-no-tags
guard. Verified: clean studio/performers/tags in DB, 0 errors.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 app/connectors/direct_scrapers/sxyland.py | 228 ++++++++++++++++++----
 1 file changed, 185 insertions(+), 43 deletions(-)
diff --git a/app/connectors/direct_scrapers/sxyland.py b/app/connectors/direct_scrapers/sxyland.py
index 0a602a2..1f50766 100644
--- a/app/connectors/direct_scrapers/sxyland.py
+++ b/app/connectors/direct_scrapers/sxyland.py
@@ -1,78 +1,220 @@
-"""SxyLandScraper — direct HTML scrape sxyland.com search.
+"""sxyland.com — performer-page scrape (search-based, performer-driven).
 
-Search: `https://sxyland.com/?s=<query>` zwraca wyniki w formacie
-`https://sxyland.com/<numeric_id>/<slug>/`. Filtrujemy linki bez numeric ID
-(legal pages typu /18-u-s-c-2257/).
+2026-06-16 fix (frozen since 06-07): sxyland dropped the `/<numeric_id>/<slug>/` scene
+URL format for `/<slug>/`, so the old regex (it required digits in the path) matched
+nothing. WordPress `?s=` search filters but mixes results — **performer pages**
+`/actor/<slug>/` are cleaner (performer-driven query = performer name → slugify → slug).
+
+Rich metadata (one detail fetch per scene — sxyland is a WP tube, taxonomies live on the scene):
+  - performers: ALL `/actor/<slug>/` links (co-stars included; `title="Name"`)
+  - tags: `/tag/` + `/category/` (`title="Name"`); some are studio names (BangBros/BLACKED/...)
+  - studio: heuristically from the "Studio - ..." title prefix; no usable prefix → no studio
+  - duration: `itemprop="duration"`, ISO 8601 with days (P0DT0H41M12S)
+  - release date: `itemprop="uploadDate"`
+  - title: `og:title` (a page without it is skipped)
+
+Playback goes through the `sxylandcom` extractor (_embed_iframe → playmogo/dood, phone-side).
 """
 from __future__ import annotations
 
+import html
 import logging
 import re
-import urllib.parse
 from collections.abc import Iterator
+from datetime import date, datetime
 
-from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
+from app.connectors.base import (
+    RawPerformer,
+    RawPlaybackSource,
+    RawScene,
+    RawStudio,
+    RawTag,
+)
 from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
 from app.extractors import browser_get
+from app.normalize.text import slugify
 
 log = logging.getLogger(__name__)
 
+_BASE = "https://sxyland.com"  # site origin; actor-listing and scene URLs are built from it
 
-_SCENE_URL_RE = re.compile(r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"')
+# Scene links on a performer page: /<slug>/ (multi-word). Taxonomy/nav slugs are excluded.
+_SCENE_URL_RE = re.compile(r'href="https://sxyland\.com/([a-z0-9][a-z0-9-]+)/"')
+_NAV_SLUGS = frozenset({
+    "actor", "actors", "category", "categories", "tag", "tags", "page", "author",
+    "models", "studios", "search", "home", "login", "register", "18-u-s-c-2257",
+    "privacy-policy", "cookie-policy", "dmca", "dmca-notice", "contact", "contact-us",
+    "terms", "terms-of-use", "about", "about-us", "2257",
+})
+# Scene tags sit in the first <div class="tags-list">...</div> (NOT in the sidebar /
+# popular-tags widget). Without this scoping, studio caught the global "bangbros" on every scene.
+_TAGS_BLOCK_RE = re.compile(r'<div class="tags-list">(.*?)</div>', re.IGNORECASE | re.DOTALL)
+
+_ACTOR_LINK_RE = re.compile(
+    r'href="https://sxyland\.com/actor/[^"/]+/"\s+title="([^"]+)"', re.IGNORECASE
+)  # group 1: performer display name taken from the title attribute
+_TAG_LINK_RE = re.compile(
+    r'href="https://sxyland\.com/(?:tag|category)/[^"/]+/"[^>]*title="([^"]+)"', re.IGNORECASE
+)  # group 1: tag/category display name taken from the title attribute
+_DURATION_RE = re.compile(r'itemprop="duration"\s+content="([^"]+)"', re.IGNORECASE)
+_UPLOADDATE_RE = re.compile(r'itemprop="uploadDate"\s+content="([^"]+)"', re.IGNORECASE)
+_OGTITLE_RE = re.compile(r'property="og:title"\s+content="([^"]+)"', re.IGNORECASE)
+_ISO_DUR_RE = re.compile(
+    r"P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE
+)  # groups: days, hours, minutes, seconds — each optional
+
+def _studio_from_title(title: str, performers: list[RawPerformer]) -> RawStudio | None:
+    """Derive the studio from a "Studio - ..." title prefix (paysite reposts are titled
+    "BraZZers - ...", "MilfCoach - ..."). Returns None when there is no " - ", the prefix
+    is not 2-30 chars, it is (part of) a performer's name (title "Amirah Adara - X" →
+    the prefix is a person, not a studio), or it slugifies to an empty string."""
+    prefix, sep, _ = title.partition(" - ")
+    prefix = prefix.strip()
+    if not sep or not (2 <= len(prefix) <= 30):
+        return None
+    needle = prefix.lower()
+    slug = slugify(prefix)  # computed once; an empty slug would give a colliding external_id
+    if not slug or any(needle in p.name.lower() for p in performers):
+        return None  # the substring test already covers the exact-match case
+    return RawStudio(external_id=f"sxylandcom:studio:{slug}", name=prefix, slug=slug)
+
+
+def _parse_iso_duration(value: str | None) -> int | None:
+    """Convert an ISO-8601 duration to whole seconds, e.g. `P0DT0H41M12S` → 2472.
+
+    Returns None for empty input, text `_ISO_DUR_RE` does not match, or a zero total.
+    """
+    match = _ISO_DUR_RE.match(value.strip()) if value else None
+    if match is None:
+        return None
+    weights = (86400, 3600, 60, 1)  # seconds per day / hour / minute / second
+    return sum(int(g) * w for g, w in zip(match.groups(), weights) if g) or None
+
+
+def _parse_date(value: str | None) -> date | None:
+    """Parse ISO-8601 date/datetime text into a `date`; None when empty or unparseable."""
+    for candidate in ((value or "").replace("Z", "+00:00"), (value or "")[:10]):
+        try:  # whole value first, then just its leading YYYY-MM-DD
+            return datetime.fromisoformat(candidate).date()
+        except ValueError:
+            continue  # also covers out-of-range dates (2026-13-45) that used to raise
+    return None
 
 
 class SxyLandScraper(BaseDirectTubeScraper):
     sitetag = "sxylandcom"
+    _timeout: float = 30.0
 
     def search(
-        self,
-        query: str,
-        *,
-        page: int = 1,
-        limit: int | None = None,
+        self, query: str, *, page: int = 1, limit: int | None = None
     ) -> Iterator[RawScene]:
-        q = urllib.parse.quote_plus(query.strip())
-        url = f"https://sxyland.com/page/{page}/?s={q}"
+        actor_slug = slugify(query)  # performer name → slug used in /actor/<slug>/
+        if not actor_slug:
+            return  # nothing left after slugify → no performer page to request
+        listing = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
         try:
-            r = browser_get(url, timeout=30)
+            r = browser_get(listing, timeout=self._timeout)
         except Exception as e:
-            log.warning("sxyland search fetch failed: %s", e)
+            log.warning("sxyland actor-page fetch failed (%s): %s", listing, e)
             return
         if r.status_code != 200:
+            log.debug("sxyland %s status=%d", listing, r.status_code)
             return
 
-        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
-
+        scene_urls: list[str] = []  # scene page URLs in listing order, one per slug
         seen: set[str] = set()
-        yielded = 0
         for m in _SCENE_URL_RE.finditer(r.text):
-            scene_url = m.group(1) + "/"
-            slug = m.group(3)
-            if scene_url in seen:
+            slug = m.group(1)
+            if slug in _NAV_SLUGS or slug in seen:  # skip nav/taxonomy pages and repeats
                 continue
-            seen.add(scene_url)
+            seen.add(slug)
+            scene_urls.append(f"{_BASE}/{slug}/")
 
-            slug_lower = slug.lower()
-            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
+        yielded = 0  # NOTE(review): limit is tested only after a yield, so limit=0 still yields one
+        for scene_url in scene_urls:
+            scene = self._parse_scene(scene_url, query)  # one detail-page fetch per scene
+            if scene is None:
                 continue
-
-            title = slug.replace("-", " ").strip()
-
-            yield RawScene(
-                external_id=f"sxylandcom:{scene_url}",
-                title=title,
-                url=scene_url,
-                playback_sources=[
-                    RawPlaybackSource(origin="tube:sxylandcom", page_url=scene_url)
-                ],
-                performers=[RawPerformer(name=query.strip())],
-                raw={
-                    "source": "direct_scraper:sxyland",
-                    "query": query,
-                    "page": page,
-                    "url": scene_url,
-                },
-            )
+            yield scene
             yielded += 1
             if limit is not None and yielded >= limit:
                 return
+
+    def _parse_scene(self, scene_url: str, query: str) -> RawScene | None:
+        """Fetch one scene page and map its metadata onto a `RawScene`.
+
+        Returns None when the fetch raises or is not HTTP 200, when `og:title` is
+        missing or blank, or when the page has neither a duration nor scene tags
+        (treated as a nav/legal page). `query` only feeds the performer fallback.
+        """
+        try:
+            resp = browser_get(scene_url, timeout=self._timeout)
+            if resp.status_code != 200:
+                return None
+            page = resp.text
+        except Exception as e:
+            log.info("sxyland scene fetch failed %s: %s", scene_url, e)
+            return None
+
+        def named_links(pattern: re.Pattern[str], text: str) -> dict[str, str]:
+            """Map slug → display name per `title="..."` hit; first hit per slug wins."""
+            found: dict[str, str] = {}
+            for hit in pattern.finditer(text):
+                label = html.unescape(hit.group(1)).strip()
+                key = slugify(label)
+                if key:
+                    found.setdefault(key, label)
+            return found
+
+        og = _OGTITLE_RE.search(page)
+        title_s = html.unescape(og.group(1)).strip() if og is not None else ""
+        if not title_s:
+            return None
+
+        dur_m = _DURATION_RE.search(page)
+        up_m = _UPLOADDATE_RE.search(page)
+        duration_sec = None if dur_m is None else _parse_iso_duration(dur_m.group(1))
+        release_date = None if up_m is None else _parse_date(up_m.group(1))
+
+        # Performers: every /actor/ link on the page (co-stars included).
+        # NOTE(review): unlike tags this scans the whole page — confirm sidebar actor
+        # links cannot leak into real scenes.
+        performers = [
+            RawPerformer(external_id=f"{self.sitetag}:performer:{key}", name=label)
+            for key, label in named_links(_ACTOR_LINK_RE, page).items()
+        ]
+        if not performers:
+            # Fallback: the query itself (we are on /actor/<query>/, so it must be them).
+            fallback_id = f"{self.sitetag}:performer:{slugify(query)}"
+            performers.append(RawPerformer(external_id=fallback_id, name=query.strip()))
+
+        # Tags: ONLY from the scene's tags block (not the sidebar/popular widget).
+        block = _TAGS_BLOCK_RE.search(page)
+        tags_html = block.group(1) if block else ""
+        tags = [
+            RawTag(external_id=f"{self.sitetag}:tag:{key}", name=label, slug=key)
+            for key, label in named_links(_TAG_LINK_RE, tags_html).items()
+        ]
+
+        # "Is this a real video scene" guard: nav/legal pages (Terms of Use etc.) have a
+        # sidebar with actors (false performers) but NO duration and NO tags.
+        if duration_sec is None and not tags:
+            return None
+
+        # The scene page itself is the only playback source.
+        source = RawPlaybackSource(
+            origin=f"tube:{self.sitetag}",
+            page_url=scene_url,
+            duration_sec=duration_sec,
+        )
+        return RawScene(
+            external_id=f"{self.sitetag}:{scene_url}",
+            title=title_s,
+            duration_sec=duration_sec,
+            release_date=release_date,
+            url=scene_url,
+            studio=_studio_from_title(title_s, performers),
+            performers=performers,
+            tags=tags,
+            playback_sources=[source],
+        )