"""sxyprn.com — performer-page scrape (search-based, performer-driven).

2026-06-16 fix (ingest had been frozen since 05-07): sxyprn has NO free-text
search. The old `?type=videos&query=` endpoint returned TRENDING posts (0 hits
for the performer → the strict token filter rightly dropped everything → nothing
ingested → freshness frozen). The real "search" is the **performer page**
`https://sxyprn.com/<Performer-Name>.html` (spaces → hyphens), which returns
roughly 30 scene cards for that performer.

Bonus of this path: in performer-driven mode the query is the plain performer
name, so we sidestep sxyprn's fragmentation (a post tags "Dallas Rae" as separate
`/Dallas.html` + `/Rae.html` / `aria-label='Dallas,Rae'`). The performer is taken
from the query (after the token filter has confirmed it against the title), NOT
from the broken tags.

Metadata read from the card (no extra detail-page fetch):
  - scene URL: `/post/<id>.html`
  - duration:  `MM:SS` or `HH:MM:SS` inside the `duration_small` element
  - studio:    text of the `post_el_small_subcat` element (sxyprn "channel";
               present on about two thirds of cards, optional)
  - title + tags: `title` attribute of the `post_time` link
  - thumbnail: `data-src='//...small.jpg'`

Playback is resolved by the `sxyprncom` extractor (separately; mp4 from
trafficdeposit/lulustream).
"""
from __future__ import annotations

import html
import logging
import re
from collections.abc import Iterator

from app.connectors.base import (
    RawPerformer,
    RawPlaybackSource,
    RawScene,
    RawStudio,
    RawTag,
)
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get
from app.normalize.text import slugify

log = logging.getLogger(__name__)

_BASE = "https://sxyprn.com"

# Card anchor: the `js-pop` link to `/post/<id>.html` (on a performer page the href
# also carries `?sk=...&so=...&ss=latest`). Its aria-label holds the fragmented
# performer names and is deliberately not used.
# NOTE(review): the opening of this pattern (everything before `[a-f0-9]+`) was
# reconstructed from the `id` group that search() reads — confirm against live markup.
_CARD_ANCHOR_RE = re.compile(
    r"<a\s+href='/post/(?P<id>[a-f0-9]+)\.html(?:\?[^']*)?'[^>]*class='js-pop'",
    re.IGNORECASE,
)
# Note: duration_small has a `title='s1->c10'` attribute with a `>` inside it, so
# `[^>]*` cannot be used here (it would stop at that `>`); a lazy `.*?` runs up to
# the MM:SS / HH:MM:SS value instead.
_DURATION_RE = re.compile(
    r"duration_small.*?>\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*<", re.IGNORECASE | re.DOTALL
)
_SUBCAT_RE = re.compile(r"post_el_small_subcat[^>]*>([^<]+)<", re.IGNORECASE)
_THUMB_RE = re.compile(r"data-src='(//[^']+?small\.jpg)'", re.IGNORECASE)
# post_time link carrying the title. It is matched page-wide and keyed by post ID
# because it lives in post_control, AFTER the card's vid_container.
_POSTTIME_RE = re.compile(
    r"post_time'[^>]*href='/post/(?P<id>[a-f0-9]+)\.html(?:\?[^']*)?'"
    r"[^>]*title='(?P<title>[^']*)'",
    re.IGNORECASE,
)
_HASHTAG_RE = re.compile(r"#(\w[\w-]*)")
_BRACE_RE = re.compile(r"\{[^}]*\}")
_URL_TAIL_RE = re.compile(r"https?://\S+")


def _parse_duration(s: str) -> int | None:
    """Convert an `MM:SS` or `HH:MM:SS` string to seconds.

    Returns None when the string does not have two or three colon-separated
    parts, or when any part is not an integer.
    """
    parts = s.strip().split(":")
    try:
        if len(parts) == 2:
            return int(parts[0]) * 60 + int(parts[1])
        if len(parts) == 3:
            return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
    except ValueError:
        return None
    return None


def _clean_title(raw: str) -> tuple[str, list[str]]:
    """Return `(clean title, tags)` for a raw `title` attribute value.

    HTML entities are unescaped first. The clean title has `{markers}`, URLs and
    `#hashtags` removed, whitespace collapsed, and leading/trailing spaces,
    colons and hyphens stripped. Tags are the hashtag names without the `#`,
    in order of appearance (duplicates are kept).
    """
    text = html.unescape(raw)
    # Collect hashtags from a URL-free copy: a URL fragment such as
    # `https://host/e/abc#t=10` is part of the link, not a tag.
    tags = _HASHTAG_RE.findall(_URL_TAIL_RE.sub(" ", text))
    # Braces go first: a URL inside `{...}` would otherwise swallow the closing
    # brace (`\S+`) and leave the marker text behind in the title.
    text = _BRACE_RE.sub(" ", text)  # {New}, {Watch At 1080P}, {WATCH...}
    text = _URL_TAIL_RE.sub(" ", text)  # trailing link to the file hoster
    text = _HASHTAG_RE.sub(" ", text)  # #Tag
    text = re.sub(r"\s+", " ", text).strip(" :-")
    return text, tags


def _index_titles(page: str) -> dict[str, str]:
    """Map lower-cased post ID → raw title from every post_time link on the page.

    The first link found for an ID wins. IDs are lower-cased because the
    patterns are case-insensitive.
    """
    titles: dict[str, str] = {}
    for m in _POSTTIME_RE.finditer(page):
        titles.setdefault(m.group("id").lower(), m.group("title"))
    return titles


class SxyPrnScraper(BaseSearchScraper):
    """Scraper that lists a performer's scenes from their sxyprn performer page."""

    sitetag = "sxyprncom"
    # _search_url_template/_scene_url_re from the base class are unused — search()
    # is fully custom.
    # Window length (chars) used for the last card on the page: from the js-pop
    # anchor up to the post_control block that carries the title.
    _card_window = 2600

    def _performer_path(self, query: str) -> str:
        """`Lana Rhoades` → `Lana-Rhoades` (sxyprn performer page name).

        Every character other than ASCII letters, digits and spaces is dropped
        (so hyphens, apostrophes and non-ASCII letters disappear), then runs of
        whitespace become single hyphens. Returns "" when nothing usable is left.
        """
        cleaned = re.sub(r"[^A-Za-z0-9 ]+", "", query).strip()
        return re.sub(r"\s+", "-", cleaned)

    def search(
        self, query: str, *, page: int = 1, limit: int | None = None
    ) -> Iterator[RawScene]:
        """Yield scenes listed on the performer page that matches `query`.

        Args:
            query: Performer name; it selects the page and is also used as the
                performer attributed to every yielded scene.
            page: 1-based page number; `?page=N` is appended only for N > 1.
            limit: Stop after this many scenes. None or 0 means no limit.

        Yields nothing (after logging) when the query has no usable characters,
        the fetch raises, or the response status is not 200. A card is skipped
        when its ID was already seen, when no post_time title exists for it,
        when the cleaned title is empty, or when the title does not contain
        every query token.
        """
        path = self._performer_path(query)
        if not path:
            return
        url = f"{_BASE}/{path}.html" + (f"?page={page}" if page > 1 else "")
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            # Best effort: a failed fetch simply produces no results for this query.
            log.warning("sxyprn search fetch failed (%s): %s", url, e)
            return
        if r.status_code != 200:
            log.debug("sxyprn search %s status=%d", url, r.status_code)
            return

        text = r.text
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }
        # One pass over the page instead of one full-page regex search per card.
        titles = _index_titles(text)
        anchors = list(_CARD_ANCHOR_RE.finditer(text))
        seen: set[str] = set()
        yielded = 0
        for idx, m in enumerate(anchors):
            post_id = m.group("id")
            if post_id in seen:
                continue
            seen.add(post_id)

            raw_title = titles.get(post_id.lower())
            if raw_title is None:
                continue
            title, tags = _clean_title(raw_title)
            title_l = title.lower()
            # Strict: the scene must really be about the performer (every query
            # token present in the title) — guards against junk on the page and
            # against over-attribution.
            if query_tokens and not all(tok in title_l for tok in query_tokens):
                continue
            if not title:
                continue

            # The card's markup runs up to the next anchor; the last card gets a
            # fixed-size window instead.
            win_end = (
                anchors[idx + 1].start()
                if idx + 1 < len(anchors)
                else m.end() + self._card_window
            )
            yield self._build_scene(
                post_id=post_id,
                title=title,
                tags=tags,
                window=text[m.start():win_end],
                query=query,
            )
            yielded += 1
            if limit and yielded >= limit:
                return

    def _build_scene(
        self, *, post_id: str, title: str, tags: list[str], window: str, query: str
    ) -> RawScene:
        """Assemble a RawScene from one card's markup window and its cleaned title."""
        scene_url = f"{_BASE}/post/{post_id}.html"

        dur_m = _DURATION_RE.search(window)
        duration_sec = _parse_duration(dur_m.group(1)) if dur_m else None

        thumb_m = _THUMB_RE.search(window)
        thumb = thumb_m.group(1) if thumb_m else None
        if thumb and thumb.startswith("//"):
            thumb = "https:" + thumb  # protocol-relative → absolute

        studio = None
        sub_m = _SUBCAT_RE.search(window)
        if sub_m:
            name = html.unescape(sub_m.group(1)).strip()
            # "all" / "trending" are listing labels, not channels.
            if name and name.lower() not in ("all", "trending"):
                studio = RawStudio(
                    external_id=f"{self.sitetag}:studio:{slugify(name)}",
                    name=name,
                    slug=slugify(name),
                )

        # De-duplicate tags by slug, keeping the first spelling encountered.
        tag_objs: list[RawTag] = []
        seen_tag: set[str] = set()
        for t in tags:
            sl = slugify(t)
            if not sl or sl in seen_tag:
                continue
            seen_tag.add(sl)
            tag_objs.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=t, slug=sl))

        performer = RawPerformer(
            external_id=f"{self.sitetag}:performer:{slugify(query)}",
            name=query.strip(),
        )

        return RawScene(
            external_id=f"{self.sitetag}:{post_id}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=[performer],
            tags=tag_objs,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=scene_url,
                    duration_sec=duration_sec,
                    thumbnail_url=thumb,
                )
            ],
        )