fix(sxyprn): revive search via performer pages + rich metadata
sxyprn ingest was frozen since 05-07: the old ?type=videos&query= endpoint returns trending (not performer-filtered), so the strict token filter correctly dropped everything -> 0 ingest. Real "search" is the performer page /<First-Last>.html. Rewrote search() to scrape those cards: clean performer (the query, avoids sxyprn's Dallas/Rae name fragmentation), studio (channel subcat), tags (#hashtags), duration, thumbnail. Token filter now runs on the card title so only genuine matches attach the performer. Verified: Lana Rhoades/Riley Reid/Angela White return results, metadata persists in DB (studio e.g. Vixen, 10-31 tags/scene), playback mp4 206. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
00f4779abe
commit
e0e69189a8
1 changed files with 180 additions and 11 deletions
|
|
@ -1,24 +1,193 @@
|
|||
"""sxyprn.com — direct HTML scrape search results.
|
||||
"""sxyprn.com — performer-page scrape (search-based, performer-driven).
|
||||
|
||||
Sxyprn search jest oparte na `?type=videos&query=<q>` GET endpoint który zwraca
|
||||
HTML strony z linkami. Scene URL format: `https://sxyprn.com/post/<post_id>.html`.
|
||||
2026-06-16 fix (zamrożony od 05-07): sxyprn NIE ma free-text searcha. Stary endpoint
|
||||
`?type=videos&query=<q>` oddawał TRENDING (0 trafień dla performera → strict token
|
||||
filtr słusznie wycinał wszystko → 0 ingestu → freshness zamrożona). Realny "search"
|
||||
to **strona performera** `https://sxyprn.com/<Imie-Nazwisko>.html` (spacje → myślniki),
|
||||
zwraca ~30 kart scen tego performera.
|
||||
|
||||
Page'owanie sxyprn niespójne — często single-page results dla query (~24 wyników).
|
||||
Bonus tej ścieżki: w trybie performer-driven query = czysta nazwa performera, więc
|
||||
omijamy fragmentację sxyprn (post taguje "Dallas Rae" jako osobne `/Dallas.html` +
|
||||
`/Rae.html` / `aria-label='Dallas,Rae'`). Performera bierzemy z query (po potwierdzeniu
|
||||
token-filtrem na tytule), a NIE z połamanych tagów.
|
||||
|
||||
Metadane z karty (bez +1 fetch detalu):
|
||||
- URL sceny: `/post/<id>.html`
|
||||
- duration: `<span class='duration_small'>MM:SS|HH:MM:SS`
|
||||
- studio: `<span class='post_el_small_subcat'>Channel` (sxyprn "channel"; ~⅔ kart, opcjonalny)
|
||||
- title + tagi: `<a class='post_time' title='{New} ...#Tag1 #Tag2... {Watch...} url'>`
|
||||
- thumbnail: `data-src='//...small.jpg'`
|
||||
|
||||
Playback resolwuje extractor `sxyprncom` (osobno; mp4 z trafficdeposit/lulustream).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
|
||||
from app.connectors.base import (
|
||||
RawPerformer,
|
||||
RawPlaybackSource,
|
||||
RawScene,
|
||||
RawStudio,
|
||||
RawTag,
|
||||
)
|
||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||
from app.extractors import browser_get
|
||||
from app.normalize.text import slugify
|
||||
|
||||
log = logging.getLogger(__name__)

_BASE = "https://sxyprn.com"

# Card js-pop anchor: /post/<id>.html (optionally followed by a query string such as
# ?sk=...&so=...&ss=latest on a performer page) + aria-label (fragmented performer
# names — not used).
_CARD_ANCHOR_RE = re.compile(
    r"<a\s+href='/post/(?P<id>[a-f0-9]+)\.html(?:\?[^']*)?'[^>]*class='js-pop'",
    re.IGNORECASE,
)
# Note: duration_small carries a `title='s1->c10'` attribute with a `>` inside it, so
# we do NOT use `[^>]*` (it breaks on that `>`) — a lazy `.*?` runs up to the
# MM:SS / HH:MM:SS value instead.
_DURATION_RE = re.compile(
    r"duration_small.*?>\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*<", re.IGNORECASE | re.DOTALL
)
# Text content of the element whose class is `post_el_small_subcat`.
_SUBCAT_RE = re.compile(r"post_el_small_subcat[^>]*>([^<]+)<", re.IGNORECASE)
# Protocol-relative thumbnail URL ending in `small.jpg`, taken from `data-src`.
_THUMB_RE = re.compile(r"data-src='(//[^']+?small\.jpg)'", re.IGNORECASE)
# post_time element carrying the title — keyed by post ID, because it sits in
# post_control AFTER vid_container. The `{id}` in the pattern is a str.format
# placeholder: the pattern text is a template to be filled with a post ID before
# matching, so this compiled object is not meant to be matched directly.
_POSTTIME_RE = re.compile(
    r"post_time'[^>]*href='/post/{id}\.html(?:\?[^']*)?'[^>]*title='([^']*)'",
    re.IGNORECASE,
)
# `#Tag` tokens: a word character followed by word characters or hyphens.
_HASHTAG_RE = re.compile(r"#(\w[\w-]*)")
# `{...}` markers embedded in the title.
_BRACE_RE = re.compile(r"\{[^}]*\}")
# Any http(s) URL up to the next whitespace.
_URL_TAIL_RE = re.compile(r"https?://\S+")
|
||||
|
||||
|
||||
def _parse_duration(s: str) -> int | None:
|
||||
parts = s.strip().split(":")
|
||||
try:
|
||||
if len(parts) == 2:
|
||||
return int(parts[0]) * 60 + int(parts[1])
|
||||
if len(parts) == 3:
|
||||
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _clean_title(raw: str) -> tuple[str, list[str]]:
    """Split a raw card ``title`` attribute into a clean title and its tags.

    Args:
        raw: The attribute value as found in the HTML (may contain HTML
            entities, ``{...}`` markers, ``#Hashtags`` and a trailing URL).

    Returns:
        ``(title, tags)`` where ``title`` has markers, URLs and hashtags
        removed and whitespace collapsed, and ``tags`` lists the hashtag
        names (without ``#``) in order of appearance.
    """
    text = html.unescape(raw)
    # Strip markers and URLs BEFORE collecting hashtags; otherwise a `#` inside
    # a `{...}` marker or a URL fragment (`...#t=10`) is reported as a tag.
    text = _BRACE_RE.sub(" ", text)  # {New}, {Watch At 1080P}, {WATCH...}
    text = _URL_TAIL_RE.sub(" ", text)  # trailing link to the hoster
    tags = [m.group(1) for m in _HASHTAG_RE.finditer(text)]
    text = _HASHTAG_RE.sub(" ", text)  # #Tag
    # Collapse whitespace runs and trim leftover separators at both ends.
    text = re.sub(r"\s+", " ", text).strip(" :-")
    return text, tags
|
||||
|
||||
|
||||
class SxyPrnScraper(BaseSearchScraper):
    """Search scraper for sxyprn.com that reads a performer's page.

    ``search()`` fetches ``https://sxyprn.com/<First-Last>.html`` and turns
    each scene card into a ``RawScene`` with title, duration, studio, tags,
    thumbnail and a single performer taken from the query itself.

    ``self._timeout`` and ``self._query_token_min_len`` are read but not
    defined here — presumably provided by ``BaseSearchScraper``; confirm
    against the base class.
    """

    sitetag = "sxyprncom"
    # Neither attribute below is used by this class: search() is fully custom.
    # NOTE(review): this text was recovered from a rendered diff, so these two
    # attributes may be lines the commit removed — confirm against the repository.
    _search_url_template = "https://sxyprn.com/?type=videos&query={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/post/(?P<slug>[a-z0-9]+))\.html"',
    )
    # Fallback window length (characters) for the last card on the page: from
    # the js-pop anchor to the post_control block holding the title.
    _card_window = 2600

    def _title_from_slug(self, slug: str) -> str:
        """Return a placeholder title for a post ID.

        The sxyprn post ID is an unreadable hash, so this is only a
        placeholder; per the original note the real title is backfilled at
        resolve time.
        """
        return f"sxyprn:{slug}"

    def _performer_path(self, query: str) -> str:
        """Map a performer name to its page path: ``Lana Rhoades`` -> ``Lana-Rhoades``.

        Everything except ASCII letters, digits and whitespace is dropped;
        each whitespace run becomes a single hyphen. Returns ``""`` when
        nothing usable remains.
        """
        # Keep ALL whitespace (tabs, NBSP, ...) here, not just the literal
        # space; otherwise such separators are deleted and name parts fuse
        # ("Lana\tRhoades" -> "LanaRhoades") before the \s+ step can see them.
        cleaned = re.sub(r"[^A-Za-z0-9\s]+", "", query).strip()
        return re.sub(r"\s+", "-", cleaned)

    def _card_studio(self, window: str):
        """Build a ``RawStudio`` from the card's channel label, or ``None``.

        The label is optional, and the generic values ``all`` / ``trending``
        are ignored.
        """
        sub_m = _SUBCAT_RE.search(window)
        if not sub_m:
            return None
        name = html.unescape(sub_m.group(1)).strip()
        if not name or name.lower() in ("all", "trending"):
            return None
        return RawStudio(
            external_id=f"{self.sitetag}:studio:{slugify(name)}",
            name=name,
            slug=slugify(name),
        )

    def _card_tags(self, tags) -> list:
        """Turn hashtag names into ``RawTag`` objects, de-duplicated by slug."""
        tag_objs: list[RawTag] = []
        seen_tag: set[str] = set()
        for t in tags:
            sl = slugify(t)
            if not sl or sl in seen_tag:
                continue
            seen_tag.add(sl)
            tag_objs.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=t, slug=sl))
        return tag_objs

    def search(self, query, *, page: int = 1, limit=None):
        """Yield ``RawScene`` objects scraped from the performer page for ``query``.

        Args:
            query: Performer name; it is also used verbatim (stripped) as the
                performer attached to every yielded scene.
            page: 1-based page number; ``?page=N`` is appended only for N > 1.
            limit: Optional maximum number of scenes to yield (a falsy value
                means no limit).

        Yields nothing when the query has no usable characters, the fetch
        raises, or the response status is not 200.
        """
        path = self._performer_path(query)
        if not path:
            return
        url = f"{_BASE}/{path}.html" + (f"?page={page}" if page > 1 else "")
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            # Best-effort boundary: a failed fetch ends this search quietly.
            log.warning("sxyprn search fetch failed (%s): %s", url, e)
            return
        if r.status_code != 200:
            log.debug("sxyprn search %s status=%d", url, r.status_code)
            return

        text = r.text
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }
        anchors = list(_CARD_ANCHOR_RE.finditer(text))
        seen: set[str] = set()
        yielded = 0
        for idx, m in enumerate(anchors):
            post_id = m.group("id")
            if post_id in seen:
                continue
            seen.add(post_id)

            # The title element is looked up page-wide by post ID, because it
            # sits after the card's video container.
            tm = re.search(
                _POSTTIME_RE.pattern.format(id=re.escape(post_id)), text, re.IGNORECASE
            )
            if not tm:
                continue
            title, tags = _clean_title(tm.group(1))
            if not title:
                continue
            title_l = title.lower()
            # Strict: the scene must really be about the performer (every query
            # token present in the title) — guards against unrelated cards on
            # the page and against over-attribution.
            if query_tokens and not all(tok in title_l for tok in query_tokens):
                continue

            # The card's metadata window ends at the next anchor of a DIFFERENT
            # post. A repeated anchor for the same post (skipped above via
            # `seen`) must not cut the window short.
            win_end = next(
                (a.start() for a in anchors[idx + 1:] if a.group("id") != post_id),
                m.end() + self._card_window,
            )
            window = text[m.start():win_end]

            scene_url = f"{_BASE}/post/{post_id}.html"
            dur_m = _DURATION_RE.search(window)
            duration_sec = _parse_duration(dur_m.group(1)) if dur_m else None
            thumb_m = _THUMB_RE.search(window)
            thumb = thumb_m.group(1) if thumb_m else None
            if thumb and thumb.startswith("//"):
                # Protocol-relative URL -> absolute https URL.
                thumb = "https:" + thumb

            # A fresh performer object per scene (instances are not shared).
            performer = RawPerformer(
                external_id=f"{self.sitetag}:performer:{slugify(query)}",
                name=query.strip(),
            )

            yield RawScene(
                external_id=f"{self.sitetag}:{post_id}",
                title=title,
                duration_sec=duration_sec,
                url=scene_url,
                studio=self._card_studio(window),
                performers=[performer],
                tags=self._card_tags(tags),
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        duration_sec=duration_sec,
                        thumbnail_url=thumb,
                    )
                ],
            )
            yielded += 1
            if limit and yielded >= limit:
                return
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue