"""shyfap.net — latest-videos browse scraper.

Browse-only (not search-driven). Sitetag `shyfapnet`.

Rich metadata on the detail page (meta tags + body links): title, studio,
performers, tags, duration, description, upload_date, embed_url.

First pilot of a browse-mode scraper (2026-05-12) — verifies whether the
detail-page metadata is sufficient for a canonical match rate >5%. If so, we
extend the approach to porn00, fullmovies, pornxp, freshporno, 4k69, hdporn.gg.

URL patterns:
- Listing: `/videos_1/` (page 1), `/videos_1/<N>/` (page 2+)
- Scene:   `/video/<slug>_v<id>/`
- Embed:   `/embed/<id>` (taken from the og:video meta tag)
"""

from __future__ import annotations

import re
from datetime import date, datetime
from urllib.parse import urljoin

from app.connectors.base import RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
from app.connectors.direct_scrapers._browse_base import (
    BaseBrowseScraper,
    compute_thumbnail_phash,
    meta_content,
)

# Site origin; relative links found in the HTML are resolved against it.
_BASE = "https://www.shyfap.net"

# Scene links on listing pages: /video/<slug>_v<id>/ (group 1 = relative URL).
_SCENE_URL_RE = re.compile(r'href="(/video/[a-z0-9\-]+_v\d+/)"', re.IGNORECASE)
# Entity links on detail pages. Groups: 1 = slug, 2 = numeric id, 3 = link text.
_STUDIO_LINK_RE = re.compile(
    r'href="/studio/([a-z0-9\-]+)_s(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_PORNSTAR_LINK_RE = re.compile(
    r'href="/pornstar/([a-z0-9\-]+)_p(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_TAG_LINK_RE = re.compile(
    r'href="/tag/([a-z0-9\-]+)_t(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)

# /video/<slug>_v<id>/ — the id from the URL is used as the stable internal ID
# (e.g. in external_id), not the `ya:ovs:id` meta value, to avoid drift between
# the meta tag and the URL.
# NOTE(review): RawScene.external_id is built from the full scene URL, not from
# this id alone — confirm that is the intended key.
_INTERNAL_ID_RE = re.compile(r"_v(\d+)/?$", re.IGNORECASE)


def _link_text(raw: str) -> str:
    """Return anchor text with HTML entities decoded and whitespace collapsed.

    The link regexes capture raw markup between ``>`` and the next ``<``, so
    names may still contain entities (``&amp;``) or line breaks.
    """
    # Function-scope import keeps this fix self-contained in the block.
    from html import unescape

    return " ".join(unescape(raw).split())


def _unique_links(pattern: re.Pattern[str], detail_html: str) -> list[tuple[str, str, str]]:
    """Return ``(slug, id, name)`` for each distinct id matched by ``pattern``.

    ``pattern`` must expose slug, numeric id and link text as groups 1-3.
    Matches whose text is empty after cleaning are ignored *without* marking
    the id as seen, so a later text link to the same entity is still picked up.
    First-seen order is preserved.
    """
    seen_ids: set[str] = set()
    found: list[tuple[str, str, str]] = []
    for match in pattern.finditer(detail_html):
        slug, ext_id = match.group(1), match.group(2)
        name = _link_text(match.group(3))
        if not name or ext_id in seen_ids:
            continue
        seen_ids.add(ext_id)
        found.append((slug, ext_id, name))
    return found


def _parse_duration(raw: str | None) -> int | None:
    """Parse a whole-seconds duration string; return ``None`` if not usable.

    Only plain ASCII digits are accepted: ``str.isdigit`` alone is also true
    for characters such as ``"²"`` that ``int()`` rejects with ``ValueError``.
    """
    if not raw:
        return None
    text = raw.strip()
    if text.isascii() and text.isdigit():
        return int(text)
    return None


def _parse_upload_date(raw: str | None) -> date | None:
    """Parse an ISO-8601 timestamp (or date) into a ``date``; ``None`` on failure."""
    if not raw:
        return None
    text = raw.strip()
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        pass
    # Fallback for timestamp flavours fromisoformat() rejects on some Python
    # versions: the leading YYYY-MM-DD part is all that is needed here.
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        return None


class ShyfapScraper(BaseBrowseScraper):
    """Browse-mode scraper for shyfap.net listing and scene detail pages."""

    sitetag = "shyfapnet"

    def _listing_url(self, page: int) -> str:
        """Return the absolute listing URL for ``page``.

        Page 1 (or anything lower) maps to ``/videos_1/``; page N >= 2 maps to
        ``/videos_1/N/``. The ``_1`` suffix is a site quirk and is always
        present; pagination only adds the extra ``/N/`` segment.
        """
        if page <= 1:
            return f"{_BASE}/videos_1/"
        return f"{_BASE}/videos_1/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from a listing page, de-duplicated in first-seen order."""
        # dict.fromkeys drops repeated links while keeping insertion order.
        relative = dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        return [urljoin(_BASE, rel) for rel in relative]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Args:
            scene_url: Absolute URL of the scene page.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or ``None`` when no title can be found.
        """
        # Title from og:title, falling back to the <title> text before the
        # last "-" / first "|" separator.
        title = meta_content(detail_html, property="og:title")
        if not title:
            m_title = re.search(r"<title>([^<|]+)(?:\s*[-|])", detail_html, re.IGNORECASE)
            if m_title:
                title = m_title.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration" content="2436"> (seconds).
        duration_sec = _parse_duration(meta_content(detail_html, property="video:duration"))

        # Upload date: <meta property="ya:ovs:upload_date" content="2021-12-07T09:07:11+03:00">
        # This is the upload date on shyfap, NOT the scene's real release date.
        # Still better than None: uploads usually land within days of the
        # studio release, which is normally enough for the resolver's
        # date_proximity check (7-day window) to match.
        release_date = _parse_upload_date(
            meta_content(detail_html, property="ya:ovs:upload_date")
        )

        # Thumbnail: og:image.
        thumbnail_url = meta_content(detail_html, property="og:image")

        # Numeric id from the URL, used for the embed-URL fallback below.
        m_id = _INTERNAL_ID_RE.search(scene_url)
        internal_id = m_id.group(1) if m_id else None

        # Embed URL: og:video (usually /embed/<id>), else derived from the id.
        embed_url = meta_content(detail_html, property="og:video")
        if not embed_url and internal_id:
            embed_url = f"{_BASE}/embed/{internal_id}"

        # Studio — the first `/studio/<slug>_s<id>/` link with visible text.
        studio: RawStudio | None = None
        studio_links = _unique_links(_STUDIO_LINK_RE, detail_html)
        if studio_links:
            slug, sid, name = studio_links[0]
            studio = RawStudio(
                external_id=f"shyfapnet:studio:{sid}",
                name=name,
                slug=slug,
            )

        # Performers — every `/pornstar/<slug>_p<id>/` link (usually 1-3 per scene).
        performers = [
            RawPerformer(external_id=f"shyfapnet:performer:{pid}", name=name)
            for _slug, pid, name in _unique_links(_PORNSTAR_LINK_RE, detail_html)
        ]

        # Tags — every `/tag/<slug>_t<id>/` link (usually 10-25 per scene).
        tags = [
            RawTag(external_id=f"shyfapnet:tag:{tid}", name=name, slug=slug)
            for slug, tid, name in _unique_links(_TAG_LINK_RE, detail_html)
        ]

        # Playback source — embed_url (mobile WebView fallback). Stream
        # extraction via app/extractors/__init__.py needs its own registry
        # entry; for the pilot scraper we stay embed-only (WebView), direct
        # mp4 is a follow-up.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                embed_url=embed_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        # Perceptual hash of the thumbnail. Per the original author's note,
        # resolver Path 3 (find_by_phash_within, Hamming <= 5) auto-merges when
        # TPDB/StashDB holds a fingerprint of the same scene; this is
        # independent of shyfap's title rebranding because it comes from a
        # frame of the scene.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            phash = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if phash:
                fingerprints.append(RawFingerprint(kind="phash", value=phash))

        return RawScene(
            # Keyed on the full scene URL; kept as-is so existing records
            # keep resolving to the same external_id.
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )