Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
183 lines
7.1 KiB
Python
183 lines
7.1 KiB
Python
"""shyfap.net — latest-vids browse scraper.
|
|
|
|
Browse-only (not search-driven). Sitetag `shyfapnet`. Rich metadata on the detail
page (meta tags + body links): title, studio, performers, tags, duration,
description, upload_date, embed_url.

First pilot of a browse-mode scraper (2026-05-12) — verifies whether detail-page
metadata is sufficient for a canonical match rate >5%. If so → we extend to porn00,
fullmovies, pornxp, freshporno, 4k69, hdporn.gg.

URL patterns:
- Listing: `/videos_1/` (page 1), `/videos_1/<n>/` (page 2+)
- Scene: `/video/<slug>_v<id>/`
- Embed: `/embed/<id>` (from the og:video meta tag)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import date, datetime
|
|
from urllib.parse import urljoin
|
|
|
|
from app.connectors.base import RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
meta_content,
|
|
)
|
|
|
|
# Site root; relative scene paths and fallback embed URLs are built against it.
_BASE = "https://www.shyfap.net"

# Listing pages: group 1 is the relative `/video/<slug>_v<id>/` path of a scene.
_SCENE_URL_RE = re.compile(
    r'href="(/video/[a-z0-9\-]+_v\d+/)"',
    flags=re.I,
)

# Detail pages: each of the three link patterns below captures
# (slug, numeric id, anchor text) as groups 1-3.
_STUDIO_LINK_RE = re.compile(
    r'href="/studio/([a-z0-9\-]+)_s(\d+)/"[^>]*>([^<]+)',
    flags=re.I,
)
_PORNSTAR_LINK_RE = re.compile(
    r'href="/pornstar/([a-z0-9\-]+)_p(\d+)/"[^>]*>([^<]+)',
    flags=re.I,
)
_TAG_LINK_RE = re.compile(
    r'href="/tag/([a-z0-9\-]+)_t(\d+)/"[^>]*>([^<]+)',
    flags=re.I,
)

# /video/<slug>_v<id>/ — the id from the URL is used as the stable internal ID
# (e.g. in external_id) rather than the `ya:ovs:id` meta value, to avoid drift
# between the meta tag and the URL.
_INTERNAL_ID_RE = re.compile(r"_v(\d+)/?$", flags=re.I)
|
|
|
|
|
|
class ShyfapScraper(BaseBrowseScraper):
    """Browse-mode scraper for the shyfap.net latest-videos listing.

    Defines the listing-URL builder, the listing-page link extractor and the
    detail-page parser for this site.
    NOTE(review): these are presumably the hooks BaseBrowseScraper drives —
    confirm against the base class.
    """

    sitetag = "shyfapnet"

    def _listing_url(self, page: int) -> str:
        """Return the absolute listing URL for ``page``.

        shyfap quirk: the ``_1`` suffix is always present; pages after the first
        append an extra ``/<n>/`` segment. Any ``page <= 1`` maps to the first
        page.
        """
        if page <= 1:
            return f"{_BASE}/videos_1/"
        return f"{_BASE}/videos_1/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the absolute scene URLs linked from ``listing_html``.

        Repeated links are dropped; first-seen order is preserved.
        """
        # dict.fromkeys gives an ordered, de-duplicated view of the relative paths.
        unique_paths = dict.fromkeys(
            match.group(1) for match in _SCENE_URL_RE.finditer(listing_html)
        )
        return [urljoin(_BASE, path) for path in unique_paths]

    @staticmethod
    def _named_links(
        pattern: re.Pattern[str], page_html: str
    ) -> list[tuple[str, str, str]]:
        """Collect ``(slug, numeric_id, name)`` for each distinct link in ``page_html``.

        ``pattern`` must capture slug, id and anchor text as groups 1-3. Results
        are de-duplicated by id in document order.

        The anchor text comes straight from raw HTML, so character references
        (``&amp;``, ``&#039;`` ...) are decoded before use. Matches whose text is
        empty after trimming are skipped *without* marking the id as seen, so a
        text-less link (e.g. one wrapping only whitespace before an image) cannot
        hide a later, properly named link to the same entity.
        """
        from html import unescape  # local import keeps this block self-contained

        found: dict[str, tuple[str, str, str]] = {}
        for match in pattern.finditer(page_html):
            slug, ident, raw_text = match.groups()
            label = unescape(raw_text).strip()
            if not label or ident in found:
                continue
            found[ident] = (slug, ident, label)
        return list(found.values())

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from one scene detail page.

        Args:
            scene_url: Absolute URL of the detail page; also the source of the
                numeric scene id used for the fallback embed URL.
            detail_html: HTML of that page.

        Returns:
            The populated ``RawScene``, or ``None`` when no title can be found.
        """
        # Title from og:title, falling back to the <title> element up to the
        # first " - " / "|" separator.
        title = meta_content(detail_html, property="og:title")
        if not title:
            title_match = re.search(
                r"<title>([^<|]+)(?:\s*[-|])", detail_html, re.IGNORECASE
            )
            if title_match:
                title = title_match.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration" content="2436"> (seconds).
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as "²" that int() rejects, which would raise instead of leaving
        # the duration unset.
        duration_sec: int | None = None
        dur_str = (meta_content(detail_html, property="video:duration") or "").strip()
        if dur_str.isdecimal():
            duration_sec = int(dur_str)

        # Upload date:
        #   <meta property="ya:ovs:upload_date" content="2021-12-07T09:07:11+03:00">
        # This is the upload date on shyfap, NOT the scene's real release date.
        # Still better than None: uploads usually happen within days of the
        # studio release, so for the resolver's date_proximity check (7-day
        # window) it is usually enough for a match.
        release_date: date | None = None
        upload_str = (meta_content(detail_html, property="ya:ovs:upload_date") or "").strip()
        if upload_str:
            try:
                release_date = datetime.fromisoformat(upload_str).date()
            except ValueError:
                # Before Python 3.11 fromisoformat() rejects e.g. a trailing "Z".
                # Only the calendar date is needed, so retry on the YYYY-MM-DD
                # prefix before giving up.
                try:
                    release_date = date.fromisoformat(upload_str[:10])
                except ValueError:
                    release_date = None

        # Thumbnail: og:image
        thumbnail_url = meta_content(detail_html, property="og:image")

        # Numeric id from the URL, used to build the embed URL fallback.
        internal_id: str | None = None
        id_match = _INTERNAL_ID_RE.search(scene_url)
        if id_match:
            internal_id = id_match.group(1)

        # Embed URL: og:video (usually /embed/<id>), else derived from the id.
        embed_url = meta_content(detail_html, property="og:video")
        if not embed_url and internal_id:
            embed_url = f"{_BASE}/embed/{internal_id}"

        # Studio — the first named `/studio/<slug>_s<id>/` link on the page.
        studio: RawStudio | None = None
        studio_links = self._named_links(_STUDIO_LINK_RE, detail_html)
        if studio_links:
            studio_slug, studio_id, studio_name = studio_links[0]
            studio = RawStudio(
                external_id=f"shyfapnet:studio:{studio_id}",
                name=studio_name,
                slug=studio_slug,
            )

        # Performers — every `/pornstar/<slug>_p<id>/` link (usually 1-3 per scene).
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"shyfapnet:performer:{pid}", name=name)
            for _slug, pid, name in self._named_links(_PORNSTAR_LINK_RE, detail_html)
        ]

        # Tags — every `/tag/<slug>_t<id>/` link (usually 10-25 per scene).
        tags: list[RawTag] = [
            RawTag(external_id=f"shyfapnet:tag:{tid}", name=name, slug=slug)
            for slug, tid, name in self._named_links(_TAG_LINK_RE, detail_html)
        ]

        # Playback source — embed_url (mobile WebView fallback). Stream extraction
        # via app/extractors/__init__.py needs a separate registry entry; for the
        # pilot scraper we stay embed-only (WebView), direct mp4 is a follow-up.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                embed_url=embed_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        # Perceptual hash of the thumbnail. Resolver Path 3 (find_by_phash_within,
        # Hamming <= 5) auto-merges when TPDB/StashDB holds a fingerprint of the
        # same scene. Independent of shyfap's title rebranding — it is derived
        # from a frame of the scene.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            phash = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if phash:
                fingerprints.append(RawFingerprint(kind="phash", value=phash))

        # NOTE(review): external_id is keyed on the full scene URL, not on the
        # numeric id extracted above. Left unchanged so identifiers already
        # stored for this sitetag stay stable — confirm this is intended.
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|