Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
81 lines
2.8 KiB
Python
81 lines
2.8 KiB
Python
"""Universal thumbnail URL extractor for tube pages.
|
|
|
|
Direct scrapery (search-only) zwracają RawScene z thumbnail_url=None dla większości
|
|
tube'ów (xnxx, hdporn92, sxyland, sxyprn). Detail page zawiera URL miniatury w
|
|
jednym z patternów:
|
|
|
|
1. **OpenGraph** (najbardziej powszechne):
|
|
`<meta property="og:image" content="https://cdn.../thumb.jpg">`
|
|
2. **Twitter Card** (fallback gdy og:image brak):
|
|
`<meta name="twitter:image" content="...">`
|
|
3. **Schema.org VideoObject LD-JSON**:
|
|
`"thumbnailUrl": "https://..."` lub `"thumbnailUrl": ["url1", "url2"]`
|
|
4. **html5player** (KVS-based — xnxx/xvideos):
|
|
`html5player.setThumbUrl('https://thumb-cdn77.xnxx-cdn.com/.../t.jpg')`
|
|
|
|
Funkcja zwraca pierwszy znaleziony URL (string), lub None gdy żaden nie pasuje.
|
|
"""
|
|
from __future__ import annotations

import json
import re
from html import unescape
|
|
|
|
def _meta_content_re(key_pattern: str) -> re.Pattern[str]:
    """Compile a regex that captures the ``content`` of a matching ``<meta>`` tag.

    The tag is identified by a ``property=`` or ``name=`` attribute whose value
    matches *key_pattern*. Attribute order, quote style (single or double) and
    unrelated extra attributes do not matter. The captured value is in group 1
    (double-quoted ``content``) or group 2 (single-quoted ``content``).

    Args:
        key_pattern: Regex fragment for the meta key, e.g. ``r"og:image"``.

    Returns:
        The compiled, case-insensitive pattern.
    """
    return re.compile(
        r"<meta\b"
        # Lookahead: somewhere inside this tag (up to the closing '>') there is
        # a property=/name= attribute carrying the key. The trailing character
        # class stops `og:image` from matching `og:image:width` and friends.
        r"(?=[^>]*(?<![\w-])(?:property|name)\s*=\s*['\"]?(?:" + key_pattern + r")['\"\s/>])"
        # Then capture the content attribute itself; the lookbehind rejects
        # look-alikes such as `data-content=`.
        r"[^>]*?(?<![\w-])content\s*=\s*(?:\"([^\"]*)\"|'([^']*)')",
        re.IGNORECASE,
    )


# OpenGraph image. `og:image:url` / `og:image:secure_url` are the structured
# spellings of the same property.
_OG_IMAGE_RE = _meta_content_re(r"og:image(?::(?:secure_)?url)?")

# Twitter Card image (`twitter:image:src` is the older spelling).
_TWITTER_IMAGE_RE = _meta_content_re(r"twitter:image(?::src)?")

# Schema.org VideoObject LD-JSON: either a plain string (group 1) or the first
# element of an array (group 2).
_LD_THUMB_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*(?:"([^"]+)"|\[\s*"([^"]+)")', re.IGNORECASE
)

# html5player (KVS-based — xnxx/xvideos): `html5player.setThumbUrl('...')`.
_KVS_THUMB_RE = re.compile(
    r"setThumbUrl\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE
)

# hqporner: old WordPress without og:image / schema. The `_main.jpg` poster
# sits in the onclick attribute (`changeImage('//cdn/_main.jpg', 'cover_<id>')`)
# or in the preload array (`preload_<id> = ['//.../_1.jpg', ...]`). We extract
# `_main.jpg` from the fastporndelivery.hqporner.com CDN — it is the canonical
# poster; the `_1..N.jpg` frames are the hover animation.
_HQPORNER_THUMB_RE = re.compile(
    r"['\"](//[a-z0-9.\-]*hqporner[a-z0-9.\-]*/imgs/[^'\"]+_main\.jpg)['\"]",
    re.IGNORECASE,
)


def _decode_json_string(raw: str) -> str:
    """Undo JSON string escaping in a value captured from LD-JSON.

    Many encoders escape forward slashes, so the raw capture can look like
    ``https:`` followed by backslash-slash pairs instead of a usable URL.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        # Not a well-formed JSON string body (e.g. the capture was cut at an
        # escaped quote). Still repair the most common escape.
        return raw.replace("\\/", "/")


def _clean_url(raw: str) -> str | None:
    """Normalize one candidate; return None when it is not a usable URL."""
    url = raw.strip()
    # Inline `data:` placeholders (lazy-load stubs) are not fetchable thumbnails.
    if not url or url[:5].lower() == "data:":
        return None
    # Protocol-relative `//cdn/path` → https.
    if url.startswith("//"):
        return "https:" + url
    return url


def _iter_candidates(page: str):
    """Yield raw thumbnail candidates lazily, in priority order.

    Being a generator, later (rarer) patterns are only scanned when every
    earlier candidate has been rejected by the caller.
    """
    for meta_re in (_OG_IMAGE_RE, _TWITTER_IMAGE_RE):
        for m in meta_re.finditer(page):
            # Attribute values are HTML-escaped (`&amp;` in query strings).
            yield unescape(m.group(1) or m.group(2) or "")
    for m in _LD_THUMB_RE.finditer(page):
        yield _decode_json_string(m.group(1) or m.group(2) or "")
    for m in _KVS_THUMB_RE.finditer(page):
        yield m.group(1)
    for m in _HQPORNER_THUMB_RE.finditer(page):
        yield m.group(1)


def extract_thumbnail_url(html: str) -> str | None:
    """Return the first usable thumbnail URL found in *html*, or None.

    Patterns are tried in this order: OpenGraph → Twitter Card → LD-JSON
    ``thumbnailUrl`` → KVS ``html5player.setThumbUrl`` → hqporner ``_main.jpg``
    poster. og:image is the most common (most WordPress and KVS-based tubes);
    the others are fallbacks for niche templates.

    Every candidate is cleaned the same way regardless of which pattern
    produced it: surrounding whitespace is stripped, empty values and ``data:``
    URIs are skipped (the search continues with the next candidate), and
    protocol-relative ``//host/path`` URLs are upgraded to ``https:``. Meta
    attribute values are HTML-unescaped and LD-JSON values are JSON-unescaped.

    Args:
        html: Raw HTML of the detail page. Empty or None yields None.

    Returns:
        The thumbnail URL, or None when no pattern yields a usable one.
    """
    if not html:
        return None

    for candidate in _iter_candidates(html):
        if (url := _clean_url(candidate)) is not None:
            return url

    return None
|