goon/app/extractors/thumb_extract.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

81 lines
2.8 KiB
Python

"""Universal thumbnail URL extractor for tube pages.
Direct (search-only) scrapers return a RawScene with thumbnail_url=None for most
tube sites (xnxx, hdporn92, sxyland, sxyprn). The detail page carries the thumbnail
URL in one of these patterns:
1. **OpenGraph** (the most common):
`<meta property="og:image" content="https://cdn.../thumb.jpg">`
2. **Twitter Card** (fallback when og:image is missing):
`<meta name="twitter:image" content="...">`
3. **Schema.org VideoObject LD-JSON**:
`"thumbnailUrl": "https://..."` or `"thumbnailUrl": ["url1", "url2"]`
4. **html5player** (KVS-based — xnxx/xvideos):
`html5player.setThumbUrl('https://thumb-cdn77.xnxx-cdn.com/.../t.jpg')`
5. **hqporner poster** (old WordPress theme without og:image / schema):
a quoted protocol-relative `//...hqporner.../imgs/..._main.jpg` URL
The function returns the first URL found (a string), or None when no pattern matches.
"""
from __future__ import annotations
import re
_OG_IMAGE_RE = re.compile(
r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
_TWITTER_IMAGE_RE = re.compile(
r'<meta\s+name="twitter:image(?::src)?"\s+content="([^"]+)"', re.IGNORECASE
)
_LD_THUMB_RE = re.compile(
r'"thumbnailUrl"\s*:\s*(?:"([^"]+)"|\[\s*"([^"]+)")', re.IGNORECASE
)
_KVS_THUMB_RE = re.compile(
r"setThumbUrl\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE
)
# hqporner: stare WordPress bez og:image / schema. Poster `_main.jpg` siedzi w
# atrybucie onclick (`changeImage('//cdn/_main.jpg', 'cover_<id>')`) lub w preload
# array (`preload_<id> = ['//.../_1.jpg', ...]`). Wyciągamy `_main.jpg` z CDN-a
# fastporndelivery.hqporner.com — to canonical poster, frames `_1..N.jpg` to
# hover animation.
_HQPORNER_THUMB_RE = re.compile(
r"['\"](//[a-z0-9.\-]*hqporner[a-z0-9.\-]*/imgs/[^'\"]+_main\.jpg)['\"]",
re.IGNORECASE,
)
def extract_thumbnail_url(html: str) -> str | None:
"""Zwraca pierwszą znalezioną miniaturkę URL lub None.
Kolejność: OG → Twitter → LD-JSON → KVS html5player. og:image jest
najpopularniejszy (większość WordPress + KVS-based tubes); pozostałe to
fallback dla niszowych templatek.
"""
if not html:
return None
if (m := _OG_IMAGE_RE.search(html)):
url = m.group(1).strip()
if url and not url.startswith("data:"):
return url
if (m := _TWITTER_IMAGE_RE.search(html)):
url = m.group(1).strip()
if url and not url.startswith("data:"):
return url
if (m := _LD_THUMB_RE.search(html)):
url = (m.group(1) or m.group(2) or "").strip()
if url:
return url
if (m := _KVS_THUMB_RE.search(html)):
url = m.group(1).strip()
if url:
return url
if (m := _HQPORNER_THUMB_RE.search(html)):
url = m.group(1).strip()
if url:
# Protocol-relative `//cdn/path` → https.
return "https:" + url if url.startswith("//") else url
return None