Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
91 lines
3.5 KiB
Python
91 lines
3.5 KiB
Python
"""Universal duration extractor for tube pages.
|
|
|
|
Direct scrapers (xvideos, xnxx, youporn, porntrex, …) are search-only: they fetch
the listing and extract only the URL plus a slug used as the title. The duration
only shows up on the detail page, where it is available in one of these patterns:

1. **OpenGraph numeric** (youporn, redtube, eporner):
   `<meta property="og:video:duration" content="992">` — seconds.
2. **OpenGraph ISO 8601** (rare):
   `<meta property="og:video:duration" content="PT16M32S">`.
3. **Schema.org VideoObject LD-JSON** (xvideos, xnxx, KVS-based):
   `"duration": "PT00H07M10S"` inside a JSON-LD `<script type="application/ld+json">`.
4. **itemprop microdata** (sxyland, 0dayxx, some WordPress sites):
   `<meta itemprop="duration" content="P0DT0H21M13S">` — ISO 8601 with an optional
   `P<days>D` prefix and an optional `T` block holding the H/M/S components.
5. **Meta description prose** (hqporner):
   `Video duration is 6min 55sec` or `1h 23min 5sec`.

The function returns the first match found as integer seconds, or None.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# OpenGraph-style meta tag. Accepts the "og:video:duration", "og:duration" and
# "video:duration" property spellings; group 1 is the raw `content` attribute.
_OG_DURATION_RE = re.compile(
    r'<meta\s+property="(?:og:(?:video:)?|video:)duration"\s+content="([^"]+)"',
    flags=re.I,
)

# JSON-LD `"duration": "P…"` member; group 1 is the ISO 8601 value.
_LD_DURATION_RE = re.compile(
    r'"duration"\s*:\s*"(P[0-9DTHMS]+)"',
    flags=re.I,
)

# Microdata: itemprop="duration" followed, within the same tag, by a `content`
# attribute; group 1 is the raw attribute value.
_ITEMPROP_DURATION_RE = re.compile(
    r'itemprop="duration"[^>]*content="([^"]+)"',
    flags=re.I,
)

# Hqporner-style meta description: "Video duration is 6min 55sec" or
# "1h 23min 5sec". Generic — it also fits other tubes that put the duration
# into the meta description as prose. Groups are (hours, minutes, seconds);
# each one is optional and is None when absent.
_META_DESC_DURATION_RE = re.compile(
    r'(?:duration\s+is\s+|<meta\s+name="description"\s+content="[^"]*duration\s+is\s+)'
    r'(?:(\d+)\s*h(?:our)?s?)?\s*(?:(\d+)\s*min)?\s*(?:(\d+)\s*sec)?',
    flags=re.I,
)
|
|
# Generalized ISO 8601 duration: P[<n>D][T[<n>H][<n>M][<n>[.fff]S]]. Covers
# `PT16M32S`, `PT00H07M10S` and `P0DT0H21M13S` with a single pattern. Days
# rarely make sense (a scene longer than 24h), but they are kept because some
# tubes emit `P0D` for tidiness. A decimal fraction on the seconds component
# (`.` or `,`) is tolerated but deliberately NOT captured, so the pattern still
# exposes exactly four groups: (days, hours, minutes, whole seconds).
_ISO_DURATION_RE = re.compile(
    r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:[.,]\d+)?S)?)?$",
    re.IGNORECASE,
)


def _parse_iso8601(value: str) -> int | None:
    """Convert an ISO 8601 duration string to whole seconds.

    Examples: ``P0DT0H21M13S`` -> 1273, ``PT00H07M10S`` -> 430,
    ``PT16M32.5S`` -> 992 (the fractional part of the seconds is dropped).

    Args:
        value: Candidate duration string; surrounding whitespace is ignored
            and the designator letters are matched case-insensitively.

    Returns:
        Total number of seconds, or None when the string is not in the
        supported format or when the total is 0 (a zero duration is treated
        as a placeholder carrying no information).
    """
    match = _ISO_DURATION_RE.fullmatch(value.strip())
    if match is None:
        return None

    # Components missing from the string come back as None and count as zero.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    total = days * 86400 + hours * 3600 + minutes * 60 + seconds
    return total if total > 0 else None
|
|
|
|
|
|
def extract_duration_sec(html: str) -> int | None:
    """Return the video duration in seconds found in *html*, or None.

    Strategies are tried in a fixed order; the first one that yields a
    positive duration wins:

    1. OpenGraph meta tag — plain seconds (``content="992"``) or an ISO 8601
       value (``content="PT16M32S"``).
    2. JSON-LD ``"duration": "PT00H07M10S"``.
    3. ``itemprop="duration"`` microdata with an ISO 8601 value.
    4. Prose such as "duration is 6min 55sec" (Hqporner-style meta
       description).

    Within a strategy every occurrence is examined in document order, so a
    placeholder (zero or unparseable value) does not hide a later usable one.

    Args:
        html: Raw HTML of a detail page. Empty input yields None.

    Returns:
        Duration in whole seconds (always > 0), or None when no strategy
        produces a usable value.
    """
    if not html:
        return None

    for tag in _OG_DURATION_RE.finditer(html):
        raw = tag.group(1).strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as "²" that int() rejects with ValueError.
        if raw.isdecimal():
            seconds = int(raw)
            if seconds > 0:
                return seconds
            continue
        # Anything that is not an ISO 8601 duration parses to None.
        if (parsed := _parse_iso8601(raw)) is not None:
            return parsed

    for member in _LD_DURATION_RE.finditer(html):
        if (parsed := _parse_iso8601(member.group(1))) is not None:
            return parsed

    for tag in _ITEMPROP_DURATION_RE.finditer(html):
        if (parsed := _parse_iso8601(tag.group(1))) is not None:
            return parsed

    # Hqporner: "Video duration is 6min 55sec" inside the meta description.
    # The phrase "duration is" may also occur in unrelated prose with no
    # numbers after it; such hits total 0 and are skipped.
    for phrase in _META_DESC_DURATION_RE.finditer(html):
        hours, minutes, secs = (int(g) if g else 0 for g in phrase.groups())
        total = hours * 3600 + minutes * 60 + secs
        if total > 0:
            return total

    return None
|