Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
177 lines
6.2 KiB
Python
177 lines
6.2 KiB
Python
"""freshporno.org — latest-vids browse scraper.
|
|
|
|
Pilot #2 (after the shyfap failure). Hypothesis: freshporno keeps the original studio
titles ("Straighten Her Out" instead of custom rebranding like shyfap), so fuzzy title
matching against canonical scenes should work. Bonus: channel = studio 1:1 (Pure Taboo, Brazzers, etc.).
|
|
|
|
URL patterns:
|
|
- Listing: `/` (page 1), `/2/`, `/3/`, ... (last page was `/391/` at the time of writing)
|
|
- Scene: `/videos/<slug>/`
|
|
- Channels: `/channels/<slug>/` (= studio)
|
|
- Models: `/models/<slug>/` (= performer)
|
|
- Tags: `/tags/<slug>/` (= category)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import date, datetime, timedelta
|
|
from urllib.parse import urljoin
|
|
|
|
from app.connectors.base import (
|
|
RawFingerprint,
|
|
RawPerformer,
|
|
RawPlaybackSource,
|
|
RawScene,
|
|
RawStudio,
|
|
RawTag,
|
|
)
|
|
from app.connectors.direct_scrapers._browse_base import (
|
|
BaseBrowseScraper,
|
|
compute_thumbnail_phash,
|
|
meta_content,
|
|
)
|
|
|
|
# Site origin; every absolute URL this module builds or matches starts with it.
_BASE = "https://freshporno.org"

# Regex-escaped origin, shared by all link patterns below so the host name is
# spelled in exactly one place (previously it was repeated in every literal).
_BASE_RE = re.escape(_BASE)


def _entity_link_re(section: str) -> re.Pattern[str]:
    """Compile the anchor pattern for ``href="<origin>/<section>/<slug>/"``.

    Group 1 captures the slug, group 2 the raw text between the end of the
    opening tag and the next ``<``. Matching is case-insensitive.
    """
    return re.compile(
        rf'href="{_BASE_RE}/{section}/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
    )


# Absolute scene-page links; group 1 is the full scene URL.
_SCENE_URL_RE = re.compile(rf'href="({_BASE_RE}/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Entity links: channels (= studio), models (= performer), tags (= category).
_CHANNEL_LINK_RE = _entity_link_re("channels")
_MODEL_LINK_RE = _entity_link_re("models")
_TAG_LINK_RE = _entity_link_re("tags")
|
|
# Duration via <time datetime="PT46M01S"> (ISO 8601 duration). Fallback: meta property
|
|
_TIME_DURATION_RE = re.compile(r'<time[^>]+datetime="PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?"', re.IGNORECASE)
|
|
|
|
|
|
def _parse_iso_duration_to_sec(html: str) -> int | None:
|
|
m = _TIME_DURATION_RE.search(html)
|
|
if not m:
|
|
return None
|
|
h = int(m.group(1) or 0)
|
|
mn = int(m.group(2) or 0)
|
|
s = int(m.group(3) or 0)
|
|
return h * 3600 + mn * 60 + s
|
|
|
|
|
|
class FreshpornoScraper(BaseBrowseScraper):
    """Browse scraper for freshporno.org.

    Provides the site-specific pieces: the listing URL for a page number,
    extraction of scene URLs from a listing page, and conversion of a scene
    page into a `RawScene`. How these methods are driven is defined by
    `BaseBrowseScraper`, which is not shown in this module.
    """

    sitetag = "freshpornoorg"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for *page*; page 1 (or lower) is the site root."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the distinct scene URLs found in *listing_html*, in page order."""
        # dict.fromkeys drops repeats while preserving first-seen order.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    @staticmethod
    def _named_links(pattern: re.Pattern[str], page_html: str) -> list[tuple[str, str]]:
        """Return ``(slug, name)`` for each *pattern* match that has a usable name.

        *pattern* must capture the slug in group 1 and the raw anchor text in
        group 2. The text is HTML-unescaped and stripped; matches whose text
        is empty after that (e.g. whitespace in front of a thumbnail `<img>`)
        are dropped so they cannot shadow the properly named link.
        """
        # Local import: keeps the module's top-level import block untouched.
        from html import unescape

        links: list[tuple[str, str]] = []
        for match in pattern.finditer(page_html):
            name = unescape(match.group(2)).strip()
            if name:
                links.append((match.group(1), name))
        return links

    @classmethod
    def _unique_named_links(cls, pattern: re.Pattern[str], page_html: str) -> dict[str, str]:
        """Return ``{slug: name}`` in page order; the first usable name per slug wins."""
        unique: dict[str, str] = {}
        for slug, name in cls._named_links(pattern, page_html):
            unique.setdefault(slug, name)
        return unique

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a scene page.

        Args:
            scene_url: URL of the scene page; used as the scene URL, the
                playback page URL and as part of the external id.
            detail_html: Raw markup of that page.

        Returns:
            The parsed scene, or ``None`` when no title can be found in either
            ``og:title`` or the ``<title>`` element.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            m = re.search(r"<title>([^<]+)</title>", detail_html, re.IGNORECASE)
            if m:
                title = m.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration"> in seconds OR <time datetime="PT46M01S">.
        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)
        else:
            duration_sec = _parse_iso_duration_to_sec(detail_html)

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Channel = studio: the first `/channels/<slug>/` link on the page.
        # Links whose anchor text is just "Channels" are navigation, not a
        # specific channel, so they are skipped.
        studio: RawStudio | None = None
        for slug, name in self._named_links(_CHANNEL_LINK_RE, detail_html):
            if name.lower() == "channels":
                continue
            studio = RawStudio(
                external_id=f"freshpornoorg:channel:{slug}",
                name=name,
                slug=slug,
            )
            break

        # Performers: every distinct `/models/<slug>/` link.
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"freshpornoorg:model:{slug}", name=name)
            for slug, name in self._unique_named_links(_MODEL_LINK_RE, detail_html).items()
        ]

        # Tags. Skip multi-tag composite slugs: freshporno sometimes emits URLs
        # like /tags/face-sitting-fake-tits-freckles-girlfriend-... that are a
        # combination of tags rather than a single tag. Normal tags are under
        # 40 characters; anything over 60 is certainly such a composite.
        tags: list[RawTag] = [
            RawTag(external_id=f"freshpornoorg:tag:{slug}", name=name, slug=slug)
            for slug, name in self._unique_named_links(_TAG_LINK_RE, detail_html).items()
            if len(slug) <= 60
        ]

        # Perceptual hash of the thumbnail. freshporno serves its own
        # screenshots (preview.mp4.jpg), so this may not match canonical
        # phashes either; testing will show.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|