goon/app/connectors/direct_scrapers/freshporno.py
https://github.com/goon-foss/goon 642f1ab8b8 Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector
Mobile / OTA:
- Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org
- Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version)
- backend.ts: default public backend auto-connect (no manual login)

WebView fallback fix (PlayerScreen INJECTED_JS):
- Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init)
- Context-scoped: only clicks consent buttons inside cookie/gdpr containers
- Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init)

Resolver:
- Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited)
  → composite_score hard-reject / cap; wired into scene_score + bulk_dedup
- aggregator-mode candidate query: LIMIT 500 + title-match ordering

Connectors:
- porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot

landing: APK links → goon-v0.1.9.apk

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 11:20:57 +02:00

191 lines
7.1 KiB
Python

"""freshporno.org — latest-videos browse scraper.
Pilot #2 (after the shyfap failure). Hypothesis: freshporno keeps the original studio titles
("Straighten Her Out" rather than custom rebranding like shyfap), so fuzzy title matching
against canonical scenes should work. Bonus: channel = studio 1:1 (Pure Taboo, Brazzers, etc.).
URL patterns:
- Listing: `/` (page 1), `/2/`, `/3/`, ... (the last page was `/391/` at the time of writing)
- Scene: `/videos/<slug>/`
- Channels: `/channels/<slug>/` (= studio)
- Models: `/models/<slug>/` (= performer)
- Tags: `/tags/<slug>/` (= category)
"""
from __future__ import annotations
import re
from datetime import date, datetime, timedelta
from urllib.parse import urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
# Site origin; listing URLs and the thumbnail-fetch referer are built from it.
_BASE = "https://freshporno.org"
# Absolute scene-page links (`/videos/<slug>/`); group 1 is the full scene URL.
_SCENE_URL_RE = re.compile(r'href="(https://freshporno\.org/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# The three link patterns below share one shape: group 1 is the slug and group 2
# is the text that directly follows the opening tag (everything up to the next "<").
# Channel links (`/channels/<slug>/`) — treated as the studio.
_CHANNEL_LINK_RE = re.compile(
    r'href="https://freshporno\.org/channels/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model links (`/models/<slug>/`) — treated as performers.
_MODEL_LINK_RE = re.compile(
    r'href="https://freshporno\.org/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag links (`/tags/<slug>/`) — treated as categories.
_TAG_LINK_RE = re.compile(
    r'href="https://freshporno\.org/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Duration via <time datetime="PT46M01S"> (ISO 8601 duration). Fallback: meta property
_TIME_DURATION_RE = re.compile(r'<time[^>]+datetime="PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?"', re.IGNORECASE)
def _parse_iso_duration_to_sec(html: str) -> int | None:
m = _TIME_DURATION_RE.search(html)
if not m:
return None
h = int(m.group(1) or 0)
mn = int(m.group(2) or 0)
s = int(m.group(3) or 0)
return h * 3600 + mn * 60 + s
def _clean_anchor_text(raw: str) -> str:
    """Decode HTML entities in regex-captured markup text and trim whitespace."""
    # Function-scope import: the only place this module needs `html.unescape`.
    from html import unescape

    return unescape(raw).strip()


def _collect_links(
    pattern: re.Pattern[str], page_html: str, *, max_slug_len: int | None = None
) -> list[tuple[str, str]]:
    """Return unique ``(slug, name)`` pairs for every link matched by ``pattern``.

    Args:
        pattern: Compiled regex whose group 1 is the slug and group 2 the raw
            anchor text.
        page_html: Markup to scan.
        max_slug_len: When given, slugs longer than this are ignored.

    Returns:
        Pairs in first-seen order, one per slug. Occurrences whose anchor text
        is blank are skipped, so a later occurrence of the same slug that does
        carry a name is still picked up.
    """
    names_by_slug: dict[str, str] = {}
    for match in pattern.finditer(page_html):
        slug = match.group(1)
        if slug in names_by_slug:
            continue
        if max_slug_len is not None and len(slug) > max_slug_len:
            continue
        name = _clean_anchor_text(match.group(2))
        if not name:
            continue
        names_by_slug[slug] = name
    return list(names_by_slug.items())


class FreshpornoScraper(BaseBrowseScraper):
    """Browse scraper for freshporno.org listing pages and scene detail pages."""

    sitetag = "freshpornoorg"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; page 1 (or lower) is the site root."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs found in ``listing_html``, de-duplicated, in page order."""
        # dict.fromkeys keeps first-seen order while dropping repeats.
        return list(
            dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        )

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Args:
            scene_url: URL of the scene page; used for the external id, the
                scene URL and the playback source's page URL.
            detail_html: Raw markup of that page.

        Returns:
            The parsed scene, or ``None`` when neither ``og:title`` nor a
            non-empty ``<title>`` is present.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            m = re.search(r"<title>([^<]+)</title>", detail_html, re.IGNORECASE)
            if m:
                # Raw markup text: decode entities such as &amp; before use.
                title = _clean_anchor_text(m.group(1))
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration"> in seconds, OR
        # <time datetime="PT46M01S"> as a fallback.
        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        # isdecimal() accepts exactly the digit characters int() can parse;
        # isdigit() also accepts e.g. superscripts, on which int() raises.
        if dur_meta and dur_meta.strip().isdecimal():
            duration_sec = int(dur_meta)
        else:
            duration_sec = _parse_iso_duration_to_sec(detail_html)

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Channel = studio: the first `/channels/<slug>/` link on the page.
        # Navigation links whose anchor text is just "Channels" (or blank) are
        # skipped — we want the specific channel of this scene.
        studio: RawStudio | None = None
        for m in _CHANNEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), _clean_anchor_text(m.group(2))
            if name.lower() in ("channels", ""):
                continue
            studio = RawStudio(
                external_id=f"freshpornoorg:channel:{slug}",
                name=name,
                slug=slug,
            )
            break

        # Performers — every `/models/<slug>/` link, one entry per slug.
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"freshpornoorg:model:{slug}", name=name)
            for slug, name in _collect_links(_MODEL_LINK_RE, detail_html)
        ]

        # Tags. Multi-tag composite slugs are skipped: freshporno sometimes
        # emits URLs like /tags/face-sitting-fake-tits-freckles-girlfriend-...
        # that are a combination of tags rather than a single tag. Normal tags
        # are under 40 characters; anything over 60 is certainly a bug.
        tags: list[RawTag] = [
            RawTag(external_id=f"freshpornoorg:tag:{slug}", name=name, slug=slug)
            for slug, name in _collect_links(_TAG_LINK_RE, detail_html, max_slug_len=60)
        ]

        # Phash from the thumbnail. freshporno is known to use its own internal
        # screenshots (preview.mp4.jpg), so this may not match canonical
        # phashes — testing will tell.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        # Release date — freshporno emits
        # `<meta itemprop="uploadDate" content="2026-05-20T...">`. This is the
        # upload date on freshporno, NOT the studio's original release date,
        # but for fresh scenes (uploaded shortly after publication) the gap is
        # at most 3-7 days, which fits the resolver's `date_window_days=7`.
        # Without this field the scene date is NULL -> match score 0 -> a
        # duplicate scene is created instead of the freshporno playback source
        # being attached to the TPDB canonical scene (bug report 2026-05-20:
        # Brazzers Exxtra missing after 05-15).
        release_date_parsed: date | None = None
        if (m := re.search(r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', detail_html)):
            try:
                release_date_parsed = date.fromisoformat(m.group(1))
            except ValueError:
                # Well-formed but impossible date (e.g. month 13): deliberately
                # best-effort, leave the release date unset.
                pass

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            release_date=release_date_parsed,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )