Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector

Mobile / OTA:
- Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org
- Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version)
- backend.ts: default public backend auto-connect (no manual login)

WebView fallback fix (PlayerScreen INJECTED_JS):
- Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init)
- Context-scoped: only clicks consent buttons inside cookie/gdpr containers
- Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init)

Resolver:
- Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited)
  → composite_score hard-reject / cap; wired into scene_score + bulk_dedup
- aggregator-mode candidate query: LIMIT 500 + title-match ordering

Connectors:
- porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot

landing: APK links → goon-v0.1.9.apk

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
https://github.com/goon-foss/goon 2026-05-22 11:20:57 +02:00
parent ad0284585b
commit 642f1ab8b8
36 changed files with 1825 additions and 100 deletions

7
.gitignore vendored
View file

@ -71,6 +71,13 @@ mcp-logs.txt
# ADB / development debug artefakty (screenshots, ui dumps) # ADB / development debug artefakty (screenshots, ui dumps)
.tmp_adb/ .tmp_adb/
# Marketing screenshots — kept local, hosted externally for posts/landing.
# NOT committed: explicit thumbnails risk GitHub TOS takedown.
screenshots/
# Launch / marketing material — local working notes, not part of the codebase.
launch/
# Operational deploy scripts — moved to a private companion repo. Public repo # Operational deploy scripts — moved to a private companion repo. Public repo
# should NOT contain SSH commands, systemd units, or smoke-test playbooks # should NOT contain SSH commands, systemd units, or smoke-test playbooks
# referencing concrete hosts. # referencing concrete hosts.

View file

@ -184,21 +184,19 @@ def resolve_movie_playback(
pb.id, pb.id,
) )
stream = None stream = None
# Mixdrop mxcontent CDN wymaga curl_cffi JA3 → wymusza VPS proxy.
# Pre-public: skip mixdrop direct, fallback na embed_url (mobile WebView z
# phone IP). Bandwidth + anonimowość VPS > UX. Movie ma zwykle 10+ alt
# hosterów (voe/luluvid/doply/etc.), user może wybrać alternative.
if stream and "mxcontent.net" in stream.lower():
log.info(
"movie playback %s: mixdrop mxcontent — skip (VPS-proxy required), WebView fallback",
pb.id,
)
stream = None
if stream: if stream:
type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4" type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4"
# Hostery których CDN wymaga Chrome JA3 (mxcontent dla mixdrop):
# proxy MUSI użyć curl_cffi impersonate inaczej 403. `proxy_impersonate=True`
# idzie przez `raw` → `_proxify_link` ustawi token `i=1`.
cdn_needs_impersonate = "mxcontent.net" in stream.lower()
raw_meta: dict = {"origin": pb.origin, "host": target} raw_meta: dict = {"origin": pb.origin, "host": target}
if cdn_needs_impersonate:
raw_meta["proxy_impersonate"] = True
# Mixdrop: same-session cookies + chrome JA3 wymagane dla mp4.
# Backend extract zamknął sesję — proxy musi re-fetchować
# embed page w fresh curl_cffi session żeby re-extract mp4
# z aktualnymi cookies.
raw_meta["refetch_url"] = target
raw_meta["refetch_hoster"] = "mixdrop"
links.append( links.append(
StreamLink( StreamLink(
stream_url=stream, stream_url=stream,

View file

@ -72,13 +72,22 @@ class Settings(BaseSettings):
sched_movie_ingest_hours: int = Field( sched_movie_ingest_hours: int = Field(
default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS" default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
) )
# Browse-latest scheduler: freshporno/porn00/pornxp newest scenes raz dziennie. # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
# 6h cadence (zmiana z 24h 2026-05-20): user reportował brak Brazzers Exxtra po
# 15-05. Root cause był 2-fold: (1) freshporno publikuje sceny w ciągu dnia, 24h
# cadence łapie tylko te do 05:30 UTC; (2) meta_content/release_date bug osobno.
# 6h = 4 runs/dzień = każda freshporno scena zaingestowana w ciągu ~6h od publik.
sched_browse_latest_hours: int = Field( sched_browse_latest_hours: int = Field(
default=24, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS" default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
) )
sched_browse_latest_max_pages: int = Field( sched_browse_latest_max_pages: int = Field(
default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES" default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
) )
# Bulk-dedup performers safety net — auto-merge duplikatów które resolver-time
# scoring pominął. 12h cadence: leci 2x dziennie (po porannym browse-latest run).
sched_bulk_dedup_hours: int = Field(
default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
)
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log). # w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).

View file

@ -137,6 +137,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.** # (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.**
from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402 from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402
from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402 from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402 from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401 from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
@ -152,6 +153,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
# 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title # 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title
# fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować. # fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować.
Porn00Scraper, Porn00Scraper,
# PornDoeScraper — dołączony 2026-05-21 (theporndude audit). Każda scena ma
# kompletny JSON-LD VideoObject: title + uploadDate + duration + named studio
# (producer/publisher) + named performers (actor[]) + thumbnail. Najbogatsze
# strukturalne metadane spośród browse scraperów — composite fuzzy match ma
# komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio +
# performer + date + duration nadrabiają.
PornDoeScraper,
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
# Follow-up: dorobić te tubey i sprawdzić phash distance: # Follow-up: dorobić te tubey i sprawdzić phash distance:
# - fullmovies.xxx (channel/network/pornstars/categories, brak duration) # - fullmovies.xxx (channel/network/pornstars/categories, brak duration)

View file

@ -163,11 +163,25 @@ class FreshpornoScraper(BaseBrowseScraper):
) )
] ]
# Release date — freshporno emituje `<meta itemprop="uploadDate" content="2026-05-20T...">`.
# To data wrzucenia na freshporno, NIE oryginalna release_date studio — ale dla
# świeżych scen (uploaded niedługo po publikacji) różnica ≤ 3-7 dni, mieści się w
# `date_window_days=7` w resolverze. Bez tego pola scene NULL → match score 0 →
# duplicate scene zamiast freshporno PS dodane do TPDB canonical (bug-report
# 2026-05-20: brak Brazzers Exxtra po 15-05).
release_date_parsed: date | None = None
if (m := re.search(r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', detail_html)):
try:
release_date_parsed = date.fromisoformat(m.group(1))
except ValueError:
pass
return RawScene( return RawScene(
external_id=f"{self.sitetag}:{scene_url}", external_id=f"{self.sitetag}:{scene_url}",
title=title, title=title,
description=description, description=description,
duration_sec=duration_sec, duration_sec=duration_sec,
release_date=release_date_parsed,
url=scene_url, url=scene_url,
studio=studio, studio=studio,
performers=performers, performers=performers,

View file

@ -0,0 +1,271 @@
"""porndoe.com — latest-vids browse scraper.

Added 2026-05-21 (theporndude audit). The only verified high-value candidate
out of the 172 tubes on theporndude.com/top-porn-tube-sites + /full-porn-movies-sites.

Why it is worth scraping: every scene carries a complete **JSON-LD VideoObject** schema:
- name (title), description, uploadDate (ISO timestamp), duration (ISO 8601)
- producer + publisher: named studio with a `/channel-profile/<slug>` URL
- actor[]: named performers with a `/pornstars-profile/<slug>` URL
- thumbnailUrl (CDN p.cdnc.porndoe.com)

That is enough for the resolver's composite fuzzy match (studio + performer Jaccard +
date proximity + title token-set + duration). The phash hit-rate is low (porndoe makes
its own 390x219 crop thumbnails and does not hot-link studio art), but the rich metadata
makes up for it, as with pornxp/porn00.

URL patterns:
- Listing: `/videos/most-recent?page=N` (page 1 = newest, ~31 scenes/page)
- Scene: `/watch/<id>` where id = `pd` + 10 alphanumerics (stable)
- Studio: `/channel-profile/<slug>`
- Performer: `/pornstars-profile/<slug>`
- Tags/categories: `/category/<id>/<slug>`

Playback: the stream URL is NOT inline in the SSR HTML; the player JS initialises only
after the user clicks "Play". We emit a playback_source with page_url + origin
`tube:porndoecom`; the extractor in `_REGISTRY` maps to `_vps_blocked_fallback.extract`,
so the mobile WebView INJECTED_JS scrapes `<video>.src` from the phone IP (0 VPS
bandwidth, in line with the pre-public bandwidth/anonymity priority).
"""
from __future__ import annotations
import json
import logging
import re
from datetime import date, datetime
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
log = logging.getLogger(__name__)
_BASE = "https://porndoe.com"
# Scene links on listing pages: `<a href="/watch/pd7a3o0e8v2b">`. Id = `pd` + alphanumerics.
_SCENE_URL_RE = re.compile(r'href="(/watch/[a-z0-9]+)"', re.IGNORECASE)
# Captures the bare watch id out of a scene URL.
_WATCH_ID_RE = re.compile(r"/watch/([a-z0-9]+)", re.IGNORECASE)
# JSON-LD <script> blocks; group 1 is the raw JSON payload.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)
# Tags/categories from the DOM (the JSON-LD genre is sometimes empty).
# porndoe URL: `/category/<id>/<slug>`; group 1 = slug, group 2 = link text.
_TAG_LINK_RE = re.compile(
    r'href="/category/\d+/([a-z0-9\-]+)"[^>]*>([^<]+)</a>', re.IGNORECASE
)
# ISO 8601 duration — porndoe emituje "PT8M0S" (czasem "T8M0S" bez P).
_ISO_DUR_RE = re.compile(
r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE
)
def _parse_iso_duration(value: str | None) -> int | None:
"""`PT11M7S` / `T8M0S` → sekundy. None gdy format nieznany."""
if not value:
return None
m = _ISO_DUR_RE.match(value.strip())
if not m:
return None
h = int(m.group(1) or 0)
mn = int(m.group(2) or 0)
s = int(m.group(3) or 0)
total = h * 3600 + mn * 60 + s
return total or None
def _parse_iso_date(value: str | None) -> date | None:
"""`2026-05-20T14:55:13+00:00` → date. None gdy parse fail."""
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
except ValueError:
# Fallback: pierwsze 10 znaków YYYY-MM-DD
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
if m:
try:
return date.fromisoformat(m.group(1))
except ValueError:
return None
return None
def _slug_from_url(url: str | None) -> str | None:
"""`https://porndoe.com/channel-profile/fantasy-girl-pass` → `fantasy-girl-pass`."""
if not url:
return None
m = re.search(r"/(?:channel-profile|pornstars-profile)/([a-z0-9\-]+)", url, re.IGNORECASE)
return m.group(1) if m else None
def _iter_jsonld_objects(data: object):
"""Spłaszcza JSON-LD: dict / list / @graph → strumień dict-ów."""
if isinstance(data, dict):
graph = data.get("@graph")
if isinstance(graph, list):
for item in graph:
yield from _iter_jsonld_objects(item)
else:
yield data
elif isinstance(data, list):
for item in data:
yield from _iter_jsonld_objects(item)
def _extract_video_object(html: str) -> dict | None:
    """Find the first JSON-LD ``VideoObject`` node in an HTML document.

    Every ``application/ld+json`` script block is decoded in document order;
    blocks that are empty or not valid JSON are skipped. A node qualifies when
    its ``@type`` is ``"VideoObject"`` or is an array containing it (JSON-LD
    allows several types on one node).

    Args:
        html: Full HTML of the scene page.

    Returns:
        The matching node dict, or ``None`` when the page has none.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for obj in _iter_jsonld_objects(data):
            node_type = obj.get("@type")
            if node_type == "VideoObject" or (
                isinstance(node_type, list) and "VideoObject" in node_type
            ):
                return obj
    return None
def _jsonld_text(value: object) -> str:
    """Return ``value`` stripped when it is a string, otherwise ``""``.

    JSON-LD members may legally be arrays or nested objects; treating those as
    "missing" keeps one odd page from crashing the whole scrape.
    """
    return value.strip() if isinstance(value, str) else ""


def _jsonld_first_url(value: object) -> str | None:
    """Return the first non-empty string from a JSON-LD URL member (string or array)."""
    candidates = value if isinstance(value, list) else [value]
    for candidate in candidates:
        text = _jsonld_text(candidate)
        if text:
            return text
    return None


def _entity_slug(url: object, name: str) -> str:
    """Slug for a studio/performer: taken from its profile URL, else derived from ``name``.

    The name-based slug is ASCII (`[a-z0-9-]`). When that comes out empty
    (e.g. a name written entirely in a non-Latin script) a Unicode-aware slug
    is used instead, so that such entities do not all share one empty slug.
    May still return ``""`` for names made of punctuation only.
    """
    from_url = _slug_from_url(url) if isinstance(url, str) else None
    if from_url:
        return from_url
    lowered = name.lower()
    return (
        re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
        or re.sub(r"[\W_]+", "-", lowered).strip("-")
    )


class PornDoeScraper(BaseBrowseScraper):
    """Browse scraper for the porndoe.com `/videos/most-recent` listing.

    Scene metadata is read from the JSON-LD ``VideoObject`` embedded in each
    `/watch/<id>` page; tags are read from category links in the page DOM.
    """

    sitetag = "porndoecom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; page <= 1 maps to the unpaginated URL."""
        if page <= 1:
            return f"{_BASE}/videos/most-recent"
        return f"{_BASE}/videos/most-recent?page={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from a listing page, de-duplicated in page order."""
        paths = (m.group(1) for m in _SCENE_URL_RE.finditer(listing_html))
        # dict preserves insertion order, so this is an order-stable de-dup.
        return list(dict.fromkeys(f"{_BASE}{path}" for path in paths))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene page.

        Args:
            scene_url: Absolute URL of the `/watch/<id>` page.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or ``None`` when the page has no JSON-LD
            ``VideoObject`` or the object has no usable title.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("porndoe: no JSON-LD VideoObject on %s", scene_url)
            return None

        title = _jsonld_text(video.get("name"))
        if not title:
            return None

        watch_id_m = _WATCH_ID_RE.search(scene_url)
        watch_id = watch_id_m.group(1) if watch_id_m else None

        description = _jsonld_text(video.get("description")) or None
        duration_sec = _parse_iso_duration(_jsonld_text(video.get("duration")) or None)
        release_date = _parse_iso_date(
            _jsonld_text(video.get("uploadDate"))
            or _jsonld_text(video.get("datePublished"))
            or None
        )
        # thumbnailUrl may be a single URL or an array of URLs; use the first one.
        thumbnail_url = _jsonld_first_url(video.get("thumbnailUrl"))

        # Studio: producer / publisher (Organization). Prefer producer.
        studio: RawStudio | None = None
        for key in ("producer", "publisher"):
            org = video.get(key)
            if not isinstance(org, dict):
                continue
            name = _jsonld_text(org.get("name"))
            slug = _entity_slug(org.get("url"), name) if name else ""
            if not slug:
                # No name, or nothing to build a stable external id from.
                continue
            studio = RawStudio(
                external_id=f"{self.sitetag}:channel:{slug}",
                name=name,
                slug=slug,
            )
            break

        # Performers: actor[] (a list of Person nodes, or a single Person).
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        actors = video.get("actor")
        if isinstance(actors, dict):
            actors = [actors]
        if isinstance(actors, list):
            for actor in actors:
                if not isinstance(actor, dict):
                    continue
                name = _jsonld_text(actor.get("name"))
                if not name:
                    continue
                slug = _entity_slug(actor.get("url"), name)
                if not slug or slug in seen_perf:
                    continue
                seen_perf.add(slug)
                performers.append(
                    RawPerformer(
                        external_id=f"{self.sitetag}:performer:{slug}",
                        name=name,
                    )
                )

        # Tags: from DOM category links (`/category/<id>/<slug>`, see _TAG_LINK_RE).
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Skip navigation links whose text is just the section heading.
            if not name or name.lower() in ("categories", "tags", ""):
                continue
            if slug in seen_tag or len(slug) > 60:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            )

        # Phash from the thumbnail (porndoe makes its own crop thumbnails, so a low
        # hit-rate is expected; graceful: no match -> resolver falls back to
        # composite scoring).
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Playback: page_url points at the scene page. The stream is JS-rendered, so
        # the `porndoecom` extractor is `_vps_blocked_fallback.extract` (mobile
        # WebView scrape).
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{watch_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -50,12 +50,13 @@ log = logging.getLogger(__name__)
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod # embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform). # tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = { _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# Custom (zoptymalizowane / niestandardowy player) # hqporner — CDN URLs IP-bound do VPS, force_proxy wymusza ruch przez VPS proxy.
# hqporner — CDN URL (bigcdn.cc, video.flyflv.com z `ip=` parametrem) IP-bound do # 2026-05-20 (pre-public): bandwidth + anonimowość VPS > UX. Switch na WebView
# requestera. VPS resolve daje 200 ale mobile direct = 404/403. Switch na WebView # fallback — mobile pobiera embed iframe z phone IP, FluidPlayer JS decoduje
# fallback: mobile pobiera embed iframe (mydaddy.cc/hqwo.cc) z phone IP, FluidPlayer # mp4, ExoPlayer odtwarza direct z phone CDN session. **0 VPS bandwidth + VPS
# JS decoduje mp4 URL z mobile session. Plus INJECTED_JS skanuje `<source>.src`. # IP nie ujawniony** (mobile nie łączy się z VPS proxy URL).
# ~32k scen (drugi po porntrex największy single saving). Verified 2026-05-18. # Trade-off: WebView ma 1 extra step (page → player JS) ale bez popup-ads jak
# hqporner.com bo INJECTED_JS w PlayerScreen.tsx blokuje + scrape `<source>.src`.
"hqpornercom": _vps_blocked_fallback.extract, "hqpornercom": _vps_blocked_fallback.extract,
"epornercom": eporner.extract, "epornercom": eporner.extract,
"sxyprncom": sxyprn.extract, "sxyprncom": sxyprn.extract,
@ -94,13 +95,12 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy # trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
# follow_redirects=True wymagane (fix w stream_proxy.py). # follow_redirects=True wymagane (fix w stream_proxy.py).
"pornhatcom": pornhat.extract, "pornhatcom": pornhat.extract,
# Freshporno KVS — `cv=` HMAC signed token IP-bound. Server-side resolve dało # Freshporno KVS — `cv=` HMAC signed token IP-bound do VPS. 2026-05-20 pre-public:
# 200 z VPS, ale laptop dostał 302+SSL error → token validate'uje requester IP. # bandwidth + VPS anonimowość priorytet. WebView fallback → mobile pobiera embed
# Switch na WebView fallback: mobile pobiera embed page, KVS player decoduje # z phone IP, KVS player JS decoduje video_url, ExoPlayer odtwarza direct z CDN.
# video_url w-page, ExoPlayer dostaje URL z phone session. ~15k scen.
"freshpornoorg": _vps_blocked_fallback.extract, "freshpornoorg": _vps_blocked_fallback.extract,
# porn00 / pornxp — force_proxy=True wprost (IP-bound CDN). Switch na WebView # porn00 / pornxp — IP-bound CDN tokens. Pre-public WebView fallback (bandwidth +
# fallback. Niski volume (84 scen), trivial saving ale konsystencja flow. # anonimowość VPS). Niski volume (84 scen), trivial.
"porn00org": _vps_blocked_fallback.extract, "porn00org": _vps_blocked_fallback.extract,
"pornxpph": _vps_blocked_fallback.extract, "pornxpph": _vps_blocked_fallback.extract,
# Direct-scraping tubes (mają też search scraper w connectors/direct_scrapers/) # Direct-scraping tubes (mają też search scraper w connectors/direct_scrapers/)
@ -114,6 +114,11 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
"perverzijacom": _embed_iframe.extract, "perverzijacom": _embed_iframe.extract,
# Special: WebView-only (Yii2 session-bound player). # Special: WebView-only (Yii2 session-bound player).
"paradisehillcc": paradisehill.extract, "paradisehillcc": paradisehill.extract,
# PornDoe — dołączony 2026-05-21 (theporndude audit). Stream URL nie inline w
# SSR HTML (player JS init po Play click), więc WebView fallback: mobile pobiera
# /watch/<id> z phone IP, player JS dekoduje video.src, INJECTED_JS scrape.
# 0 VPS bandwidth — zgodne z pre-public bandwidth/anonimowość priorytet.
"porndoecom": _vps_blocked_fallback.extract,
} }

View file

@ -45,7 +45,15 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
if basename in seen_keys: if basename in seen_keys:
continue continue
seen_keys.add(basename) seen_keys.add(basename)
result.append(StreamSource(link=url, type="mp4", quality=quality)) # `force_proxy=True` (2026-05-20): freshporno get_file 302 → cdn4.freshporno.org
# IP-bound (cv= HMAC token). Mobile direct = 403/SSL fail → fallback proxy
# generuje "mrugnięcie" (user bug 743eefbf "najpierw strona potem video").
# Force_proxy wymusza mobile użycie proxied URL od razu — bez flickera +
# natywny ExoPlayer + quality picker zachowane.
result.append(StreamSource(
link=url, type="mp4", quality=quality,
raw={"force_proxy": True},
))
if not result: if not result:
log.info("freshporno: no MP4 anchor matches on %s", page_url) log.info("freshporno: no MP4 anchor matches on %s", page_url)

View file

@ -95,7 +95,15 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
continue continue
seen_urls.add(url) seen_urls.add(url)
title = (sm.group(2) or "").strip() title = (sm.group(2) or "").strip()
sources.append(StreamSource(link=url, quality=title or None, type="mp4", referer=iframe_referer)) # `force_proxy=True` (2026-05-20): CDN-y bigcdn.cc/flyflv IP-bound + flyflv ma
# `ip=46.62.219.154` w URL path. Mobile direct = 404/403 → fallback proxy
# generuje flicker. Force_proxy wymusza mobile użycie proxied od razu.
# Bug-report e8ddd8d4: "kliknięcie otwiera reklamę" gdy _vps_blocked_fallback
# (hqporner page ads). Force_proxy + native mp4 = quality picker + natywny.
sources.append(StreamSource(
link=url, quality=title or None, type="mp4", referer=iframe_referer,
raw={"force_proxy": True},
))
if sources: if sources:
return sources return sources

View file

@ -111,7 +111,7 @@ def version() -> dict[str, str | None]:
# mobile sklei z baseUrl. # mobile sklei z baseUrl.
public_url = os.environ.get("BACKEND_PUBLIC_URL", "").rstrip("/") public_url = os.environ.get("BACKEND_PUBLIC_URL", "").rstrip("/")
apk_url = f"{public_url}/static/app-release.apk" if public_url else "/static/app-release.apk" apk_url = f"{public_url}/static/app-release.apk" if public_url else "/static/app-release.apk"
return {"version": "0.1.8", "apk_url": apk_url} return {"version": "0.1.9", "apk_url": apk_url}
@app.get("/readyz") @app.get("/readyz")

View file

@ -123,14 +123,38 @@ def resolve_scene(
result = find_by_phash_within(session, phash=value) result = find_by_phash_within(session, phash=value)
if result is not None: if result is not None:
scene_match, distance = result scene_match, distance = result
score = 1.0 - distance / 64.0 raw_phash_score = 1.0 - distance / 64.0
# Duration sanity check: phash może collide gdy compilation zawiera chapter sceny # Duration sanity check: phash może collide gdy compilation zawiera chapter sceny
# (oba mają ten sam frame sample), ale duration będzie wyraźnie inny. # (oba mają ten sam frame sample), ale duration będzie wyraźnie inny.
# Wymagamy proximity ≥0.5 (±30s) dla auto-merge; inaczej → review queue. # Wymagamy proximity ≥0.5 (±30s) dla auto-merge; inaczej → review queue.
from app.resolve.scoring import duration_proximity from app.resolve.scoring import duration_proximity, series_mismatch_strength
dur_prox = duration_proximity(scene_match.duration_sec, norm.duration_sec) dur_prox = duration_proximity(scene_match.duration_sec, norm.duration_sec)
# Series-position guard (Episode 2 vs Episode 4): phash zwykle pixel-identical
# bo studio reusuje cover art między episodami, ale to OSOBNE sceny. Hard split,
# bez merge_candidate (nie ma czego mergować — żaden human reviewer też nie
# powie "Episode 2 to to samo co Episode 4").
sp_strength = series_mismatch_strength(
scene_match.title_normalized, norm.title_normalized
)
if sp_strength >= 1.0:
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
return SceneResolveResult(
scene=new_scene,
was_created=True,
path="fp_phash_series_split",
score=0.0,
)
if dur_prox is not None and dur_prox < 0.5: if dur_prox is not None and dur_prox < 0.5:
# phash match ale duration rozjeżdża się → tworzymy nową scenę + review. # phash match ale duration rozjeżdża się → tworzymy nową scenę + review.
# Score reflectuje że to NIE jest auto-merge: dur_prox * phash_score,
# plus dalej cap przez series modifier mismatch (BTS/bonus/unedited).
penalised_score = raw_phash_score * max(dur_prox, 0.1)
if 0.0 < sp_strength < 1.0:
penalised_score = min(penalised_score, 1.0 - sp_strength)
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id) new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm) _attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id) _sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
@ -139,11 +163,14 @@ def resolve_scene(
kind=MergeKind.scene, kind=MergeKind.scene,
left_id=scene_match.id, left_id=scene_match.id,
right_id=new_scene.id, right_id=new_scene.id,
score=score, score=penalised_score,
reasons={ reasons={
"path": "fp_phash", "path": "fp_phash",
"hamming": distance, "hamming": distance,
"phash_score": raw_phash_score,
"duration_mismatch": True, "duration_mismatch": True,
"dur_prox": dur_prox,
"series_mismatch_strength": sp_strength,
"left_dur": scene_match.duration_sec, "left_dur": scene_match.duration_sec,
"right_dur": norm.duration_sec, "right_dur": norm.duration_sec,
}, },
@ -154,9 +181,42 @@ def resolve_scene(
scene=new_scene, scene=new_scene,
was_created=True, was_created=True,
path="fp_phash_review", path="fp_phash_review",
score=score, score=penalised_score,
candidate_id=scene_match.id, candidate_id=scene_match.id,
) )
# Modifier tag mismatch (BTS/bonus/unedited po jednej stronie) — nie hard-split,
# ale auto-merge zablokowane: tworzymy nową scenę + pending review.
if 0.0 < sp_strength < 1.0:
penalised_score = min(raw_phash_score, 1.0 - sp_strength)
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
session.add(
MergeCandidate(
kind=MergeKind.scene,
left_id=scene_match.id,
right_id=new_scene.id,
score=penalised_score,
reasons={
"path": "fp_phash",
"hamming": distance,
"phash_score": raw_phash_score,
"series_modifier_mismatch": True,
"series_mismatch_strength": sp_strength,
},
status=MergeStatus.pending,
)
)
return SceneResolveResult(
scene=new_scene,
was_created=True,
path="fp_phash_modifier_review",
score=penalised_score,
candidate_id=scene_match.id,
)
score = raw_phash_score
_update_scene_fields(scene_match, norm, studio_id=studio_id, source_kind=source_kind, session=session) _update_scene_fields(scene_match, norm, studio_id=studio_id, source_kind=source_kind, session=session)
_attach_external_ref(session, scene_id=scene_match.id, source_id=source_id, norm=norm) _attach_external_ref(session, scene_id=scene_match.id, source_id=source_id, norm=norm)
_sync_attached_entities(session, scene=scene_match, norm=norm, source_id=source_id) _sync_attached_entities(session, scene=scene_match, norm=norm, source_id=source_id)
@ -215,14 +275,24 @@ def resolve_scene(
# które mają wspólny choć jeden performer z naszą sceną (mocny sygnał — performerzy # które mają wspólny choć jeden performer z naszą sceną (mocny sygnał — performerzy
# to też nasz "blocking key" gdy studio i date są nieinformatywne). # to też nasz "blocking key" gdy studio i date są nieinformatywne).
if aggregator_mode and performer_ids: if aggregator_mode and performer_ids:
from sqlalchemy import distinct # **2026-05-20 fix**: poprzednio LIMIT 50 BEZ ORDER BY → dla popular performera
# (Eveline Dellai z 200+ scen w bazie) prawdziwy match mógł być out of top-50,
# postgres zwracał arbitrary order → resolver nie widział kandydata → duplicate.
# Bug-report: brak Brazzers Exxtra po 15-05. Now: 500 limit + title-match priority
# ORDER, plus exact title match jako gwarantowany kandydat (CASE expression).
from sqlalchemy import case
title_match_expr = case(
(Scene.title_normalized == norm.title_normalized, 1),
else_=0,
)
more = ( more = (
session.execute( session.execute(
select(Scene) select(Scene)
.join(ScenePerformer, ScenePerformer.scene_id == Scene.id) .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
.where(ScenePerformer.performer_id.in_(performer_ids)) .where(ScenePerformer.performer_id.in_(performer_ids))
.group_by(Scene.id) .group_by(Scene.id)
.limit(50) .order_by(title_match_expr.desc(), Scene.release_date.desc().nullslast())
.limit(500)
) )
.scalars() .scalars()
.all() .all()

View file

@ -16,6 +16,7 @@ from app.resolve.scoring import (
duration_proximity, duration_proximity,
performer_set_similarity, performer_set_similarity,
phash_similarity, phash_similarity,
series_mismatch_strength,
title_similarity, title_similarity,
) )
@ -49,6 +50,10 @@ def score_candidate(
else: else:
studio_match = candidate.studio_id == studio_id studio_match = candidate.studio_id == studio_id
series_mismatch = series_mismatch_strength(
candidate.title_normalized, norm.title_normalized
)
composite, reasons = composite_score( composite, reasons = composite_score(
fp=fp, fp=fp,
title=title, title=title,
@ -57,6 +62,7 @@ def score_candidate(
duration_score=duration_score, duration_score=duration_score,
studio_match=studio_match, studio_match=studio_match,
aggregator_mode=aggregator_mode, aggregator_mode=aggregator_mode,
series_mismatch=series_mismatch,
) )
breakdown = ScoreBreakdown( breakdown = ScoreBreakdown(

View file

@ -16,6 +16,7 @@ TPDB ma "Brazzers Exxtra" a StashDB "Brazzers" jako studio sceny).
from __future__ import annotations from __future__ import annotations
import math import math
import re
import uuid import uuid
from collections.abc import Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
@ -105,6 +106,91 @@ def date_proximity(left: date | None, right: date | None, *, window_days: int =
return 1.0 - delta / window_days return 1.0 - delta / window_days
# Wyłapuje "Episode 4" / "Ep 4" / "Part 2" / "Pt. 3" / "Vol 7" / "Volume 12" /
# "Scene 5" / "Chapter 9" / "Ch.3" / "#7" / "S9:E8" / "S9E8" — wszystko po
# normalizacji (lower-cased, punkt usunięty zwykle, ale tolerujemy \\.).
# `(?<!\d)` + `(?!\d)` zapobiega wyłapaniu fragmentu cyfry z dłuższego ciągu —
# np. "scene from 2020" nie wygeneruje fałszywego pos=0 z boundary-end-of-2020.
_SERIES_NUM_RE = re.compile(
r"\b(?:episode|ep|part|pt|vol|volume|chapter|ch|scene|series)\b\s*\.?\s*#?\s*(?<!\d)(\d{1,3})(?!\d)"
r"|(?<!\w)#\s*(?<!\d)(\d{1,3})(?!\d)"
r"|\bs(?<!\d)(\d{1,2})(?!\d)\s*[:e]\s*e?(?<!\d)(\d{1,3})(?!\d)",
re.IGNORECASE,
)
# Tagi które wprost mówią że scena to wariant osobny (BTS / bonus / unedited /
# trailer). Jeśli tag jest TYLKO po jednej stronie, to NIE jest ta sama scena.
_MODIFIER_TAGS: tuple[str, ...] = (
"behind the scenes",
"behind-the-scenes",
"bts",
"bonus",
"unedited",
"uncut",
"extended",
"directors cut",
"director's cut",
"trailer",
"preview",
"teaser",
"compilation",
)
def detect_series_positions(title_normalized: str | None) -> set[int]:
"""Zwraca wszystkie pozycje (Episode/Part/Vol/Scene/Chapter/# itp.) znalezione w tytule.
Tytuł powinien być znormalizowany (lowercase, unaccent), ale regex jest case-insensitive
i tolerancyjny chodzi tylko o sygnał, nie o robust parsing.
"""
if not title_normalized:
return set()
out: set[int] = set()
for m in _SERIES_NUM_RE.finditer(title_normalized):
for g in m.groups():
if g and g.isdigit():
out.add(int(g))
return out
def detect_modifier_tags(title_normalized: str | None) -> set[str]:
"""Zwraca set modifier tagów wykrytych w tytule (bts/bonus/unedited/itp.)."""
if not title_normalized:
return set()
lower = title_normalized.lower()
return {t for t in _MODIFIER_TAGS if t in lower}
def series_mismatch_strength(
title_a_normalized: str | None,
title_b_normalized: str | None,
) -> float:
"""Wykrywa rozjazd "wariantu sceny" między tytułami.
Zwraca strength w [0.0, 1.0]:
0.0 brak sygnału mismatchu (tytuły kompatybilne).
0.5 modifier tags po obu stronach ale RÓŻNE (BTS vs trailer).
0.7 modifier tag po jednej stronie tylko (BTS vs regular).
1.0 series position mismatch (Episode 2 vs Episode 4 twardy reject).
"""
pos_a = detect_series_positions(title_a_normalized)
pos_b = detect_series_positions(title_b_normalized)
# Hard mismatch gdy oba mają jakieś pozycje i symmetric difference jest niepusty
# — przykład: "Vol 140 Scene 3" vs "Vol 140 Scene 4" mają wspólne 140 ale różne 3/4,
# to są osobne sceny ze wspólnej kompilacji. Asymetryczny brak (jedna strona ma
# pozycję a druga nie) nie liczy się jako mismatch — tube SEO często gubi numer.
if pos_a and pos_b and (pos_a ^ pos_b):
return 1.0
mod_a = detect_modifier_tags(title_a_normalized)
mod_b = detect_modifier_tags(title_b_normalized)
if (not mod_a) != (not mod_b):
return 0.7
if mod_a and mod_b and not (mod_a & mod_b):
return 0.5
return 0.0
def duration_proximity( def duration_proximity(
left: int | None, right: int | None, *, window_sec: int = 60 left: int | None, right: int | None, *, window_sec: int = 60
) -> float | None: ) -> float | None:
@ -145,6 +231,7 @@ def composite_score(
duration_score: float | None = None, duration_score: float | None = None,
studio_match: bool | None, studio_match: bool | None,
aggregator_mode: bool = False, aggregator_mode: bool = False,
series_mismatch: float | None = None,
) -> tuple[float, dict]: ) -> tuple[float, dict]:
"""Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons. """Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons.
@ -153,9 +240,17 @@ def composite_score(
- aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów, - aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów,
więc studio z naszej perspektywy nie jest informatywny pomijamy hard reject więc studio z naszej perspektywy nie jest informatywny pomijamy hard reject
i zwiększamy wagę performers). i zwiększamy wagę performers).
`series_mismatch` (0.0): wartość z `series_mismatch_strength()` gdy 1.0 (Episode 2
vs Episode 4), wymusza twardy reject niezależnie od pozostałych sygnałów; gdy 0.5-0.7
(modifier mismatch: BTS/bonus/unedited po jednej stronie), nakłada cap = `1 - strength`.
""" """
reasons: dict = {} reasons: dict = {}
if series_mismatch is not None and series_mismatch >= 1.0:
reasons["series_position_mismatch"] = True
return 0.0, reasons
if studio_match is False: if studio_match is False:
if fp is not None and fp >= 0.95: if fp is not None and fp >= 0.95:
reasons["studio_mismatch_overridden_by_fp"] = True reasons["studio_mismatch_overridden_by_fp"] = True
@ -257,6 +352,16 @@ def composite_score(
reasons["duration_perf_strong_match_bump"] = True reasons["duration_perf_strong_match_bump"] = True
score = max(score, 0.92) score = max(score, 0.92)
# Series-modifier cap: jedna ze stron ma "BTS"/"bonus"/"unedited" a druga nie,
# albo różne tagi. Twardy mismatch (różne pozycje numeryczne) został już złapany
# wcześniej (return 0.0). Tu zostają miękkie sygnały — cap żeby nigdy nie auto-merge.
if series_mismatch is not None and 0.0 < series_mismatch < 1.0:
cap = max(0.0, 1.0 - series_mismatch)
if score > cap:
reasons["series_modifier_cap"] = cap
reasons["series_mismatch_strength"] = series_mismatch
score = cap
return _clamp(score), reasons return _clamp(score), reasons

View file

@ -35,6 +35,7 @@ from app.resolve.scoring import (
hamming_distance_hex, hamming_distance_hex,
performer_set_similarity, performer_set_similarity,
phash_similarity, phash_similarity,
series_mismatch_strength,
title_similarity, title_similarity,
triage, triage,
) )
@ -121,6 +122,8 @@ def score_scene_pair(session: Session, a: Scene, b: Scene) -> ScoreBreakdown:
else: else:
studio_match = a.studio_id == b.studio_id studio_match = a.studio_id == b.studio_id
series_mismatch = series_mismatch_strength(a.title_normalized, b.title_normalized)
# Bulk dedup nie jest aggregator — porównujemy dwie kanoniczne sceny, studio # Bulk dedup nie jest aggregator — porównujemy dwie kanoniczne sceny, studio
# to prawdziwe studio. Aggregator mode tylko w resolverze przy ingest z tube'a. # to prawdziwe studio. Aggregator mode tylko w resolverze przy ingest z tube'a.
composite, reasons = composite_score( composite, reasons = composite_score(
@ -131,6 +134,7 @@ def score_scene_pair(session: Session, a: Scene, b: Scene) -> ScoreBreakdown:
duration_score=duration_score, duration_score=duration_score,
studio_match=studio_match, studio_match=studio_match,
aggregator_mode=False, aggregator_mode=False,
series_mismatch=series_mismatch,
) )
return ScoreBreakdown( return ScoreBreakdown(

View file

@ -92,6 +92,29 @@ def _job_movie_ingest() -> None:
log.exception("[scheduler] movie ingest %s failed", name) log.exception("[scheduler] movie ingest %s failed", name)
def _job_bulk_dedup_performers() -> None:
    """Scheduled job: run bulk dedup with the "performers" strategy (not a dry run).

    Acts as a safety net for duplicates that resolver-time scoring did not catch.

    Background, per the original bug report (2026-05-20, "no Brazzers Exxtra
    after 15-05"): a freshporno scrape made before the release_date fix created
    duplicate scenes instead of merging playback sources into the canonical
    TPDB scenes. With release_date the resolver score was above the auto-merge
    level, but without it the weights shifted and pairs fell into review/new.

    According to that note, the performers strategy iterates per performer and
    scores all of that performer's scenes pair-wise, auto-merging at score
    >= 0.92 and creating a pending merge candidate for 0.75-0.92. Those
    thresholds live in `run_bulk_dedup`, not here.

    Any exception, including a failing import of the dedup module, is logged
    and swallowed so the failure does not propagate out of the job.
    """
    log.info("[scheduler] bulk_dedup performers starting")
    try:
        # Imported lazily, inside the guarded region, so an import error is
        # reported through the same log.exception path as a runtime failure.
        from app.scheduler.bulk_dedup import run_bulk_dedup

        summary = run_bulk_dedup(strategy="performers", dry_run=False)
        log.info("[scheduler] bulk_dedup performers done: %s", summary)
    except Exception:
        log.exception("[scheduler] bulk_dedup performers failed")
def _job_performer_continuous(refresh_after_days: int) -> None: def _job_performer_continuous(refresh_after_days: int) -> None:
"""Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST. """Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST.
@ -174,6 +197,17 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
cfg["browse_latest_hours"], max_pages, cfg["browse_latest_hours"], max_pages,
) )
if cfg.get("bulk_dedup_hours"):
sched.add_job(
_job_bulk_dedup_performers,
IntervalTrigger(hours=cfg["bulk_dedup_hours"]),
id="bulk_dedup_performers",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: bulk-dedup performers every %dh", cfg["bulk_dedup_hours"])
if cfg.get("movie_ingest_hours"): if cfg.get("movie_ingest_hours"):
sched.add_job( sched.add_job(
_job_movie_ingest, _job_movie_ingest,

View file

@ -37,7 +37,8 @@ from app.ingest import (
) )
from app.models.ingest_run import IngestRun, IngestStatus from app.models.ingest_run import IngestRun, IngestStatus
from app.models.performer import Performer, PerformerExternalRef from app.models.performer import Performer, PerformerExternalRef
from app.models.scene import ScenePerformer from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, ScenePerformer
from app.models.source import Source, SourceKind from app.models.source import Source, SourceKind
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -463,14 +464,19 @@ def _claim_next_for_search(
"""Wybiera 1 performera z queue + UPDATE last_searched_at = now() w jednej """Wybiera 1 performera z queue + UPDATE last_searched_at = now() w jednej
transakcji (skip locked safe pod konkurencyjnym workerze). transakcji (skip locked safe pod konkurencyjnym workerze).
Queue: Queue priority (2026-05-20 update orphan-rescue bias):
1. Performerzy NIGDY niesearchowani (last_searched_at IS NULL) 1. **Performerzy z RECENT scenes-without-playback** (last 7d, no live PS)
2. Performerzy searchowani > `refresh_after` temu najpilniejsi bo user widzi puste studio listings dla najnowszych scen.
3. Filtruj scene_count >= min_scene_count (eliminuje noise/false performerów) Bug-report 2026-05-20: "brak Brazzers Exxtra po 15-05" wszystkie nowe
4. Order: NULLS FIRST, potem najstarsze last_searched_at TPDB sceny mają canonical metadata ale 0 playback bo continuous queue
nigdy ich nie dotyka (78k performers, 67k NULL ~232 dni sweep).
2. Performerzy NIGDY niesearchowani (`last_searched_at IS NULL`)
3. Performerzy searchowani > `refresh_after` temu
4. Filtruj scene_count >= min_scene_count
""" """
cutoff = datetime.now(UTC) - refresh_after cutoff = datetime.now(UTC) - refresh_after
# Subquery scene_count orphan_cutoff = datetime.now(UTC) - timedelta(days=7)
sc_sub = ( sc_sub = (
select( select(
ScenePerformer.performer_id.label("pid"), ScenePerformer.performer_id.label("pid"),
@ -480,19 +486,41 @@ def _claim_next_for_search(
.subquery() .subquery()
) )
# NOTE: nie używamy FOR UPDATE bo PostgreSQL nie pozwala na to z GROUP BY # Orphan-scene count per performer: scenes z release_date w ostatnich 7d
# subquery (scene_count agg). APScheduler max_instances=1 gwarantuje że tylko # AND brak żywego playback source. Wysoki count = performer-z-rekordów-pustych.
# jeden tick runa się na raz, więc race nie jest realny. orphan_sub = (
select(
ScenePerformer.performer_id.label("pid"),
func.count(ScenePerformer.scene_id).label("orphan_count"),
)
.join(Scene, Scene.id == ScenePerformer.scene_id)
.where(Scene.release_date > orphan_cutoff)
.where(
~select(PlaybackSource.id)
.where(PlaybackSource.scene_id == Scene.id)
.where(PlaybackSource.dead_at.is_(None))
.exists()
)
.group_by(ScenePerformer.performer_id)
.subquery()
)
row = session.execute( row = session.execute(
select(Performer) select(Performer)
.join(sc_sub, sc_sub.c.pid == Performer.id, isouter=False) .join(sc_sub, sc_sub.c.pid == Performer.id, isouter=False)
.join(orphan_sub, orphan_sub.c.pid == Performer.id, isouter=True)
.where(sc_sub.c.scene_count >= min_scene_count) .where(sc_sub.c.scene_count >= min_scene_count)
.where( .where(
(Performer.last_searched_at.is_(None)) (Performer.last_searched_at.is_(None))
| (Performer.last_searched_at < cutoff) | (Performer.last_searched_at < cutoff)
) )
.order_by( .order_by(
# 1. Orphan scenes (last 7d, no playback) FIRST — desc count.
# COALESCE 0 sprawia że performerzy bez orphan idą za tymi z.
func.coalesce(orphan_sub.c.orphan_count, 0).desc(),
# 2. NULL last_searched_at next
Performer.last_searched_at.asc().nullsfirst(), Performer.last_searched_at.asc().nullsfirst(),
# 3. Highest scene_count (popular performers earlier)
sc_sub.c.scene_count.desc(), sc_sub.c.scene_count.desc(),
) )
.limit(1) .limit(1)

View file

@ -161,6 +161,9 @@ def run_forever() -> int:
# to bulk import jednorazowy). Bug-report 93d3c485 (2026-05-19) "brak freshporno". # to bulk import jednorazowy). Bug-report 93d3c485 (2026-05-19) "brak freshporno".
"browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 24) or None, "browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 24) or None,
"browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5), "browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5),
# Bulk-dedup performers — safety net dla duplikatów które resolver
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
"bulk_dedup_hours": getattr(settings, "sched_bulk_dedup_hours", 12) or None,
} }
sched = build_scheduler(cfg) sched = build_scheduler(cfg)
log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs())) log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))

View file

@ -83,18 +83,14 @@
<p class="text-lg md:text-xl text-gray-400 max-w-2xl leading-relaxed mb-10"> <p class="text-lg md:text-xl text-gray-400 max-w-2xl leading-relaxed mb-10">
Goon indexes scene metadata from TPDB &amp; StashDB, deduplicates across Goon indexes scene metadata from TPDB &amp; StashDB, deduplicates across
30+ public tubes, and serves a fast mobile client. Zero ads. Zero tracking. 30+ public tubes, and serves a fast mobile client. Zero ads. Zero tracking.
Your data stays on your VPS. Download, open, browse — no account, no setup.
</p> </p>
<div class="flex flex-wrap gap-3"> <div class="flex flex-wrap gap-3">
<a href="https://github.com/REPLACE_PERSONA/goon/releases/latest" <a href="/goon-v0.1.9.apk"
class="px-6 py-4 rounded-xl bg-accent text-white font-bold hover:bg-accentDeep transition glow"> class="px-6 py-4 rounded-xl bg-accent text-white font-bold hover:bg-accentDeep transition glow">
Download APK Download APK
</a> </a>
<a href="https://github.com/REPLACE_PERSONA/goon"
class="px-6 py-4 rounded-xl bg-bgElevated border border-border text-gray-200 font-semibold hover:border-accent transition">
View source on GitHub
</a>
<a href="#donate" <a href="#donate"
class="px-6 py-4 rounded-xl bg-transparent border border-border text-gray-400 font-semibold hover:text-accent hover:border-accent transition"> class="px-6 py-4 rounded-xl bg-transparent border border-border text-gray-400 font-semibold hover:text-accent hover:border-accent transition">
♥ Support project ♥ Support project
@ -102,7 +98,7 @@
</div> </div>
<p class="text-xs text-gray-500 mt-6 font-mono"> <p class="text-xs text-gray-500 mt-6 font-mono">
Android only · self-hosted backend required · 18+ Android 7+ · no setup, no login · 18+
</p> </p>
</div> </div>
</header> </header>
@ -165,11 +161,10 @@
<div class="card-hover bg-card border border-border rounded-2xl p-6"> <div class="card-hover bg-card border border-border rounded-2xl p-6">
<div class="text-accent text-2xl mb-3"></div> <div class="text-accent text-2xl mb-3"></div>
<h3 class="text-lg font-bold mb-2">100% self-hosted</h3> <h3 class="text-lg font-bold mb-2">Works out of the box</h3>
<p class="text-sm text-gray-400 leading-relaxed"> <p class="text-sm text-gray-400 leading-relaxed">
One <code class="text-accent font-mono text-xs">docker compose up -d</code> Download the APK and it connects automatically — no account, no
and you own the API, the DB, the worker. No SaaS dependencies. config. Power users can point it at their own self-hosted backend.
Your search history is yours.
</p> </p>
</div> </div>
</div> </div>
@ -200,26 +195,33 @@
</div> </div>
</section> </section>
<!-- QUICK START --> <!-- GET STARTED -->
<section class="px-6 py-20 bg-bgElevated/30 border-y border-border"> <section class="px-6 py-20 bg-bgElevated/30 border-y border-border">
<div class="max-w-3xl mx-auto"> <div class="max-w-3xl mx-auto">
<h2 class="text-3xl font-extrabold mb-3 tracking-tight">Quick start</h2> <h2 class="text-3xl font-extrabold mb-3 tracking-tight">Get started</h2>
<p class="text-gray-500 mb-8">5 commands. Backend runs in 30 seconds on any Docker host.</p> <p class="text-gray-500 mb-8">Three steps. No account, no server, no config.</p>
<div class="bg-card border border-border rounded-2xl p-6 font-mono text-sm leading-relaxed"> <ol class="space-y-4">
<div class="text-gray-500">$ <span class="text-gray-300">git clone https://github.com/REPLACE_PERSONA/goon.git</span></div> <li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
<div class="text-gray-500">$ <span class="text-gray-300">cd goon &amp;&amp; cp .env.example .env</span></div> <span class="text-accent font-extrabold text-xl">1</span>
<div class="text-gray-500">$ <span class="text-gray-300"># edit .env: set TPDB_API_TOKEN, STASHDB_API_KEY, API_KEYS</span></div> <span class="text-sm text-gray-300 leading-relaxed">
<div class="text-gray-500">$ <span class="text-gray-300">docker compose up -d</span></div> <a href="/goon-v0.1.9.apk" class="text-accent font-bold hover:underline">Download the APK</a>
<div class="text-gray-500">$ <span class="text-gray-300">curl localhost:8000/health</span></div> and open it. Allow "install from unknown sources" for your browser if Android asks.
<div class="text-good text-xs mt-3">{"status":"ok"}</div> </span>
</div> </li>
<li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
<p class="text-sm text-gray-500 mt-6"> <span class="text-accent font-extrabold text-xl">2</span>
Then download the APK above, point it at your backend, paste an API key. <span class="text-sm text-gray-300 leading-relaxed">
Full docs in the Open the app, accept the 18+ gate. It connects automatically — no login.
<a href="https://github.com/REPLACE_PERSONA/goon#readme" class="text-accent hover:underline">README</a>. </span>
</p> </li>
<li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
<span class="text-accent font-extrabold text-xl">3</span>
<span class="text-sm text-gray-300 leading-relaxed">
Browse. That's it.
</span>
</li>
</ol>
</div> </div>
</section> </section>
@ -259,10 +261,7 @@
</div> </div>
<p class="text-xs text-gray-500 mt-6"> <p class="text-xs text-gray-500 mt-6">
Addresses are hard-coded in Addresses + QR codes are shown in the app under Scenes &raquo; ♥.
<code class="font-mono text-accent">mobile/src/lib/donate.ts</code>
and shown in the app under Scenes &raquo; ♥. Always verify on-screen
against the repo before sending.
</p> </p>
</div> </div>
</section> </section>
@ -275,19 +274,18 @@
<div class="w-2 h-2 rounded-full bg-accent"></div> <div class="w-2 h-2 rounded-full bg-accent"></div>
<span class="font-bold tracking-widest uppercase">goon</span> <span class="font-bold tracking-widest uppercase">goon</span>
</div> </div>
<p>Self-hosted adult content metadata aggregator.</p> <p>Adult content metadata aggregator. FOSS, ad-free.</p>
<p>MIT license. No warranty. 18+ jurisdictions only.</p> <p>MIT license. No warranty. 18+ jurisdictions only.</p>
</div> </div>
<div class="flex flex-col gap-1 text-right"> <div class="flex flex-col gap-1 text-right">
<a href="https://github.com/REPLACE_PERSONA/goon" class="hover:text-accent transition">GitHub</a> <a href="/goon-v0.1.9.apk" class="hover:text-accent transition">Download APK</a>
<a href="https://github.com/REPLACE_PERSONA/goon/releases" class="hover:text-accent transition">Releases</a> <a href="#donate" class="hover:text-accent transition">Support</a>
<a href="https://github.com/REPLACE_PERSONA/goon#readme" class="hover:text-accent transition">Docs</a>
</div> </div>
</div> </div>
<p class="max-w-5xl mx-auto mt-6 text-[10px] text-gray-600 leading-relaxed"> <p class="max-w-5xl mx-auto mt-6 text-[10px] text-gray-600 leading-relaxed">
Goon does not host, transcode, store, or distribute any media. It scrapes Goon does not host, transcode, store, or distribute any media. It scrapes
publicly-available metadata and links out to the source. Operators are publicly-available metadata and links out to the source. Users are
responsible for complying with local law. See README &raquo; Disclaimer. responsible for complying with local law.
</p> </p>
</footer> </footer>

View file

@ -24,6 +24,7 @@ import { ClientProvider } from './src/ClientContext';
import { ErrorBoundary } from './src/ErrorBoundary'; import { ErrorBoundary } from './src/ErrorBoundary';
import { isAccepted as isAgeGateAccepted } from './src/lib/agegate'; import { isAccepted as isAgeGateAccepted } from './src/lib/agegate';
import { APP_VERSION } from './src/lib/appVersion'; import { APP_VERSION } from './src/lib/appVersion';
import { DEFAULT_API_KEY, DEFAULT_BACKEND_URL } from './src/lib/backend';
import { getSettings as getLockSettings } from './src/lib/applock'; import { getSettings as getLockSettings } from './src/lib/applock';
import { AppNavigator } from './src/navigation'; import { AppNavigator } from './src/navigation';
import { AgeGateScreen } from './src/screens/AgeGateScreen'; import { AgeGateScreen } from './src/screens/AgeGateScreen';
@ -89,7 +90,13 @@ export default function App() {
const accepted = await isAgeGateAccepted(); const accepted = await isAgeGateAccepted();
setAgeAccepted(accepted); setAgeAccepted(accepted);
const creds = await loadCredentials(); const creds = await loadCredentials();
if (creds) setClient(new GoonClient(creds.baseUrl, creds.apiKey)); if (creds) {
setClient(new GoonClient(creds.baseUrl, creds.apiKey));
} else {
// No stored credentials → auto-connect to the public instance.
// LoginScreen only appears after an explicit "Sign out".
setClient(new GoonClient(DEFAULT_BACKEND_URL, DEFAULT_API_KEY));
}
const lockSettings = await getLockSettings(); const lockSettings = await getLockSettings();
if (lockSettings.enabled && lockSettings.hasPin) { if (lockSettings.enabled && lockSettings.hasPin) {
setLocked(true); setLocked(true);

View file

@ -93,8 +93,8 @@ android {
applicationId 'com.goon.mobile' applicationId 'com.goon.mobile'
minSdkVersion rootProject.ext.minSdkVersion minSdkVersion rootProject.ext.minSdkVersion
targetSdkVersion rootProject.ext.targetSdkVersion targetSdkVersion rootProject.ext.targetSdkVersion
versionCode 6 versionCode 9
versionName "0.1.6" versionName "0.1.9"
} }
signingConfigs { signingConfigs {
debug { debug {

View file

@ -16,15 +16,15 @@
</queries> </queries>
<application android:name=".MainApplication" android:label="@string/app_name" android:icon="@mipmap/ic_launcher" android:roundIcon="@mipmap/ic_launcher_round" android:allowBackup="true" android:theme="@style/AppTheme" android:supportsRtl="true" android:usesCleartextTraffic="false" android:networkSecurityConfig="@xml/network_security_config"> <application android:name=".MainApplication" android:label="@string/app_name" android:icon="@mipmap/ic_launcher" android:roundIcon="@mipmap/ic_launcher_round" android:allowBackup="true" android:theme="@style/AppTheme" android:supportsRtl="true" android:usesCleartextTraffic="false" android:networkSecurityConfig="@xml/network_security_config">
<!-- <!--
Expo Updates is disabled by default in the public source tree. To enable Expo Updates — ENABLED 2026-05-22 dla public release. Manifest serwowany
OTA updates for your fork, flip ENABLED to "true" and point EXPO_UPDATE_URL przez backend `/expo-updates/manifest` (api.goon-foss.org). Nowe JS-only
at your backend's `/expo-updates/manifest` endpoint. See README "Quick start" fixy idą OTA bez rebuilda APK; native change wymaga bumpa runtimeVersion
for the server-side setup. + nowego APK przez PackageInstaller.
--> -->
<meta-data android:name="expo.modules.updates.ENABLED" android:value="false"/> <meta-data android:name="expo.modules.updates.ENABLED" android:value="true"/>
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_CHECK_ON_LAUNCH" android:value="ALWAYS"/> <meta-data android:name="expo.modules.updates.EXPO_UPDATES_CHECK_ON_LAUNCH" android:value="ALWAYS"/>
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_LAUNCH_WAIT_MS" android:value="0"/> <meta-data android:name="expo.modules.updates.EXPO_UPDATES_LAUNCH_WAIT_MS" android:value="0"/>
<meta-data android:name="expo.modules.updates.EXPO_UPDATE_URL" android:value="https://invalid.example.invalid/expo-updates/manifest"/> <meta-data android:name="expo.modules.updates.EXPO_UPDATE_URL" android:value="https://api.goon-foss.org/expo-updates/manifest"/>
<meta-data android:name="expo.modules.updates.EXPO_RUNTIME_VERSION" android:value="1.0"/> <meta-data android:name="expo.modules.updates.EXPO_RUNTIME_VERSION" android:value="1.0"/>
<activity android:name=".MainActivity" android:configChanges="keyboard|keyboardHidden|orientation|screenSize|screenLayout|uiMode" android:launchMode="singleTask" android:windowSoftInputMode="adjustResize" android:theme="@style/Theme.App.SplashScreen" android:exported="true" android:screenOrientation="portrait"> <activity android:name=".MainActivity" android:configChanges="keyboard|keyboardHidden|orientation|screenSize|screenLayout|uiMode" android:launchMode="singleTask" android:windowSoftInputMode="adjustResize" android:theme="@style/Theme.App.SplashScreen" android:exported="true" android:screenOrientation="portrait">
<intent-filter> <intent-filter>

View file

@ -2,14 +2,14 @@
"expo": { "expo": {
"name": "goon", "name": "goon",
"slug": "goon", "slug": "goon",
"version": "0.1.8", "version": "0.1.9",
"orientation": "portrait", "orientation": "portrait",
"userInterfaceStyle": "automatic", "userInterfaceStyle": "automatic",
"newArchEnabled": false, "newArchEnabled": false,
"runtimeVersion": "1.0", "runtimeVersion": "1.0",
"updates": { "updates": {
"enabled": false, "enabled": true,
"url": "https://invalid.example.invalid/expo-updates/manifest", "url": "https://api.goon-foss.org/expo-updates/manifest",
"checkAutomatically": "ON_LOAD", "checkAutomatically": "ON_LOAD",
"fallbackToCacheTimeout": 0 "fallbackToCacheTimeout": 0
}, },

View file

@ -17,4 +17,4 @@ import Constants from 'expo-constants';
* też nie idzie do góry, więc consistency jest zachowana. * też nie idzie do góry, więc consistency jest zachowana.
*/ */
export const APP_VERSION: string = export const APP_VERSION: string =
(Constants.expoConfig?.version as string | undefined) || '0.1.8'; (Constants.expoConfig?.version as string | undefined) || '0.1.9';

12
mobile/src/lib/backend.ts Normal file
View file

@ -0,0 +1,12 @@
// Default public instance. A fresh install with no stored credentials
// auto-connects here, so the app works out-of-the-box without a login step.
//
// Power users who want their own self-hosted backend can still override:
// after "Sign out" the login screen lets them enter a different URL + key.
//
// The API key below is intentionally shipped in the APK. It is a coarse
// bot/scraper filter, not a secret — anyone can decompile the APK to read it.
// If it gets abused, rotate it: append a new key to API_KEYS on the server,
// ship an APK update, then drop the old key.

/** Base URL of the default public backend, used when no credentials are stored. */
export const DEFAULT_BACKEND_URL = 'https://api.goon-foss.org';
/** API key paired with DEFAULT_BACKEND_URL; public by design (see the note above). */
export const DEFAULT_API_KEY = 'W20ggQgYjH_evCZCSBTWJsGgLMaJQP_7';

View file

@ -754,6 +754,40 @@ const INJECTED_JS = `
}; };
} catch (e) {} } catch (e) {}
// -- 1.5. Cookie/consent auto-dismiss --------------------------------------
// Tubes such as hqporner show a cookie-consent gate ("Allow All / Allow
// Essential Only") that blocks the kt_player JS: the player does not
// initialise until the user clicks. The \`<source>.src\` scrape in this injected
// script would therefore run too early (no video in the DOM yet), so we
// auto-click the consent button to unblock the player.
//
// Safety: we click ONLY an element whose text matches a consent phrase AND
// which sits in a container carrying a cookie/consent/gdpr marker (the element
// itself or up to 6 ancestors). This rules out an accidental click on an ad
// such as "Continue to site".
//
// The pattern is anchored on both ends, so word stems (akceptuj / zgadzam /
// wyrazam zgod) must end in ".*" to accept inflected forms such as
// "Wyrażam zgodę"; without it the stem alone could never match a real button.
const CONSENT_TEXT_RE = /^(allow all|accept all|accept|accept & continue|accept and continue|i accept|i agree|agree|agree & continue|got it|enable all|consent|continue|ok|akceptuj.*|zgadzam.*|zgoda|rozumiem|wyra(z|ż)am zgod.*)$/i;
const CONSENT_CTX_RE = /(cookie|consent|gdpr|privacy|cmp|onetrust|didomi|cookiebar|cookie-?notice)/i;
const dismissConsent = function() {
  const els = document.querySelectorAll('button, a, [role="button"], div[onclick], span[onclick], input[type="button"], input[type="submit"]');
  for (let i = 0; i < els.length; i++) {
    const el = els[i];
    // Buttons expose their label via textContent, <input> elements via value.
    const txt = ((el.textContent || el.value || '') + '').trim();
    // Long labels are not consent buttons; skip them before running the regex.
    if (!txt || txt.length > 32) continue;
    if (!CONSENT_TEXT_RE.test(txt)) continue;
    // Context: the element or one of its first 6 ancestors has a
    // cookie/consent marker in its class or id.
    let ctx = el, depth = 0, inCtx = false;
    while (ctx && depth < 7) {
      const cn = ctx.className;
      // SVG elements expose className as an object with baseVal, not a string.
      const sig = ((typeof cn === 'string' ? cn : (cn && cn.baseVal) || '') + ' ' + (ctx.id || '')).toLowerCase();
      if (CONSENT_CTX_RE.test(sig)) { inCtx = true; break; }
      ctx = ctx.parentElement; depth++;
    }
    if (!inCtx) continue;
    // Best effort: a failing click or a missing bridge must not break the page.
    try {
      el.click();
      window.ReactNativeWebView.postMessage(JSON.stringify({type: 'consent_dismissed'}));
    } catch (e) {}
  }
};
// Niektóre hostery wstrzykują full-screen <iframe> jako ad — usuwamy periodically. // Niektóre hostery wstrzykują full-screen <iframe> jako ad — usuwamy periodically.
// Plus iframe-ad już istniejące przed naszym patchowaniem (race condition). // Plus iframe-ad już istniejące przed naszym patchowaniem (race condition).
const removeAdIframes = function() { const removeAdIframes = function() {
@ -778,7 +812,13 @@ const INJECTED_JS = `
} }
}); });
}; };
setInterval(removeAdIframes, 1000); setInterval(function() {
removeAdIframes();
dismissConsent();
}, 1000);
// Pierwsza próba consent natychmiast (banner bywa w SSR HTML) — bez czekania
// na pierwszy tick interwału.
dismissConsent();
// -- 2. Auto-extract m3u8/mp4 ----------------------------------------------- // -- 2. Auto-extract m3u8/mp4 -----------------------------------------------
const VIDEO_RE = /https?:\\/\\/[^"'\\s<>]+\\.(?:m3u8|mp4|mpd)(?:\\?[^"'\\s<>]*)?/i; const VIDEO_RE = /https?:\\/\\/[^"'\\s<>]+\\.(?:m3u8|mp4|mpd)(?:\\?[^"'\\s<>]*)?/i;
@ -821,9 +861,12 @@ const INJECTED_JS = `
} catch (e) {} } catch (e) {}
} }
}); });
// Jeśli mamy video URL i video się odpaliło, możemy zatrzymać polling // Jeśli mamy video URL i video się odpaliło, możemy zatrzymać polling.
if (seen.size > 0 && ticks > 5) clearInterval(interval); // Próg podniesiony 5→15: po auto-dismiss cookie consent kt_player (hqporner)
if (ticks > 60) clearInterval(interval); // potrzebuje kilku sekund na init — zbyt wczesny stop łapał tylko preroll-ad
// URL zanim pojawił się prawdziwy <source>. 15 ticków = ~15s retry window.
if (seen.size > 0 && ticks > 15) clearInterval(interval);
if (ticks > 90) clearInterval(interval);
}, 1000); }, 1000);
true; true;

View file

@ -0,0 +1,50 @@
"""Per-origin extractor check: dla 1 sample sceny z każdego tube origin,
wywołaj try_extract i sklasyfikuj wynik (direct mp4/m3u8 vs WebView hoster vs fail).
Uruchamiać na VPS: docker compose exec -T api python scripts/check_all_hosters.py
"""
from app.db import SessionLocal
from sqlalchemy import text
from app.extractors import try_extract
def main():
    """Print a one-line extractor verdict for one sample page of every tube origin.

    Picks, per `tube:*` origin, one live playback source (no `dead_at`, with a
    `page_url`), preferring the most recently created scene, then runs
    `try_extract` on it and classifies the outcome by the type of the first
    returned source:

    - ERROR: the extractor raised,
    - FAIL: nothing was returned,
    - WEBVIEW: the first source is of type "hoster",
    - DIRECT: the first source is mp4/m3u8/hls/mpd,
    - OTHER(<type>): anything else.
    """
    with SessionLocal() as s:
        rows = s.execute(text("""
            SELECT DISTINCT ON (ps.origin)
                   ps.origin, ps.page_url, sc.title
            FROM playback_sources ps
            JOIN scenes sc ON sc.id = ps.scene_id
            WHERE ps.dead_at IS NULL AND ps.origin LIKE 'tube:%'
              AND ps.page_url IS NOT NULL
            ORDER BY ps.origin, sc.created_at DESC
        """)).all()

    # .all() has already fetched every row, so the (slow, network-bound)
    # extraction below does not need the DB session any more.
    print(f"{'origin':<26} {'result':<48} verdict")
    print("-" * 95)
    for r in rows:
        sitetag = r.origin.replace("tube:", "")
        try:
            sources = try_extract(sitetag, r.page_url)
        except Exception as e:
            # Diagnostic script: report the failure and move on to the next origin.
            print(f"{r.origin:<26} EXC: {str(e)[:42]:<48} ERROR")
            continue
        if not sources:
            print(f"{r.origin:<26} {'None (no sources)':<48} FAIL")
            continue
        # Classify by the type of the first source only.
        first = sources[0]
        t = getattr(first, "type", "?")
        link = (getattr(first, "link", "") or "")[:40]
        if t == "hoster":
            verdict = "WEBVIEW (page → ad risk)"
        elif t in ("mp4", "m3u8", "hls", "mpd"):
            verdict = "DIRECT (native ExoPlayer)"
        else:
            verdict = f"OTHER({t})"
        n = len(sources)
        summary = f"{t} x{n} {link}"
        print(f"{r.origin:<26} {summary:<48} {verdict}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,29 @@
"""Quick sanity check series-mismatch detector na realnych pendingach z bazy."""
from app.resolve.scoring import (
detect_modifier_tags,
detect_series_positions,
series_mismatch_strength,
)
# (title_a, title_b, label) triples; the label is only used in the printed report.
cases = [
    ("pleasureville a dp xxx parody episode 2", "pleasureville a dp xxx parody episode 4", "Episode 2/4"),
    ("make em sweat #7", "make em sweat #7 bts", "BTS asymmetric"),
    ("training ravyn", "training ravyn (bts - 1)", "BTS asymmetric"),
    ("women seeking women volume 140 scene 3", "women seeking women volume 140 scene 4", "Vol same scene diff"),
    ("women seeking women #131 scene 2", "women seeking women volume 139 scene 1", "Multi num"),
    ("bad bella stinky feet preparation 1080p", "bad bella stinky feet preparation (unedited) 1080p", "Unedited"),
    ("alexis fawx step son becomes a man part 1", "alexis fawx step son becomes a man part 2", "Part 1/2"),
    ("neon moonlight pt. 1", "neon moonlight pt. 2", "Pt 1/2"),
    ("internet outage poundage", "internet outage poundage alexis fawx", "Same scene"),
    ("the great heist", "the great heist", "Identical"),
    ("training ravyn", "training ravyn", "Identical"),
    ("slut hunt ep.6 ravyn", "slut hunt ep.6 ravyn full", "Same Episode 6"),
]
# Header row; the strength column is right-aligned to 8 characters.
print(f'{"strength":>8} case')
for a, b, desc in cases:
    # Combined mismatch strength for the pair, followed by the raw detector
    # output for each title so a surprising value can be traced back.
    s = series_mismatch_strength(a, b)
    pa = detect_series_positions(a)
    pb = detect_series_positions(b)
    ma = detect_modifier_tags(a)
    mb = detect_modifier_tags(b)
    # Empty detector results are printed as the literal "{}".
    print(f'{s:>8.2f} {desc:25s} pos={pa or "{}"} vs {pb or "{}"} mod={ma or "{}"} vs {mb or "{}"}')

View file

@ -0,0 +1,92 @@
"""Debug reverse-proxy: http://0.0.0.0:8099 → https://api.goon-foss.org
Emulator app (via http://10.0.2.2:8099, cleartext dozwolony w NSC dla 10.0.2.2)
uderza ten proxy forward do prawdziwego backendu. Loguje każdy request:
method, path, headers (X-API-Key, X-App-Signature), response status.
Cel: zdiagnozować czy app fetch w ogóle działa + jakie headers wysyła.
"""
import http.server
import socketserver
import ssl
import urllib.request
import urllib.error
UPSTREAM = "https://api.goon-foss.org"
PORT = 8099
class ProxyHandler(http.server.BaseHTTPRequestHandler):
    """Relay each incoming request to ``UPSTREAM`` and log it to stdout.

    For every request the method, path and a fixed set of headers are
    printed, followed by the upstream status and body size. Headers that
    carry credentials are only printed truncated.
    """

    protocol_version = "HTTP/1.1"

    @staticmethod
    def _redact(name, value):
        """Return *value* as it should appear in the log for header *name*."""
        if name == "X-App-Signature":
            return f"{value[:20]}...{value[-8:]} (len={len(value)})"
        # Authorization is a credential too, so it gets the same treatment
        # as the API key instead of being printed in full.
        if name in ("X-API-Key", "Authorization"):
            return f"{value[:8]}... (len={len(value)})"
        return value

    @staticmethod
    def _error_body(exc):
        """Build the JSON body of the 502 reply sent when the relay fails.

        The message is serialised with ``json.dumps`` so quotes or
        backslashes in the exception text cannot produce invalid JSON.
        """
        import json  # local import: json is not imported at module level

        payload = {"detail": f"proxy error: {exc}"}
        return json.dumps(payload, separators=(",", ":")).encode()

    def _proxy(self, method):
        """Forward the current request upstream using *method* and relay the reply."""
        body_len = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(body_len) if body_len else None
        print(f"\n>>> {method} {self.path}")
        for h in ("X-API-Key", "X-App-Signature", "Authorization", "User-Agent", "Accept", "Content-Type"):
            if h in self.headers:
                print(f" {h}: {self._redact(h, self.headers[h])}")
        url = UPSTREAM + self.path
        req = urllib.request.Request(url, data=body, method=method)
        for k, v in self.headers.items():
            # These are recomputed by urllib for the upstream connection.
            if k.lower() not in ("host", "content-length", "connection", "accept-encoding"):
                req.add_header(k, v)
        ctx = ssl.create_default_context()
        try:
            with urllib.request.urlopen(req, context=ctx, timeout=30) as resp:
                data = resp.read()
                print(f"<<< {resp.status} ({len(data)} bytes)")
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # The body is re-sent as one block, so framing headers
                    # from upstream no longer apply.
                    if k.lower() not in ("transfer-encoding", "connection", "content-encoding", "content-length"):
                        self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except urllib.error.HTTPError as e:
            # Upstream answered with an error status: pass it through.
            data = e.read()
            print(f"<<< HTTP {e.code} ({len(data)} bytes): {data[:200]}")
            self.send_response(e.code)
            self.send_header("Content-Type", e.headers.get("Content-Type", "application/json"))
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception as e:
            # Anything else (DNS, TLS, timeout, ...) is reported as 502.
            print(f"<<< PROXY ERROR: {type(e).__name__}: {e}")
            msg = self._error_body(e)
            self.send_response(502)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(msg)))
            self.end_headers()
            self.wfile.write(msg)

    def do_GET(self):
        self._proxy("GET")

    def do_POST(self):
        self._proxy("POST")

    def do_PUT(self):
        self._proxy("PUT")

    def do_PATCH(self):
        self._proxy("PATCH")

    def do_DELETE(self):
        self._proxy("DELETE")

    def log_message(self, *args):
        pass  # silence default logging
class ThreadingServer(socketserver.ThreadingMixIn, http.server.HTTPServer):
    """HTTPServer combined with ThreadingMixIn so requests are served in threads."""

    # Handler threads are daemons, so they do not keep the process alive on exit.
    daemon_threads = True


if __name__ == "__main__":
    # Binds to every interface on PORT and serves until interrupted.
    print(f"debug proxy: http://0.0.0.0:{PORT} -> {UPSTREAM}")
    print(f"emulator app should point to http://10.0.2.2:{PORT}")
    ThreadingServer(("0.0.0.0", PORT), ProxyHandler).serve_forever()

View file

@ -0,0 +1,59 @@
"""Smoke test PornDoeScraper — fetch sample + sprawdz parsing."""
import logging
logging.basicConfig(level=logging.INFO)
from app.connectors.direct_scrapers.porndoe import PornDoeScraper
def main():
    """Scrape the first listing page and report how complete the parsed scenes are.

    Prints the scraper's sitetag and listing URLs, the details of the first
    five scenes, and for at most fifteen scenes a per-field fill rate
    (studio, performers, date, duration, thumbnail, fingerprint).
    """
    scraper = PornDoeScraper()
    print(f"sitetag: {scraper.sitetag}")
    print(f"listing url p1: {scraper._listing_url(1)}")
    print(f"listing url p2: {scraper._listing_url(2)}")
    print()
    count = 0
    ok_studio = ok_perf = ok_date = ok_dur = ok_thumb = ok_phash = 0
    for scene in scraper.latest_scenes(max_pages=1):
        count += 1
        # A scene may come back without any playback source; resolve the
        # thumbnail once so the detail print below cannot hit an empty list.
        thumb = scene.playback_sources[0].thumbnail_url if scene.playback_sources else None
        if scene.studio:
            ok_studio += 1
        if scene.performers:
            ok_perf += 1
        if scene.release_date:
            ok_date += 1
        if scene.duration_sec:
            ok_dur += 1
        if thumb:
            ok_thumb += 1
        if scene.fingerprints:
            ok_phash += 1
        if count <= 5:
            print(f"--- scene {count} ---")
            print(f" ext_id: {scene.external_id}")
            print(f" title: {scene.title[:60]}")
            print(f" studio: {scene.studio.name if scene.studio else None}")
            print(f" perf: {[p.name for p in scene.performers]}")
            print(f" date: {scene.release_date}")
            print(f" duration: {scene.duration_sec}s")
            print(f" tags: {[t.name for t in scene.tags][:5]}")
            print(f" thumb: {(thumb or '')[:70]}")
            print(f" phash: {[f.value for f in scene.fingerprints]}")
            print()
        if count >= 15:
            break
    print("=" * 50)
    print(f"total scraped: {count}")
    if count:
        print(f" studio: {ok_studio}/{count} ({100*ok_studio//count}%)")
        print(f" performer: {ok_perf}/{count} ({100*ok_perf//count}%)")
        print(f" date: {ok_date}/{count} ({100*ok_date//count}%)")
        print(f" duration: {ok_dur}/{count} ({100*ok_dur//count}%)")
        print(f" thumbnail: {ok_thumb}/{count} ({100*ok_thumb//count}%)")
        print(f" phash: {ok_phash}/{count} ({100*ok_phash//count}%)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,29 @@
"""Coverage check: ile tube'ów z theporndude.com mamy już w bazie."""
from app.db import SessionLocal
from sqlalchemy import text
def main():
    """Print per-origin playback source counts, grouped by origin prefix.

    The prefix is the text before the first ':' in the origin; origins
    without a ':' are grouped under "other". Inside a group, origins are
    listed by descending live count.
    """
    query = text("""
        SELECT origin, COUNT(*) AS n,
        COUNT(*) FILTER (WHERE dead_at IS NULL) AS live,
        COUNT(*) FILTER (WHERE dead_at IS NOT NULL) AS dead
        FROM playback_sources
        GROUP BY origin
        ORDER BY origin
    """)
    with SessionLocal() as session:
        # Every distinct origin (canonical + tube: + pornapp:).
        rows = session.execute(query).all()
        print(f"distinct origins: {len(rows)}")
        grouped = {}
        for row in rows:
            if ":" in row.origin:
                kind = row.origin.split(":", 1)[0]
            else:
                kind = "other"
            grouped.setdefault(kind, []).append((row.origin, row.n, row.live, row.dead))
        for kind, members in grouped.items():
            print(f"\n=== {kind} ({len(members)} origins) ===")
            ranked = sorted(members, key=lambda member: member[2], reverse=True)
            for origin, n, live, dead in ranked:
                print(f" {origin:<35} n={n:>7,} live={live:>7,} dead={dead:>5,}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,118 @@
"""Cross-check 166 resolved theporndude domains vs nasze 25 tube origins."""
import json
from pathlib import Path
# Origins z DB (live + dead) + extractor REGISTRY w app/extractors/__init__.py
OUR_ORIGINS = [
# DB live + dead
"tube:0dayxxcom", "tube:epornercom", "tube:fpoxxx", "tube:freshpornoorg",
"tube:hqpornercom", "tube:latestpornvideocom", "tube:mypornerleakcom",
"tube:perverzijacom", "tube:porn00org", "tube:porndishcom", "tube:porndittcom",
"tube:pornhatcom", "tube:pornhubcom", "tube:porntrexcom", "tube:pornxpph",
"tube:redtubecom", "tube:sxylandcom", "tube:sxyprncom", "tube:xhamstercom",
"tube:xnxxcom", "tube:xvideoscom", "tube:youporncom", "tube:latestleaksco",
"tube:siskavideo", "tube:hdporn92com",
# REGISTRY only (extractor known, brak playback w live DB)
"tube:xmoviesforyoucom", "tube:watchporn", "tube:porn4dayspw",
"tube:paradisehillcc",
]
# Real TLDs only. "tube"/"porn"/"xxx" are deliberately NOT listed because they are often part of the site name (redtube, pornhub, fpoxxx).
_TLD_RE = __import__("re").compile(r"(com|net|org|tv|cc|pw|co|to|ws|me|sx|info|biz)$")
def _strip_tld(s: str) -> str:
    """Remove a trailing TLD token listed in ``_TLD_RE`` from *s*.

    Examples: xvideoscom -> xvideos; pornhubcom -> pornhub; hdporn92com -> hdporn92
    """
    return _TLD_RE.sub("", s)
# Build sitetag → matching variants for fuzzy match
def origin_to_sitetag(origin: str) -> str:
    """Return *origin* with every ``tube:`` marker removed."""
    return "".join(origin.split("tube:"))
def domain_to_sitetag(domain: str) -> str:
    """Lower-case *domain* and drop its dots and hyphens.

    Examples: xvideos.com -> xvideoscom, porntrex.com -> porntrexcom
    """
    drop_punctuation = str.maketrans("", "", ".-")
    return domain.lower().translate(drop_punctuation)
def match(slug: str, domain: str) -> str | None:
    """Return the first entry of ``OUR_ORIGINS`` matching *slug* or *domain*.

    *slug* is the tube name taken from the theporndude review URL (e.g.
    'xvideos', 'pornhub', 'paradisehill'); *domain* is the resolved real
    domain. Both are normalised to sitetag form and compared with the TLD
    stripped on both sides. Stripped names shorter than three characters
    never match. Returns None when nothing matches or both inputs are empty.
    """
    raw = []
    if slug:
        raw.append(slug.lower().replace("-", ""))
    if domain:
        raw.append(domain_to_sitetag(domain))
    # Normalise the candidates once instead of once per origin.
    wanted = []
    for candidate in raw:
        stripped = _strip_tld(candidate)
        if len(stripped) >= 3:
            wanted.append(stripped)
    if not wanted:
        return None
    for origin in OUR_ORIGINS:
        if _strip_tld(origin_to_sitetag(origin)) in wanted:
            return origin
    return None
def main():
    """Split resolved theporndude records into have / new / error and report.

    Reads ``theporndude_resolved.json``, tags every usable record with
    ``our_origin`` (result of ``match``), prints the top matches and top new
    candidates by rank, and writes the full split to
    ``theporndude_coverage.json``.
    """
    records = json.loads(Path("theporndude_resolved.json").read_text())
    have, new, error = [], [], []
    for rec in records:
        # A record that failed and never got a domain cannot be matched.
        if "error" in rec and not rec.get("real_domain"):
            error.append(rec)
            continue
        origin = match(rec.get("slug", ""), rec.get("real_domain", ""))
        rec["our_origin"] = origin
        (have if origin else new).append(rec)

    def by_rank(rec):
        return rec["rank"]

    print("=== Coverage ===")
    print(f"Total theporndude top-porn-tubes: {len(records)}")
    print(f" Already in our DB: {len(have)}")
    print(f" NEW (potential candidates): {len(new)}")
    print(f" Errors: {len(error)}")
    print()
    print("=== Already have (matched) — top 30 by theporndude rank ===")
    have_ranked = sorted(have, key=by_rank)
    for rec in have_ranked[:30]:
        print(
            f" #{rec['rank']:>3} score={rec.get('theporndude_score') or '?':>4} "
            f"{rec['real_domain']:<28} -> {rec['our_origin']}"
        )
    print()
    print("=== NEW candidates (not in DB) — top 60 by theporndude rank ===")
    new_ranked = sorted(new, key=by_rank)
    for rec in new_ranked[:60]:
        print(
            f" #{rec['rank']:>3} score={rec.get('theporndude_score') or '?':>4} "
            f"{rec.get('real_domain') or '?':<30} ({rec['slug']})"
        )
    # Detailed output for the downstream scripts.
    already_have = []
    for rec in have_ranked:
        already_have.append({
            "rank": rec["rank"],
            "slug": rec["slug"],
            "domain": rec["real_domain"],
            "score": rec.get("theporndude_score"),
            "our_origin": rec["our_origin"],
        })
    new_candidates = []
    for rec in new_ranked:
        new_candidates.append({
            "rank": rec["rank"],
            "slug": rec["slug"],
            "domain": rec.get("real_domain"),
            "score": rec.get("theporndude_score"),
            "final_url": rec.get("final_url", ""),
        })
    errors = []
    for rec in error:
        errors.append({"rank": rec["rank"], "slug": rec["slug"], "error": rec.get("error")})
    summary = {
        "total": len(records),
        "already_have": already_have,
        "new_candidates": new_candidates,
        "errors": errors,
    }
    Path("theporndude_coverage.json").write_text(json.dumps(summary, indent=2))
    print("\n-> theporndude_coverage.json")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,176 @@
"""Batch curl triage 144 nowych theporndude tubes:
- HEAD root domain (200/4xx/5xx/timeout?)
- GET / check landing markers: video listing, sceny, login wall, redirect
- GET /latest, /videos, /tube/recent check które listing path działa
- Wynik: per-slug status + landing markers + scene_url_pattern guess
"""
import asyncio
import json
import re
from pathlib import Path
from urllib.parse import urlparse
import httpx
COVERAGE_FILE = Path("theporndude_coverage.json")
OUT_FILE = Path("theporndude_triage.json")
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"
# Common candidate landing paths
LANDING_PATHS = ["/", "/latest", "/latest-videos", "/recent", "/new", "/videos", "/category/new", "/top-rated"]
# Markers w HTML
SCENE_LINK_PATTERNS = [
r'<a[^>]+href="(/(?:video|videos|watch|v|scene|scenes|stream|movie|movies|view|play|porn|tube)/[^"]+)"',
r'<a[^>]+href="((?:https?:)?//[^/"]+/(?:video|videos|watch|v|scene|scenes|stream|movie|movies|view|play|porn|tube)/[^"]+)"',
]
META_MARKERS = [
(r'jsonld|json-ld|"@type"\s*:\s*"VideoObject"', "jsonld_video"),
(r'<meta\s+property="og:type"\s+content="video', "og_video"),
(r'<meta\s+name="description"\s+content="([^"]+)"', "meta_desc"),
(r'class="[^"]*\b(?:video|scene|episode)-?(?:item|card|tile|thumb)\b', "video_card"),
(r'class="[^"]*\b(?:performer|actress|model|pornstar)\b', "performer_marker"),
(r'class="[^"]*\b(?:studio|production|brand|channel)\b', "studio_marker"),
(r'class="[^"]*\b(?:duration|runtime|length)\b|<time\s+datetime=', "duration_marker"),
(r'\b(?:HLS|hls|m3u8|application/x-mpegURL)\b', "hls_marker"),
(r'(?:hlsmanifest|videoUrl|video_url|stream_url|streamUrl)\s*[:=]\s*["\']', "stream_url_marker"),
(r'login\s*required|create\s+account|sign\s+(?:in|up)|members\s+only|join\s+now\s+to\s+watch', "auth_wall"),
(r'<title>[^<]*\b(?:404|not\s+found|gone|domain)\b[^<]*</title>', "dead_404"),
(r'<meta[^>]+http-equiv="refresh"[^>]+url=', "meta_refresh"),
]
async def fetch_one(cli: httpx.AsyncClient, url: str) -> tuple[int, str]:
    """GET *url* (following redirects) and return ``(status, text)``.

    The text is cut to 200 000 characters. Failures do not raise; they are
    reported as a negative pseudo-status: -1 connect error, -2 timeout,
    -9 any other exception (text is then the message, at most 120 chars).
    """
    request_headers = {"User-Agent": UA}
    try:
        response = await cli.get(url, headers=request_headers, follow_redirects=True)
        status = response.status_code
        body = response.text
        return status, body[:200_000]  # cap response
    except httpx.ConnectError:
        return -1, "conn_refused"
    except httpx.TimeoutException:
        return -2, "timeout"
    except Exception as exc:
        message = str(exc)
        return -9, message[:120]
def analyze_html(html: str) -> dict:
    """Collect landing-page markers and scene-link hints from *html*.

    Returns a dict with one ``True`` entry per ``META_MARKERS`` pattern that
    matches. When scene links are found (at most five are collected across
    ``SCENE_LINK_PATTERNS``) it also contains ``scene_link_samples`` (first
    three links, each cut to 120 chars) and ``scene_path_prefixes`` (sorted
    first path segments, e.g. ``/video``).
    """
    found = {}
    for pattern, name in META_MARKERS:
        if re.search(pattern, html, re.IGNORECASE):
            found[name] = True
    # Scene link patterns
    scene_links = []
    for p in SCENE_LINK_PATTERNS:
        for m in re.finditer(p, html, re.IGNORECASE):
            scene_links.append(m.group(1)[:120])
            if len(scene_links) >= 5:
                break
        if len(scene_links) >= 5:
            break
    if scene_links:
        found["scene_link_samples"] = scene_links[:3]
        # Unique first path segments. Links may be relative ("/video/x"),
        # protocol-relative ("//host/video/x") or absolute
        # ("https://host/video/x"); urlparse isolates the path in all three
        # cases so the scheme or host is never mistaken for a segment.
        prefixes = set()
        for link in scene_links:
            try:
                path = urlparse(link).path
            except ValueError:
                # Malformed authority (e.g. stray '['): skip this link.
                continue
            first_segment = path.lstrip("/").split("/", 1)[0]
            if first_segment:
                prefixes.add("/" + first_segment)
        found["scene_path_prefixes"] = sorted(prefixes)
    return found
async def audit_one(cli: httpx.AsyncClient, slug: str, domain: str) -> dict:
    """Audit a single tube: fetch its root page and score the HTML markers.

    Returns a dict that always has ``slug`` and ``domain``. It carries
    ``error`` when *domain* is unusable, ``root_status``/``root_error`` when
    the root page did not answer 200/301/302, and otherwise
    ``root_findings``, ``heuristic_score`` and ``reasons``.
    """
    out = {"slug": slug, "domain": domain}
    # Only try https://<domain>/ for something that looks like a hostname.
    looks_valid = bool(domain) and re.match(r"^[\w\.-]+\.\w+$", domain) is not None
    if not looks_valid:
        out["error"] = "no_valid_domain"
        return out
    status, html = await fetch_one(cli, f"https://{domain}/")
    out["root_status"] = status
    if status not in (200, 301, 302):
        out["root_error"] = html[:80] if isinstance(html, str) else None
        return out
    findings = analyze_html(html)
    out["root_findings"] = findings

    # Heuristic score: markers that earn a point and are listed as a reason.
    score = 0
    reasons = []
    for marker in ("jsonld_video", "og_video", "video_card", "performer_marker", "studio_marker"):
        if findings.get(marker):
            score += 1
            reasons.append(marker)
    # Half-point signals that are not listed among the reasons.
    if findings.get("duration_marker"):
        score += 0.5
    if findings.get("hls_marker") or findings.get("stream_url_marker"):
        score += 0.5
    if findings.get("scene_path_prefixes"):
        score += 1
        reasons.append(f"scene_paths={findings['scene_path_prefixes']}")
    # Penalties.
    for marker, penalty in (("auth_wall", 2), ("dead_404", 5), ("meta_refresh", 1)):
        if findings.get(marker):
            score -= penalty
            reasons.append(marker)
    out["heuristic_score"] = round(score, 1)
    out["reasons"] = reasons
    return out
async def main():
    """Audit every new candidate from the coverage file and print a histogram.

    Candidates are audited concurrently (at most 12 at a time), the merged
    records are written to ``OUT_FILE`` and the heuristic scores are
    summarised in four buckets.
    """
    cov = json.loads(COVERAGE_FILE.read_text())
    new_candidates = cov["new_candidates"]
    print(f"audytuję {len(new_candidates)} nowych kandydatów…")
    timeout = httpx.Timeout(15.0, connect=8.0)
    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
    async with httpx.AsyncClient(timeout=timeout, limits=limits, http2=False) as client:
        gate = asyncio.Semaphore(12)

        async def audit(candidate):
            async with gate:
                domain = candidate.get("domain") or ""
                # No domain, or pdude.link landed on the porndudecams.com
                # interstitial: fall back to <slug>.com.
                if not domain or "porndudecams" in domain:
                    domain = f"{candidate['slug'].lower()}.com"
                return {**candidate, **(await audit_one(client, candidate["slug"], domain))}

        results = await asyncio.gather(*[audit(c) for c in new_candidates])
    OUT_FILE.write_text(json.dumps(results, indent=2))

    def bucket_of(score):
        if score >= 5:
            return "5+"
        if score >= 3:
            return "3-5"
        if score >= 1:
            return "1-3"
        return "<1"

    # Stats
    counts = {}
    for record in results:
        label = bucket_of(record.get("heuristic_score", 0))
        counts[label] = counts.get(label, 0) + 1
    print("\n=== Heurystyczny rozkład (canonical-fit) ===")
    for label in ("5+", "3-5", "1-3", "<1"):
        if label in counts:
            print(f" {label:<5} {counts[label]} tubów")
    print(f"\n-> {OUT_FILE}")


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,234 @@
"""Pełny pipeline dla theporndude /full-porn-movies-sites (94 tubes):
1. Resolve real domains (pdude.link follow, ale follow only 1 hop)
2. Coverage match vs nasze 25+ origins
3. Curl triage HTML markers
4. Per-tube scorecard
"""
import asyncio
import json
import re
from pathlib import Path
from urllib.parse import urlparse
import httpx
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"
OUR_ORIGINS = [
"tube:0dayxxcom", "tube:epornercom", "tube:fpoxxx", "tube:freshpornoorg",
"tube:hqpornercom", "tube:latestpornvideocom", "tube:mypornerleakcom",
"tube:perverzijacom", "tube:porn00org", "tube:porndishcom", "tube:porndittcom",
"tube:pornhatcom", "tube:pornhubcom", "tube:porntrexcom", "tube:pornxpph",
"tube:redtubecom", "tube:sxylandcom", "tube:sxyprncom", "tube:xhamstercom",
"tube:xnxxcom", "tube:xvideoscom", "tube:youporncom", "tube:latestleaksco",
"tube:siskavideo", "tube:hdporn92com",
"tube:xmoviesforyoucom", "tube:watchporn", "tube:porn4dayspw",
"tube:paradisehillcc",
]
_TLD_RE = re.compile(r"(com|net|org|tv|cc|pw|co|to|ws|me|sx|info|biz)$")
def _strip_tld(s: str) -> str:
return _TLD_RE.sub("", s)
def match(slug: str, domain: str) -> str | None:
    """Return the first ``OUR_ORIGINS`` entry matching *slug* or *domain*.

    Both inputs are lower-cased, stripped of '-' (and '.' for the domain)
    and compared with the TLD removed on both sides; stripped names shorter
    than three characters never match. Returns None when nothing matches.
    """
    raw = []
    if slug:
        raw.append(slug.lower().replace("-", ""))
    if domain:
        raw.append(domain.lower().replace(".", "").replace("-", ""))
    # Normalise the candidates once instead of once per origin.
    wanted = [name for name in map(_strip_tld, raw) if len(name) >= 3]
    for origin in OUR_ORIGINS:
        sitetag = origin.replace("tube:", "")
        if _strip_tld(sitetag) in wanted:
            return origin
    return None
# Anchor whose href contains a scene-like path segment (/video/, /watch/, ...).
# Accepted href shapes: "/video/x", "/any/prefix/video/x", "//host/video/x" and
# "http(s)://host/video/x". The optional host part and the optional leading
# path segments are separate groups so that a plain relative "/video/x" link
# matches as well.
SCENE_PATH_RE = re.compile(
    r'<a[^>]+href="((?:(?:https?:)?//[^/"]+)?(?:/[^/"]+)*?'
    r'/(?:video|videos|watch|v|scene|movie|movies|play|view|stream)/[^"]+)"',
    re.IGNORECASE,
)
META_MARKERS = [
(r'"@type"\s*:\s*"VideoObject"', "jsonld_video"),
(r'<meta\s+property="og:type"\s+content="video', "og_video"),
(r'class="[^"]*\b(?:video|scene|movie|episode)-?(?:item|card|tile|thumb|block)\b', "video_card"),
(r'class="[^"]*\b(?:performer|actress|model|pornstar|cast)\b|href="[^"]*/pornstar', "performer_marker"),
(r'class="[^"]*\b(?:studio|production|brand|channel|network)\b|href="[^"]*/studio', "studio_marker"),
(r'class="[^"]*\b(?:duration|runtime|length)\b|itemprop="duration"', "duration_marker"),
(r'\b(?:HLS|m3u8|application/x-mpegURL)\b', "hls_marker"),
(r'(?:videoUrl|video_url|stream_url|streamUrl)\s*[:=]\s*["\']', "stream_url_marker"),
(r'(?:login\s+required|create\s+account|members\s+only|join\s+now)', "auth_wall"),
(r'<title>[^<]*\b(?:404|not\s+found|domain\s+for\s+sale|gone)\b', "dead_404"),
]
async def fetch_one(cli: httpx.AsyncClient, url: str, *, max_redirects: int = 5) -> tuple[int, str, str]:
    """GET *url*, following redirects by hand, and report where it ended up.

    Returns ``(status, text, domain)``. ``text`` is cut to 200 000
    characters. ``domain`` is the first redirect host that belongs to
    neither pdude.link nor theporndude.com, or the host of the last URL
    requested when no such hop occurred; "www." is removed. At most
    *max_redirects* hops are followed.

    Failures do not raise; they are reported as a negative pseudo-status
    with an empty domain: -1 connect error, -2 timeout, -9 anything else.
    """
    # Local import: only urlparse is imported at module level.
    from urllib.parse import urljoin

    try:
        cur = await cli.get(url, headers={"User-Agent": UA}, follow_redirects=False)
        hops = 0
        first_external_domain = None
        cur_url = url
        while cur.status_code in (301, 302, 303, 307, 308) and hops < max_redirects:
            loc = cur.headers.get("location")
            if not loc:
                break
            # Location may be absolute, protocol-relative ("//host/x"),
            # root-relative ("/x") or path-relative ("x"); urljoin resolves
            # every form against the URL that issued the redirect.
            loc = urljoin(cur_url, loc)
            cur_url = loc
            hops += 1
            # Track first external (non-pdude, non-theporndude) host.
            host = urlparse(loc).hostname or ""
            if first_external_domain is None and not host.endswith("pdude.link") and not host.endswith("theporndude.com"):
                first_external_domain = host.replace("www.", "")
            cur = await cli.get(loc, headers={"User-Agent": UA}, follow_redirects=False)
        domain = first_external_domain or (urlparse(cur_url).hostname or "").replace("www.", "")
        return cur.status_code, cur.text[:200_000], domain
    except httpx.ConnectError:
        return -1, "conn_refused", ""
    except httpx.TimeoutException:
        return -2, "timeout", ""
    except Exception as e:
        return -9, str(e)[:120], ""
async def resolve_domain(cli: httpx.AsyncClient, slug: str) -> str:
    """Return the host that ``pdude.link/<slug>`` redirects to, or "".

    Only the first hop is inspected. An empty string is returned when there
    is no Location header, when the target is one of the known affiliate /
    interstitial hosts, or when the request fails for any reason.
    """
    affiliate_markers = ("anexo.link", "awejmp.com", "porndudecams")
    try:
        response = await cli.get(
            f"https://pdude.link/{slug}",
            headers={"User-Agent": UA},
            follow_redirects=False,
        )
        target = response.headers.get("location", "")
        if not target:
            return ""
        host = (urlparse(target).hostname or "").replace("www.", "")
        # Affiliate redirect: report nothing so the caller can fall back.
        if any(marker in host for marker in affiliate_markers):
            return ""
        return host
    except Exception:
        # Best effort: any failure counts as "not resolved".
        return ""
def analyze_html(html: str) -> dict:
    """Collect landing-page markers and scene-link hints from *html*.

    Returns a dict with one ``True`` entry per ``META_MARKERS`` pattern that
    matches. From the first five ``SCENE_PATH_RE`` matches it adds
    ``scene_path_prefixes`` (sorted first path segments, e.g. ``/video``)
    and ``scene_link_samples`` (first three links, each cut to 100 chars).
    """
    found = {}
    for pattern, name in META_MARKERS:
        if re.search(pattern, html, re.IGNORECASE):
            found[name] = True
    prefixes = set()
    sample = []
    for m in SCENE_PATH_RE.finditer(html):
        link = m.group(1)
        sample.append(link[:100])
        # Extract the first path segment. urlparse isolates the path for
        # relative, protocol-relative and absolute links alike, so
        # "https://host/video/x" contributes "/video" like "/video/x" does.
        try:
            path = urlparse(link).path
        except ValueError:
            path = ""  # malformed authority: keep the sample, skip the prefix
        first_segment = path.lstrip("/").split("/", 1)[0]
        if first_segment:
            prefixes.add("/" + first_segment)
        if len(sample) >= 5:
            break
    if prefixes:
        found["scene_path_prefixes"] = sorted(prefixes)
    if sample:
        found["scene_link_samples"] = sample[:3]
    return found
def score_findings(f: dict) -> tuple[float, list]:
    """Turn the marker dict *f* into ``(score, reasons)``.

    Each truthy marker adds its weight and, except for the stream markers,
    its name to the reasons. ``auth_wall`` and ``dead_404`` subtract. The
    score is rounded to one decimal.
    """
    weighted_markers = (
        ("jsonld_video", 1.5),
        ("og_video", 0.5),
        ("video_card", 1),
        ("performer_marker", 1),
        ("studio_marker", 1),
        ("duration_marker", 0.5),
    )
    penalties = (("auth_wall", 2), ("dead_404", 5))

    score = 0.0
    reasons = []
    for marker, weight in weighted_markers:
        if f.get(marker):
            score += weight
            reasons.append(marker)
    # Either stream hint is worth half a point; neither is listed as a reason.
    if f.get("hls_marker") or f.get("stream_url_marker"):
        score += 0.5
    paths = f.get("scene_path_prefixes")
    if paths:
        score += 1
        reasons.append(f"paths={paths}")
    for marker, penalty in penalties:
        if f.get(marker):
            score -= penalty
            reasons.append(marker)
    return round(score, 1), reasons
async def main():
    """Resolve, fetch and score every tube listed in ``theporndude_movies.json``.

    Each entry is resolved to a domain (falling back to ``<slug>.com``), its
    root page is fetched and scored, and it is matched against our origins.
    A summary is printed and the per-tube records are written to
    ``theporndude_movies_scorecard.json``.
    """
    movies = json.loads(Path("theporndude_movies.json").read_text())["all"]
    print(f"audyt {len(movies)} tubów z full-porn-movies-sites…")
    timeout = httpx.Timeout(15.0, connect=8.0)
    async with httpx.AsyncClient(timeout=timeout, http2=False) as cli:
        sem = asyncio.Semaphore(12)

        async def worker(r):
            async with sem:
                slug = r["slug"]
                # Resolve the real domain from the first pdude.link hop.
                domain = await resolve_domain(cli, slug)
                if not domain or any(x in domain for x in ["anexo.link", "awejmp.com", "porndudecams"]):
                    domain = f"{slug.lower()}.com"
                # Fetch the root page and apply the scene-path heuristic.
                status, html, _ = await fetch_one(cli, f"https://{domain}/")
                findings = analyze_html(html) if status == 200 else {}
                score, reasons = score_findings(findings)
                our = match(slug, domain)
                return {
                    **r,
                    "domain": domain,
                    "root_status": status,
                    "findings": findings,
                    "score": score,
                    "reasons": reasons,
                    "our_origin": our,
                }

        results = await asyncio.gather(*[worker(r) for r in movies])
    # Aggregate
    have = [r for r in results if r["our_origin"]]
    new_promising = [r for r in results if not r["our_origin"] and r["score"] >= 2.5]
    new_low = [r for r in results if not r["our_origin"] and 1 <= r["score"] < 2.5]
    new_zero = [r for r in results if not r["our_origin"] and 0 < r["score"] < 1]
    new_dead = [r for r in results if not r["our_origin"] and (r["root_status"] <= 0 or r["score"] < 0)]
    new_no_signal = [r for r in results if not r["our_origin"] and r["score"] == 0 and r["root_status"] == 200]
    print(f"\n=== Coverage /full-porn-movies-sites ({len(results)} tubes) ===")
    print(f" already have: {len(have):>3}")
    print(f" promising: {len(new_promising):>3}")
    print(f" low value: {len(new_low):>3}")
    # The 0 < score < 1 bucket is reported too, so that sites with only a
    # faint signal do not silently vanish from the summary.
    print(f" weak signal: {len(new_zero):>3}")
    print(f" no signal: {len(new_no_signal):>3}")
    print(f" dead: {len(new_dead):>3}")
    print()
    print("ALREADY HAVE:")
    for r in have:
        print(f" {r['slug']:<20} -> {r['our_origin']}")
    print()
    print("PROMISING (score >= 2.5):")
    for r in sorted(new_promising, key=lambda x: -x["score"]):
        print(f" score={r['score']:>4} {r['domain']:<25} ({r['slug']:<20}) reasons={','.join(r['reasons'])[:60]}")
    print()
    print("LOW VALUE (1-2.5):")
    for r in sorted(new_low, key=lambda x: -x["score"]):
        print(f" score={r['score']:>4} {r['domain']:<25} ({r['slug']:<20}) reasons={','.join(r['reasons'])[:60]}")
    Path("theporndude_movies_scorecard.json").write_text(json.dumps(results, indent=2))


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,87 @@
"""Per 166 review slugs z top-porn-tube-sites:
1. Fetch review page extract pdude.link Visit URL + rating + score badges
2. Follow pdude.link real tube domain
3. Cross-check vs nasze 25 tube origins
4. Output JSON: { slug, name, theporndude_rank, theporndude_score, real_domain, in_our_db, our_origin }
"""
import asyncio
import json
import re
from pathlib import Path
from urllib.parse import urlparse
import httpx
REVIEWS_FILE = Path("theporndude_free_tubes.json")
OUT_FILE = Path("theporndude_resolved.json")
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"
async def fetch_review(cli: httpx.AsyncClient, review: dict, rank: int) -> dict:
    """Fetch one theporndude review page and resolve its "visit" link.

    Returns *review* extended with ``rank``, ``theporndude_score``,
    ``page_title`` and ``page_desc``, plus either ``real_domain`` and
    ``final_url`` (after following the pdude.link URL) or an ``error``
    describing which step failed.
    """
    review_url = f"https://theporndude.com/{review['id']}/{review['slug']}"
    ua_headers = {"User-Agent": UA}
    try:
        page = await cli.get(review_url, headers=ua_headers)
        html = page.text
    except Exception as exc:
        return {**review, "rank": rank, "error": f"fetch_review: {exc}"}

    def first_group(pattern):
        found = re.search(pattern, html)
        return found.group(1) if found else None

    # Score badge, pdude.link visit URL, <title> and meta description.
    score_text = first_group(r'class="rate__num">\s*(\d+(?:\.\d+)?)\s*<')
    pdude_url = first_group(r'href="(https://pdude\.link/[\w\-\.]+)"')
    title_text = first_group(r"<title>([^<]+)</title>")
    desc_text = first_group(r'<meta\s+name="description"\s+content="([^"]+)"')

    out = {
        **review,
        "rank": rank,
        "theporndude_score": None if score_text is None else float(score_text),
        "page_title": (title_text or "")[:120],
        "page_desc": (desc_text or "")[:200],
    }
    if pdude_url is None:
        out["error"] = "no_pdude_link"
        return out
    # Follow pdude.link to wherever it finally lands.
    try:
        landing = await cli.get(pdude_url, headers=ua_headers)
        final_url = str(landing.url)
        hostname = urlparse(final_url).hostname or ""
        out["real_domain"] = hostname.replace("www.", "")
        out["final_url"] = final_url[:200]
    except Exception as exc:
        out["error"] = f"pdude_follow: {exc}"
    return out
async def main():
    """Resolve every review from ``REVIEWS_FILE`` and write ``OUT_FILE``.

    Reviews are processed concurrently (at most 8 at a time); the rank is
    the 1-based position in the input list. Prints how many reviews ended
    up with a ``real_domain``.
    """
    reviews = json.loads(REVIEWS_FILE.read_text())["reviews"]
    timeout = httpx.Timeout(20.0, connect=10.0)
    limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
    async with httpx.AsyncClient(
        timeout=timeout, limits=limits, follow_redirects=True, http2=False
    ) as cli:
        sem = asyncio.Semaphore(8)

        async def worker(rev, rank):
            async with sem:
                return await fetch_review(cli, rev, rank)

        tasks = [worker(r, i + 1) for i, r in enumerate(reviews)]
        results = await asyncio.gather(*tasks)
    OUT_FILE.write_text(json.dumps(results, indent=2))
    ok = sum(1 for r in results if r.get("real_domain"))
    # An empty review list must not crash the summary with a division by zero.
    pct = ok * 100 / len(results) if results else 0
    print(f"resolved {ok}/{len(results)} ({pct:.0f}%)")
    print(f"out -> {OUT_FILE}")


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,104 @@
"""Generuje końcowy scorecard JSON dla wszystkich 166 theporndude top-porn-tube-sites:
- coverage status (already_have/new/dead/low_value)
- canonical_value_score 0-5 (heurystyka + nasz ranking)
- recommendation: skip / consider / pilot / integrate
Plus markdown summary dla człowieka.
"""
import json
from pathlib import Path
COVERAGE = json.loads(Path("theporndude_coverage.json").read_text())
TRIAGE = json.loads(Path("theporndude_triage.json").read_text())
def main():
    """Merge coverage and triage data into one scorecard per tube.

    Already-matched tubes are marked ``already_have``. Every new candidate
    gets a status derived from its triage record (dead, auth_wall,
    promising, low_value or no_value) and a recommendation. The result is
    written to ``theporndude_scorecard.json`` and summarised on stdout.
    """
    triage_by_slug = {r["slug"]: r for r in TRIAGE}
    scorecards = []
    for r in COVERAGE["already_have"]:
        scorecards.append({
            "rank": r["rank"],
            "slug": r["slug"],
            "domain": r["domain"],
            "status": "already_have",
            "our_origin": r["our_origin"],
            "canonical_value_score": None,
            "recommendation": "skip — already integrated",
        })
    for r in COVERAGE["new_candidates"]:
        # A candidate without a triage record falls through to "dead"
        # because its root_status defaults to 0.
        t = triage_by_slug.get(r["slug"], {})
        score = t.get("heuristic_score", 0)
        findings = t.get("root_findings", {})
        reasons = t.get("reasons", [])
        root_status = t.get("root_status", 0)
        domain = t.get("domain") or r.get("domain") or f"{r['slug']}.com"
        if root_status <= 0 or findings.get("dead_404"):
            status = "dead"
            rec = "skip — dead/unreachable"
        elif findings.get("auth_wall") and score < 2:
            status = "auth_wall"
            rec = "skip — login required, no public scenes"
        elif score >= 2.5:
            status = "promising"
            rec = "pilot — deep audit + write extractor"
        elif score >= 1:
            status = "low_value"
            rec = "consider — basic metadata only, low priority"
        else:
            status = "no_value"
            rec = "skip — no canonical-fit signal in HTML"
        scorecards.append({
            "rank": r["rank"],
            "slug": r["slug"],
            "domain": domain,
            "status": status,
            "our_origin": None,
            "canonical_value_score": score,
            "heuristic_reasons": reasons,
            "findings": findings,
            "recommendation": rec,
        })
    scorecards.sort(key=lambda x: x["rank"])
    # Count every status in a single pass; the key order fixes the report order.
    summary = dict.fromkeys(
        ("already_have", "promising", "low_value", "no_value", "auth_wall", "dead"), 0
    )
    for card in scorecards:
        summary[card["status"]] += 1
    out = {
        "source": "theporndude.com/top-porn-tube-sites",
        "fetched_at": "2026-05-20",
        "total": len(scorecards),
        "summary": summary,
        "scorecards": scorecards,
    }
    Path("theporndude_scorecard.json").write_text(json.dumps(out, indent=2))
    # Pretty print summary
    total = out["total"]
    print("=" * 70)
    print(f"THEPORNDUDE.COM CANONICAL-FIT SCORECARD ({total} tubes)")
    print("=" * 70)
    for k, v in out["summary"].items():
        # With no scorecards at all, report 0% instead of dividing by zero.
        pct = 100 * v / total if total else 0
        print(f" {k:<15} {v:>4} ({pct:.0f}%)")
    print()
    print("PROMISING (score >= 2.5) — pilot candidates:")
    for s in scorecards:
        if s["status"] == "promising":
            r = ",".join(s.get("heuristic_reasons", []))[:60]
            print(f" #{s['rank']:>3} score={s['canonical_value_score']:>4} {s['domain']:<25} ({s['slug']}) {r}")
    print()
    print("LOW_VALUE (1-2.5) — defer:")
    for s in scorecards:
        if s["status"] == "low_value":
            r = ",".join(s.get("heuristic_reasons", []))[:50]
            print(f" #{s['rank']:>3} score={s['canonical_value_score']:>4} {s['domain']:<25} ({s['slug']}) {r}")


if __name__ == "__main__":
    main()

View file

@ -9,9 +9,12 @@ import pytest
from app.resolve.scoring import ( from app.resolve.scoring import (
composite_score, composite_score,
date_proximity, date_proximity,
detect_modifier_tags,
detect_series_positions,
hamming_distance_hex, hamming_distance_hex,
performer_set_similarity, performer_set_similarity,
phash_similarity, phash_similarity,
series_mismatch_strength,
title_similarity, title_similarity,
triage, triage,
) )
@ -146,6 +149,112 @@ def test_composite_clamps_to_unit() -> None:
assert score == 1.0 assert score == 1.0
# ---- triage --------------------------------------------------------------
# ---- series position / modifier detector ---------------------------------
def test_detect_series_positions_episode() -> None:
    """An 'episode N' suffix yields exactly that position."""
    title = "pleasureville a dp xxx parody episode 4"
    assert detect_series_positions(title) == {4}
def test_detect_series_positions_part_with_dot() -> None:
    """The abbreviated 'pt.' form is recognised as a position."""
    found = detect_series_positions("neon moonlight pt. 2")
    assert found == {2}
def test_detect_series_positions_hash_only() -> None:
    """A '#N' number and a 'scene N' number are both collected."""
    found = detect_series_positions("women seeking women #131 scene 2")
    assert found == {131, 2}
def test_detect_series_positions_volume() -> None:
    """Both the ``volume N`` and the ``scene N`` numbers are collected."""
    found = detect_series_positions("women seeking women volume 140 scene 3")
    assert found == {140, 3}
def test_detect_series_positions_s_e_style() -> None:
    """TV-style ``sN eM`` shorthand contributes both numbers."""
    found = detect_series_positions("can you handle a woman like me s9 e8")
    assert found == {9, 8}
def test_detect_series_positions_empty() -> None:
    """Missing (``None``) and empty titles both produce an empty set."""
    for blank_title in (None, ""):
        assert detect_series_positions(blank_title) == set()
def test_detect_modifier_tags_bts() -> None:
    """A parenthesised ``bts`` marker is reported as the ``bts`` tag."""
    tags = detect_modifier_tags("training ravyn (bts - 1)")
    assert "bts" in tags
def test_detect_modifier_tags_behind_the_scenes() -> None:
    """The spelled-out phrase ``behind the scenes`` is reported as a tag."""
    title = "behind the scenes - two pairs of suckable melons"
    tags = detect_modifier_tags(title)
    assert "behind the scenes" in tags
def test_detect_modifier_tags_unedited() -> None:
    """A parenthesised ``unedited`` marker is reported as a tag."""
    tags = detect_modifier_tags("bad bella stinky feet prep (unedited)")
    assert "unedited" in tags
def test_series_mismatch_episode_2_vs_4_hard() -> None:
    """Episode 2 vs. episode 4 of the same series is a hard mismatch (1.0)."""
    episode_two = "pleasureville a dp xxx parody episode 2"
    episode_four = "pleasureville a dp xxx parody episode 4"
    strength = series_mismatch_strength(episode_two, episode_four)
    assert strength == 1.0
def test_series_mismatch_intersection_is_no_mismatch() -> None:
    """Matching positions do not split, but a one-sided BTS tag still scores 0.7.

    Both titles carry position {7} (Make'em Sweat #7), so there is NO
    position mismatch; the BTS marker on only one side gives 0.7.
    """
    plain = "make'em sweat #7"
    with_bts = "make'em sweat #7 bts"
    strength = series_mismatch_strength(plain, with_bts)
    assert strength == pytest.approx(0.7)
def test_series_mismatch_partial_overlap_is_still_hard() -> None:
    """A shared volume number does not rescue differing scene numbers.

    "Volume 140 Scene 3" vs "Volume 140 Scene 4": 140 is common but 3/4
    differ — these are separate scenes from one compilation → hard split.
    """
    scene_three = "women seeking women volume 140 scene 3"
    scene_four = "women seeking women volume 140 scene 4"
    strength = series_mismatch_strength(scene_three, scene_four)
    assert strength == 1.0
def test_series_mismatch_no_year_false_positive() -> None:
    """A four-digit year must not be mistaken for a series position.

    "scene from 2020" must not yield {2020}. Per the original author's note
    the detector matches at most three digits (``\\d{1,3}`` with a boundary),
    so a 4-digit number is not captured — the pattern itself is not visible
    from this test module.
    """
    detected = detect_series_positions("scene from 2020")
    assert detected == set()
def test_series_mismatch_bts_asymmetric() -> None:
    """A BTS marker on only one side yields the soft 0.7 mismatch.

    Titles: "Training Ravyn" vs "Training Ravyn (BTS - 1)".
    Positions are {} vs {1}: no common position, but one side is empty, so
    this is not a hard split. BTS on one side only → 0.7.
    """
    original_cut = "training ravyn"
    bts_cut = "training ravyn (bts - 1)"
    strength = series_mismatch_strength(original_cut, bts_cut)
    assert strength == pytest.approx(0.7)
def test_series_mismatch_no_signal() -> None:
    """Identical titles without positions or modifiers score 0.0."""
    title = "the great heist"
    strength = series_mismatch_strength(title, "the great heist")
    assert strength == 0.0
def test_composite_series_position_hard_reject() -> None:
    """A series mismatch of 1.0 forces a hard reject despite perfect signals.

    Even with every other signal at its strongest (fp/title/performers/date
    all 1.0), the composite score must be 0.0. This guarantees that
    "Episode 2 vs Episode 4" sharing the same phash (studio reusing cover
    art) do NOT auto-merge.
    """
    outcome = composite_score(
        fp=1.0,
        title=1.0,
        performers=1.0,
        date_score=1.0,
        studio_match=True,
        series_mismatch=1.0,
    )
    final_score, why = outcome
    assert final_score == 0.0
    assert why.get("series_position_mismatch")
def test_composite_series_modifier_cap_07() -> None:
    """A modifier mismatch of 0.7 caps the composite score at 0.3.

    Modifier mismatch (BTS on one side only) → cap = 1 - 0.7 = 0.3, and the
    cap value is also exposed under the ``series_modifier_cap`` reason.
    """
    outcome = composite_score(
        fp=1.0,
        title=1.0,
        performers=1.0,
        date_score=1.0,
        studio_match=True,
        series_mismatch=0.7,
    )
    capped_score, why = outcome
    assert capped_score == pytest.approx(0.3)
    assert why.get("series_modifier_cap") == pytest.approx(0.3)
def test_composite_series_zero_no_effect() -> None:
    """``series_mismatch`` of 0.0 and of ``None`` both leave the score at 1.0."""
    with_zero, _ = composite_score(
        fp=1.0,
        title=1.0,
        performers=1.0,
        date_score=1.0,
        studio_match=True,
        series_mismatch=0.0,
    )
    with_none, _ = composite_score(
        fp=1.0,
        title=1.0,
        performers=1.0,
        date_score=1.0,
        studio_match=True,
        series_mismatch=None,
    )
    # Same two checks as a chained comparison: the scores agree exactly,
    # and the shared value is (approximately) the unpenalised maximum.
    assert with_zero == with_none
    assert with_none == pytest.approx(1.0)
# ---- triage -------------------------------------------------------------- # ---- triage --------------------------------------------------------------
def test_triage_thresholds() -> None: def test_triage_thresholds() -> None: