Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector
Mobile / OTA: - Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org - Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version) - backend.ts: default public backend auto-connect (no manual login) WebView fallback fix (PlayerScreen INJECTED_JS): - Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init) - Context-scoped: only clicks consent buttons inside cookie/gdpr containers - Retry window for <source>.src polling raised 5→15 ticks (post-dismiss init) Resolver: - Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited) → composite_score hard-reject / cap; wired into scene_score + bulk_dedup - aggregator-mode candidate query: LIMIT 500 + title-match ordering Connectors: - porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot landing: APK links → goon-v0.1.9.apk Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ad0284585b
commit
642f1ab8b8
36 changed files with 1825 additions and 100 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
|
@ -71,6 +71,13 @@ mcp-logs.txt
|
||||||
# ADB / development debug artefakty (screenshots, ui dumps)
|
# ADB / development debug artefakty (screenshots, ui dumps)
|
||||||
.tmp_adb/
|
.tmp_adb/
|
||||||
|
|
||||||
|
# Marketing screenshots — kept local, hosted externally for posts/landing.
|
||||||
|
# NOT committed: explicit thumbnails risk GitHub TOS takedown.
|
||||||
|
screenshots/
|
||||||
|
|
||||||
|
# Launch / marketing material — local working notes, not part of the codebase.
|
||||||
|
launch/
|
||||||
|
|
||||||
# Operational deploy scripts — moved to a private companion repo. Public repo
|
# Operational deploy scripts — moved to a private companion repo. Public repo
|
||||||
# should NOT contain SSH commands, systemd units, or smoke-test playbooks
|
# should NOT contain SSH commands, systemd units, or smoke-test playbooks
|
||||||
# referencing concrete hosts.
|
# referencing concrete hosts.
|
||||||
|
|
|
||||||
|
|
@ -184,21 +184,19 @@ def resolve_movie_playback(
|
||||||
pb.id,
|
pb.id,
|
||||||
)
|
)
|
||||||
stream = None
|
stream = None
|
||||||
|
# Mixdrop mxcontent CDN wymaga curl_cffi JA3 → wymusza VPS proxy.
|
||||||
|
# Pre-public: skip mixdrop direct, fallback na embed_url (mobile WebView z
|
||||||
|
# phone IP). Bandwidth + anonimowość VPS > UX. Movie ma zwykle 10+ alt
|
||||||
|
# hosterów (voe/luluvid/doply/etc.), user może wybrać alternative.
|
||||||
|
if stream and "mxcontent.net" in stream.lower():
|
||||||
|
log.info(
|
||||||
|
"movie playback %s: mixdrop mxcontent — skip (VPS-proxy required), WebView fallback",
|
||||||
|
pb.id,
|
||||||
|
)
|
||||||
|
stream = None
|
||||||
if stream:
|
if stream:
|
||||||
type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4"
|
type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4"
|
||||||
# Hostery których CDN wymaga Chrome JA3 (mxcontent dla mixdrop):
|
|
||||||
# proxy MUSI użyć curl_cffi impersonate inaczej 403. `proxy_impersonate=True`
|
|
||||||
# idzie przez `raw` → `_proxify_link` ustawi token `i=1`.
|
|
||||||
cdn_needs_impersonate = "mxcontent.net" in stream.lower()
|
|
||||||
raw_meta: dict = {"origin": pb.origin, "host": target}
|
raw_meta: dict = {"origin": pb.origin, "host": target}
|
||||||
if cdn_needs_impersonate:
|
|
||||||
raw_meta["proxy_impersonate"] = True
|
|
||||||
# Mixdrop: same-session cookies + chrome JA3 wymagane dla mp4.
|
|
||||||
# Backend extract zamknął sesję — proxy musi re-fetchować
|
|
||||||
# embed page w fresh curl_cffi session żeby re-extract mp4
|
|
||||||
# z aktualnymi cookies.
|
|
||||||
raw_meta["refetch_url"] = target
|
|
||||||
raw_meta["refetch_hoster"] = "mixdrop"
|
|
||||||
links.append(
|
links.append(
|
||||||
StreamLink(
|
StreamLink(
|
||||||
stream_url=stream,
|
stream_url=stream,
|
||||||
|
|
|
||||||
|
|
@ -72,13 +72,22 @@ class Settings(BaseSettings):
|
||||||
sched_movie_ingest_hours: int = Field(
|
sched_movie_ingest_hours: int = Field(
|
||||||
default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
|
default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
|
||||||
)
|
)
|
||||||
# Browse-latest scheduler: freshporno/porn00/pornxp newest scenes raz dziennie.
|
# Browse-latest scheduler: freshporno/porn00/pornxp newest scenes.
|
||||||
|
# 6h cadence (zmiana z 24h 2026-05-20): user reportował brak Brazzers Exxtra po
|
||||||
|
# 15-05. Root cause był 2-fold: (1) freshporno publikuje sceny w ciągu dnia, 24h
|
||||||
|
# cadence łapie tylko te do 05:30 UTC; (2) meta_content/release_date bug osobno.
|
||||||
|
# 6h = 4 runs/dzień = każda freshporno scena zaingestowana w ciągu ~6h od publik.
|
||||||
sched_browse_latest_hours: int = Field(
|
sched_browse_latest_hours: int = Field(
|
||||||
default=24, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
|
default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
|
||||||
)
|
)
|
||||||
sched_browse_latest_max_pages: int = Field(
|
sched_browse_latest_max_pages: int = Field(
|
||||||
default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
|
default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
|
||||||
)
|
)
|
||||||
|
# Bulk-dedup performers safety net — auto-merge duplikatów które resolver-time
|
||||||
|
# scoring pominął. 12h cadence: leci 2x dziennie (po porannym browse-latest run).
|
||||||
|
sched_bulk_dedup_hours: int = Field(
|
||||||
|
default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
|
||||||
|
)
|
||||||
|
|
||||||
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
|
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
|
||||||
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).
|
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).
|
||||||
|
|
|
||||||
|
|
@ -137,6 +137,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.**
|
# (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.**
|
||||||
from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402
|
from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
|
from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
|
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
|
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
|
||||||
|
|
||||||
|
|
@ -152,6 +153,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title
|
# 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title
|
||||||
# fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować.
|
# fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować.
|
||||||
Porn00Scraper,
|
Porn00Scraper,
|
||||||
|
# PornDoeScraper — dołączony 2026-05-21 (theporndude audit). Każda scena ma
|
||||||
|
# kompletny JSON-LD VideoObject: title + uploadDate + duration + named studio
|
||||||
|
# (producer/publisher) + named performers (actor[]) + thumbnail. Najbogatsze
|
||||||
|
# strukturalne metadane spośród browse scraperów — composite fuzzy match ma
|
||||||
|
# komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio +
|
||||||
|
# performer + date + duration nadrabiają.
|
||||||
|
PornDoeScraper,
|
||||||
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
# ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory).
|
||||||
# Follow-up: dorobić te tubey i sprawdzić phash distance:
|
# Follow-up: dorobić te tubey i sprawdzić phash distance:
|
||||||
# - fullmovies.xxx (channel/network/pornstars/categories, brak duration)
|
# - fullmovies.xxx (channel/network/pornstars/categories, brak duration)
|
||||||
|
|
|
||||||
|
|
@ -163,11 +163,25 @@ class FreshpornoScraper(BaseBrowseScraper):
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Release date — freshporno emituje `<meta itemprop="uploadDate" content="2026-05-20T...">`.
|
||||||
|
# To data wrzucenia na freshporno, NIE oryginalna release_date studio — ale dla
|
||||||
|
# świeżych scen (uploaded niedługo po publikacji) różnica ≤ 3-7 dni, mieści się w
|
||||||
|
# `date_window_days=7` w resolverze. Bez tego pola scene NULL → match score 0 →
|
||||||
|
# duplicate scene zamiast freshporno PS dodane do TPDB canonical (bug-report
|
||||||
|
# 2026-05-20: brak Brazzers Exxtra po 15-05).
|
||||||
|
release_date_parsed: date | None = None
|
||||||
|
if (m := re.search(r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', detail_html)):
|
||||||
|
try:
|
||||||
|
release_date_parsed = date.fromisoformat(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
return RawScene(
|
return RawScene(
|
||||||
external_id=f"{self.sitetag}:{scene_url}",
|
external_id=f"{self.sitetag}:{scene_url}",
|
||||||
title=title,
|
title=title,
|
||||||
description=description,
|
description=description,
|
||||||
duration_sec=duration_sec,
|
duration_sec=duration_sec,
|
||||||
|
release_date=release_date_parsed,
|
||||||
url=scene_url,
|
url=scene_url,
|
||||||
studio=studio,
|
studio=studio,
|
||||||
performers=performers,
|
performers=performers,
|
||||||
|
|
|
||||||
271
app/connectors/direct_scrapers/porndoe.py
Normal file
271
app/connectors/direct_scrapers/porndoe.py
Normal file
|
|
@ -0,0 +1,271 @@
|
||||||
|
"""porndoe.com — latest-vids browse scraper.
|
||||||
|
|
||||||
|
Dołączony 2026-05-21 (theporndude audit). Jedyny verified high-value candidate
|
||||||
|
z 172 tube'ów na theporndude.com/top-porn-tube-sites + /full-porn-movies-sites.
|
||||||
|
|
||||||
|
Czemu wart: każda scena ma kompletny **JSON-LD VideoObject** schema:
|
||||||
|
- name (title), description, uploadDate (ISO timestamp), duration (ISO 8601)
|
||||||
|
- producer + publisher → named studio z `/channel-profile/<slug>` URL
|
||||||
|
- actor[] → named performers z `/pornstars-profile/<slug>` URL
|
||||||
|
- thumbnailUrl (CDN p.cdnc.porndoe.com)
|
||||||
|
|
||||||
|
To wystarczy do composite fuzzy match w resolverze (studio + performer Jaccard +
|
||||||
|
date proximity + title token-set + duration). Phash hit-rate niski (porndoe robi
|
||||||
|
własne crop-thumbnaile 390x219, nie hot-linkuje studio art) — ale rich metadata
|
||||||
|
nadrabia, jak pornxp/porn00.
|
||||||
|
|
||||||
|
URL patterns:
|
||||||
|
- Listing: `/videos/most-recent?page=N` (page 1 = newest, ~31 scen/page)
|
||||||
|
- Scene: `/watch/<id>` gdzie id = `pd` + 10 alfanum (stable)
|
||||||
|
- Studio: `/channel-profile/<slug>`
|
||||||
|
- Performer: `/pornstars-profile/<slug>`
|
||||||
|
- Tags/categories: `/category/<id>/<slug>`
|
||||||
|
|
||||||
|
Playback: stream URL NIE jest inline w SSR HTML — player JS init dopiero po user
|
||||||
|
"Play" click. Dajemy playback_source z page_url + origin `tube:porndoecom`;
|
||||||
|
extractor w `_REGISTRY` mapuje na `_vps_blocked_fallback.extract` → mobile WebView
|
||||||
|
INJECTED_JS scrapuje `<video>.src` po phone IP (0 VPS bandwidth, zgodne z
|
||||||
|
pre-public bandwidth/anonimowość priorytet).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Site root; relative hrefs captured from listing pages are appended to it.
_BASE = "https://porndoe.com"

# Scene links on listing pages, e.g. `<a href="/watch/pd7a3o0e8v2b">`.
# The id is `pd` followed by alphanumerics.
_SCENE_URL_RE = re.compile(r'href="(/watch/[a-z0-9]+)"', re.IGNORECASE)
_WATCH_ID_RE = re.compile(r"/watch/([a-z0-9]+)", re.IGNORECASE)

# JSON-LD <script> blocks; group 1 is the raw script body.
_JSONLD_RE = re.compile(
    r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)

# Tags/categories taken from the DOM (the JSON-LD genre is sometimes empty).
# porndoe URL shape: `/category/<id>/<slug>`; group 1 = slug, group 2 = link text.
_TAG_LINK_RE = re.compile(
    r'href="/category/\d+/([a-z0-9\-]+)"[^>]*>([^<]+)</a>', re.IGNORECASE
)
|
||||||
|
|
||||||
|
# ISO 8601 duration — porndoe emituje "PT8M0S" (czasem "T8M0S" bez P).
|
||||||
|
_ISO_DUR_RE = re.compile(
|
||||||
|
r"^P?T?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$", re.IGNORECASE
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_duration(value: str | None) -> int | None:
|
||||||
|
"""`PT11M7S` / `T8M0S` → sekundy. None gdy format nieznany."""
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
m = _ISO_DUR_RE.match(value.strip())
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
h = int(m.group(1) or 0)
|
||||||
|
mn = int(m.group(2) or 0)
|
||||||
|
s = int(m.group(3) or 0)
|
||||||
|
total = h * 3600 + mn * 60 + s
|
||||||
|
return total or None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_date(value: str | None) -> date | None:
|
||||||
|
"""`2026-05-20T14:55:13+00:00` → date. None gdy parse fail."""
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
# Fallback: pierwsze 10 znaków YYYY-MM-DD
|
||||||
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", value)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _slug_from_url(url: str | None) -> str | None:
|
||||||
|
"""`https://porndoe.com/channel-profile/fantasy-girl-pass` → `fantasy-girl-pass`."""
|
||||||
|
if not url:
|
||||||
|
return None
|
||||||
|
m = re.search(r"/(?:channel-profile|pornstars-profile)/([a-z0-9\-]+)", url, re.IGNORECASE)
|
||||||
|
return m.group(1) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_jsonld_objects(data: object):
|
||||||
|
"""Spłaszcza JSON-LD: dict / list / @graph → strumień dict-ów."""
|
||||||
|
if isinstance(data, dict):
|
||||||
|
graph = data.get("@graph")
|
||||||
|
if isinstance(graph, list):
|
||||||
|
for item in graph:
|
||||||
|
yield from _iter_jsonld_objects(item)
|
||||||
|
else:
|
||||||
|
yield data
|
||||||
|
elif isinstance(data, list):
|
||||||
|
for item in data:
|
||||||
|
yield from _iter_jsonld_objects(item)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_video_object(html: str) -> dict | None:
    """Return the first JSON-LD VideoObject found in `html`.

    Every `<script type="application/ld+json">` block is decoded in document
    order; empty or undecodable blocks are skipped. An object qualifies when
    its `@type` is the string "VideoObject" or a list containing it (JSON-LD
    allows `@type` to be an array).

    Returns:
        The matching dict, or None when no block contains a VideoObject.
    """
    for m in _JSONLD_RE.finditer(html):
        payload = m.group(1).strip()
        if not payload:
            continue
        try:
            data = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for obj in _iter_jsonld_objects(data):
            declared = obj.get("@type")
            if declared == "VideoObject" or (
                isinstance(declared, list) and "VideoObject" in declared
            ):
                return obj
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class PornDoeScraper(BaseBrowseScraper):
    """Browse scraper for porndoe.com's "most recent" listing.

    Scene metadata is read from the JSON-LD VideoObject embedded in each
    `/watch/<id>` page; tags are scraped from category links in the DOM.
    """

    sitetag = "porndoecom"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for `page`; page <= 1 maps to the bare URL."""
        if page <= 1:
            return f"{_BASE}/videos/most-recent"
        return f"{_BASE}/videos/most-recent?page={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from a listing page, de-duplicated in first-seen order."""
        seen: set[str] = set()
        out: list[str] = []
        for m in _SCENE_URL_RE.finditer(listing_html):
            url = f"{_BASE}{m.group(1)}"
            if url in seen:
                continue
            seen.add(url)
            out.append(url)
        return out

    @staticmethod
    def _jsonld_text(value: object) -> str:
        """Return a stripped string from a JSON-LD value, or "" when there is none.

        JSON-LD properties may hold a single value or an array; for an array
        the first non-blank string wins. Non-string values are ignored so that
        unexpected payload shapes never raise.
        """
        candidates = value if isinstance(value, list) else [value]
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        return ""

    @staticmethod
    def _entity_slug(url: object, name: str) -> str:
        """Return a slug for a studio/performer, or "" when none can be derived.

        Order of preference: slug from the profile URL, ASCII slug from the
        name (unchanged from the previous behaviour, so existing external ids
        stay stable), then a Unicode-aware slug so that non-ASCII names do not
        all collapse to the same empty slug.
        """
        slug = _slug_from_url(url) if isinstance(url, str) else None
        if not slug:
            slug = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
        if not slug:
            slug = re.sub(r"[\W_]+", "-", name.lower()).strip("-")
        return slug

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a scene detail page.

        Args:
            scene_url: Absolute URL of the `/watch/<id>` page.
            detail_html: HTML of that page.

        Returns:
            The parsed scene, or None when the page has no JSON-LD VideoObject
            or the VideoObject has no usable title.
        """
        video = _extract_video_object(detail_html)
        if not video:
            log.info("porndoe: no JSON-LD VideoObject on %s", scene_url)
            return None

        title = self._jsonld_text(video.get("name"))
        if not title:
            return None

        watch_id_m = _WATCH_ID_RE.search(scene_url)
        watch_id = watch_id_m.group(1) if watch_id_m else None

        description = self._jsonld_text(video.get("description")) or None
        duration_sec = _parse_iso_duration(self._jsonld_text(video.get("duration")))
        release_date = _parse_iso_date(
            self._jsonld_text(video.get("uploadDate"))
            or self._jsonld_text(video.get("datePublished"))
        )
        # thumbnailUrl may be a single URL or an array of URLs; use the first.
        thumbnail_url = self._jsonld_text(video.get("thumbnailUrl")) or None

        # Studio: producer / publisher (Organization). Prefer producer.
        studio: RawStudio | None = None
        for key in ("producer", "publisher"):
            org = video.get(key)
            if not isinstance(org, dict):
                continue
            name = self._jsonld_text(org.get("name"))
            slug = self._entity_slug(org.get("url"), name) if name else ""
            if not slug:
                # No usable name/slug here; fall through to the next key.
                continue
            studio = RawStudio(
                external_id=f"{self.sitetag}:channel:{slug}",
                name=name,
                slug=slug,
            )
            break

        # Performers: actor[] (a list of Person objects or a single Person).
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        actors = video.get("actor")
        if isinstance(actors, dict):
            actors = [actors]
        if isinstance(actors, list):
            for actor in actors:
                if not isinstance(actor, dict):
                    continue
                name = self._jsonld_text(actor.get("name"))
                if not name:
                    continue
                slug = self._entity_slug(actor.get("url"), name)
                # An empty slug would give unrelated performers one shared
                # external id, so such entries are skipped.
                if not slug or slug in seen_perf:
                    continue
                seen_perf.add(slug)
                performers.append(
                    RawPerformer(
                        external_id=f"{self.sitetag}:performer:{slug}",
                        name=name,
                    )
                )

        # Tags: from category links in the DOM (`/category/<id>/<slug>`).
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if not name or name.lower() in ("categories", "tags", ""):
                continue
            if slug in seen_tag or len(slug) > 60:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            )

        # Phash from the thumbnail. porndoe serves its own cropped thumbnails,
        # so a low hit rate is expected; without a fingerprint the scene is
        # returned with an empty fingerprint list.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Playback: page_url points at the scene page. The stream is rendered
        # by JS, so the `porndoecom` extractor is expected to use the WebView
        # fallback rather than a direct stream URL.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{watch_id or scene_url}",
            title=title,
            description=description,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
|
|
@ -50,12 +50,13 @@ log = logging.getLogger(__name__)
|
||||||
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
|
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
|
||||||
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
|
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
|
||||||
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# Custom (zoptymalizowane / niestandardowy player)
|
# hqporner — CDN URLs IP-bound do VPS, force_proxy wymusza ruch przez VPS proxy.
|
||||||
# hqporner — CDN URL (bigcdn.cc, video.flyflv.com z `ip=` parametrem) IP-bound do
|
# 2026-05-20 (pre-public): bandwidth + anonimowość VPS > UX. Switch na WebView
|
||||||
# requestera. VPS resolve daje 200 ale mobile direct = 404/403. Switch na WebView
|
# fallback — mobile pobiera embed iframe z phone IP, FluidPlayer JS decoduje
|
||||||
# fallback: mobile pobiera embed iframe (mydaddy.cc/hqwo.cc) z phone IP, FluidPlayer
|
# mp4, ExoPlayer odtwarza direct z phone CDN session. **0 VPS bandwidth + VPS
|
||||||
# JS decoduje mp4 URL z mobile session. Plus INJECTED_JS skanuje `<source>.src`.
|
# IP nie ujawniony** (mobile nie łączy się z VPS proxy URL).
|
||||||
# ~32k scen (drugi po porntrex największy single saving). Verified 2026-05-18.
|
# Trade-off: WebView ma 1 extra step (page → player JS) ale bez popup-ads jak
|
||||||
|
# hqporner.com bo INJECTED_JS w PlayerScreen.tsx blokuje + scrape `<source>.src`.
|
||||||
"hqpornercom": _vps_blocked_fallback.extract,
|
"hqpornercom": _vps_blocked_fallback.extract,
|
||||||
"epornercom": eporner.extract,
|
"epornercom": eporner.extract,
|
||||||
"sxyprncom": sxyprn.extract,
|
"sxyprncom": sxyprn.extract,
|
||||||
|
|
@ -94,13 +95,12 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
|
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
|
||||||
# follow_redirects=True wymagane (fix w stream_proxy.py).
|
# follow_redirects=True wymagane (fix w stream_proxy.py).
|
||||||
"pornhatcom": pornhat.extract,
|
"pornhatcom": pornhat.extract,
|
||||||
# Freshporno KVS — `cv=` HMAC signed token IP-bound. Server-side resolve dało
|
# Freshporno KVS — `cv=` HMAC signed token IP-bound do VPS. 2026-05-20 pre-public:
|
||||||
# 200 z VPS, ale laptop dostał 302+SSL error → token validate'uje requester IP.
|
# bandwidth + VPS anonimowość priorytet. WebView fallback → mobile pobiera embed
|
||||||
# Switch na WebView fallback: mobile pobiera embed page, KVS player decoduje
|
# z phone IP, KVS player JS decoduje video_url, ExoPlayer odtwarza direct z CDN.
|
||||||
# video_url w-page, ExoPlayer dostaje URL z phone session. ~15k scen.
|
|
||||||
"freshpornoorg": _vps_blocked_fallback.extract,
|
"freshpornoorg": _vps_blocked_fallback.extract,
|
||||||
# porn00 / pornxp — force_proxy=True wprost (IP-bound CDN). Switch na WebView
|
# porn00 / pornxp — IP-bound CDN tokens. Pre-public WebView fallback (bandwidth +
|
||||||
# fallback. Niski volume (84 scen), trivial saving ale konsystencja flow.
|
# anonimowość VPS). Niski volume (84 scen), trivial.
|
||||||
"porn00org": _vps_blocked_fallback.extract,
|
"porn00org": _vps_blocked_fallback.extract,
|
||||||
"pornxpph": _vps_blocked_fallback.extract,
|
"pornxpph": _vps_blocked_fallback.extract,
|
||||||
# Direct-scraping tubes (mają też search scraper w connectors/direct_scrapers/)
|
# Direct-scraping tubes (mają też search scraper w connectors/direct_scrapers/)
|
||||||
|
|
@ -114,6 +114,11 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
"perverzijacom": _embed_iframe.extract,
|
"perverzijacom": _embed_iframe.extract,
|
||||||
# Special: WebView-only (Yii2 session-bound player).
|
# Special: WebView-only (Yii2 session-bound player).
|
||||||
"paradisehillcc": paradisehill.extract,
|
"paradisehillcc": paradisehill.extract,
|
||||||
|
# PornDoe — dołączony 2026-05-21 (theporndude audit). Stream URL nie inline w
|
||||||
|
# SSR HTML (player JS init po Play click), więc WebView fallback: mobile pobiera
|
||||||
|
# /watch/<id> z phone IP, player JS dekoduje video.src, INJECTED_JS scrape.
|
||||||
|
# 0 VPS bandwidth — zgodne z pre-public bandwidth/anonimowość priorytet.
|
||||||
|
"porndoecom": _vps_blocked_fallback.extract,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,15 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
|
||||||
if basename in seen_keys:
|
if basename in seen_keys:
|
||||||
continue
|
continue
|
||||||
seen_keys.add(basename)
|
seen_keys.add(basename)
|
||||||
result.append(StreamSource(link=url, type="mp4", quality=quality))
|
# `force_proxy=True` (2026-05-20): freshporno get_file 302 → cdn4.freshporno.org
|
||||||
|
# IP-bound (cv= HMAC token). Mobile direct = 403/SSL fail → fallback proxy
|
||||||
|
# generuje "mrugnięcie" (user bug 743eefbf "najpierw strona potem video").
|
||||||
|
# Force_proxy wymusza mobile użycie proxied URL od razu — bez flickera +
|
||||||
|
# natywny ExoPlayer + quality picker zachowane.
|
||||||
|
result.append(StreamSource(
|
||||||
|
link=url, type="mp4", quality=quality,
|
||||||
|
raw={"force_proxy": True},
|
||||||
|
))
|
||||||
|
|
||||||
if not result:
|
if not result:
|
||||||
log.info("freshporno: no MP4 anchor matches on %s", page_url)
|
log.info("freshporno: no MP4 anchor matches on %s", page_url)
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,15 @@ def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | Non
|
||||||
continue
|
continue
|
||||||
seen_urls.add(url)
|
seen_urls.add(url)
|
||||||
title = (sm.group(2) or "").strip()
|
title = (sm.group(2) or "").strip()
|
||||||
sources.append(StreamSource(link=url, quality=title or None, type="mp4", referer=iframe_referer))
|
# `force_proxy=True` (2026-05-20): CDN-y bigcdn.cc/flyflv IP-bound + flyflv ma
|
||||||
|
# `ip=46.62.219.154` w URL path. Mobile direct = 404/403 → fallback proxy
|
||||||
|
# generuje flicker. Force_proxy wymusza mobile użycie proxied od razu.
|
||||||
|
# Bug-report e8ddd8d4: "kliknięcie otwiera reklamę" gdy _vps_blocked_fallback
|
||||||
|
# (hqporner page ads). Force_proxy + native mp4 = quality picker + natywny.
|
||||||
|
sources.append(StreamSource(
|
||||||
|
link=url, quality=title or None, type="mp4", referer=iframe_referer,
|
||||||
|
raw={"force_proxy": True},
|
||||||
|
))
|
||||||
|
|
||||||
if sources:
|
if sources:
|
||||||
return sources
|
return sources
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ def version() -> dict[str, str | None]:
|
||||||
# mobile sklei z baseUrl.
|
# mobile sklei z baseUrl.
|
||||||
public_url = os.environ.get("BACKEND_PUBLIC_URL", "").rstrip("/")
|
public_url = os.environ.get("BACKEND_PUBLIC_URL", "").rstrip("/")
|
||||||
apk_url = f"{public_url}/static/app-release.apk" if public_url else "/static/app-release.apk"
|
apk_url = f"{public_url}/static/app-release.apk" if public_url else "/static/app-release.apk"
|
||||||
return {"version": "0.1.8", "apk_url": apk_url}
|
return {"version": "0.1.9", "apk_url": apk_url}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/readyz")
|
@app.get("/readyz")
|
||||||
|
|
|
||||||
|
|
@ -123,14 +123,38 @@ def resolve_scene(
|
||||||
result = find_by_phash_within(session, phash=value)
|
result = find_by_phash_within(session, phash=value)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
scene_match, distance = result
|
scene_match, distance = result
|
||||||
score = 1.0 - distance / 64.0
|
raw_phash_score = 1.0 - distance / 64.0
|
||||||
# Duration sanity check: phash może collide gdy compilation zawiera chapter sceny
|
# Duration sanity check: phash może collide gdy compilation zawiera chapter sceny
|
||||||
# (oba mają ten sam frame sample), ale duration będzie wyraźnie inny.
|
# (oba mają ten sam frame sample), ale duration będzie wyraźnie inny.
|
||||||
# Wymagamy proximity ≥0.5 (±30s) dla auto-merge; inaczej → review queue.
|
# Wymagamy proximity ≥0.5 (±30s) dla auto-merge; inaczej → review queue.
|
||||||
from app.resolve.scoring import duration_proximity
|
from app.resolve.scoring import duration_proximity, series_mismatch_strength
|
||||||
dur_prox = duration_proximity(scene_match.duration_sec, norm.duration_sec)
|
dur_prox = duration_proximity(scene_match.duration_sec, norm.duration_sec)
|
||||||
|
|
||||||
|
# Series-position guard (Episode 2 vs Episode 4): phash zwykle pixel-identical
|
||||||
|
# bo studio reusuje cover art między episodami, ale to OSOBNE sceny. Hard split,
|
||||||
|
# bez merge_candidate (nie ma czego mergować — żaden human reviewer też nie
|
||||||
|
# powie "Episode 2 to to samo co Episode 4").
|
||||||
|
sp_strength = series_mismatch_strength(
|
||||||
|
scene_match.title_normalized, norm.title_normalized
|
||||||
|
)
|
||||||
|
if sp_strength >= 1.0:
|
||||||
|
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
|
||||||
|
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
|
||||||
|
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
|
||||||
|
return SceneResolveResult(
|
||||||
|
scene=new_scene,
|
||||||
|
was_created=True,
|
||||||
|
path="fp_phash_series_split",
|
||||||
|
score=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
if dur_prox is not None and dur_prox < 0.5:
|
if dur_prox is not None and dur_prox < 0.5:
|
||||||
# phash match ale duration rozjeżdża się → tworzymy nową scenę + review.
|
# phash match ale duration rozjeżdża się → tworzymy nową scenę + review.
|
||||||
|
# Score reflectuje że to NIE jest auto-merge: dur_prox * phash_score,
|
||||||
|
# plus dalej cap przez series modifier mismatch (BTS/bonus/unedited).
|
||||||
|
penalised_score = raw_phash_score * max(dur_prox, 0.1)
|
||||||
|
if 0.0 < sp_strength < 1.0:
|
||||||
|
penalised_score = min(penalised_score, 1.0 - sp_strength)
|
||||||
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
|
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
|
||||||
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
|
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
|
||||||
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
|
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
|
||||||
|
|
@ -139,11 +163,14 @@ def resolve_scene(
|
||||||
kind=MergeKind.scene,
|
kind=MergeKind.scene,
|
||||||
left_id=scene_match.id,
|
left_id=scene_match.id,
|
||||||
right_id=new_scene.id,
|
right_id=new_scene.id,
|
||||||
score=score,
|
score=penalised_score,
|
||||||
reasons={
|
reasons={
|
||||||
"path": "fp_phash",
|
"path": "fp_phash",
|
||||||
"hamming": distance,
|
"hamming": distance,
|
||||||
|
"phash_score": raw_phash_score,
|
||||||
"duration_mismatch": True,
|
"duration_mismatch": True,
|
||||||
|
"dur_prox": dur_prox,
|
||||||
|
"series_mismatch_strength": sp_strength,
|
||||||
"left_dur": scene_match.duration_sec,
|
"left_dur": scene_match.duration_sec,
|
||||||
"right_dur": norm.duration_sec,
|
"right_dur": norm.duration_sec,
|
||||||
},
|
},
|
||||||
|
|
@ -154,9 +181,42 @@ def resolve_scene(
|
||||||
scene=new_scene,
|
scene=new_scene,
|
||||||
was_created=True,
|
was_created=True,
|
||||||
path="fp_phash_review",
|
path="fp_phash_review",
|
||||||
score=score,
|
score=penalised_score,
|
||||||
candidate_id=scene_match.id,
|
candidate_id=scene_match.id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Modifier tag mismatch (BTS/bonus/unedited po jednej stronie) — nie hard-split,
|
||||||
|
# ale auto-merge zablokowane: tworzymy nową scenę + pending review.
|
||||||
|
if 0.0 < sp_strength < 1.0:
|
||||||
|
penalised_score = min(raw_phash_score, 1.0 - sp_strength)
|
||||||
|
new_scene = _create_canonical(session, norm=norm, studio_id=studio_id)
|
||||||
|
_attach_external_ref(session, scene_id=new_scene.id, source_id=source_id, norm=norm)
|
||||||
|
_sync_attached_entities(session, scene=new_scene, norm=norm, source_id=source_id)
|
||||||
|
session.add(
|
||||||
|
MergeCandidate(
|
||||||
|
kind=MergeKind.scene,
|
||||||
|
left_id=scene_match.id,
|
||||||
|
right_id=new_scene.id,
|
||||||
|
score=penalised_score,
|
||||||
|
reasons={
|
||||||
|
"path": "fp_phash",
|
||||||
|
"hamming": distance,
|
||||||
|
"phash_score": raw_phash_score,
|
||||||
|
"series_modifier_mismatch": True,
|
||||||
|
"series_mismatch_strength": sp_strength,
|
||||||
|
},
|
||||||
|
status=MergeStatus.pending,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return SceneResolveResult(
|
||||||
|
scene=new_scene,
|
||||||
|
was_created=True,
|
||||||
|
path="fp_phash_modifier_review",
|
||||||
|
score=penalised_score,
|
||||||
|
candidate_id=scene_match.id,
|
||||||
|
)
|
||||||
|
|
||||||
|
score = raw_phash_score
|
||||||
_update_scene_fields(scene_match, norm, studio_id=studio_id, source_kind=source_kind, session=session)
|
_update_scene_fields(scene_match, norm, studio_id=studio_id, source_kind=source_kind, session=session)
|
||||||
_attach_external_ref(session, scene_id=scene_match.id, source_id=source_id, norm=norm)
|
_attach_external_ref(session, scene_id=scene_match.id, source_id=source_id, norm=norm)
|
||||||
_sync_attached_entities(session, scene=scene_match, norm=norm, source_id=source_id)
|
_sync_attached_entities(session, scene=scene_match, norm=norm, source_id=source_id)
|
||||||
|
|
@ -215,14 +275,24 @@ def resolve_scene(
|
||||||
# które mają wspólny choć jeden performer z naszą sceną (mocny sygnał — performerzy
|
# które mają wspólny choć jeden performer z naszą sceną (mocny sygnał — performerzy
|
||||||
# to też nasz "blocking key" gdy studio i date są nieinformatywne).
|
# to też nasz "blocking key" gdy studio i date są nieinformatywne).
|
||||||
if aggregator_mode and performer_ids:
|
if aggregator_mode and performer_ids:
|
||||||
from sqlalchemy import distinct
|
# **2026-05-20 fix**: poprzednio LIMIT 50 BEZ ORDER BY → dla popular performera
|
||||||
|
# (Eveline Dellai z 200+ scen w bazie) prawdziwy match mógł być out of top-50,
|
||||||
|
# postgres zwracał arbitrary order → resolver nie widział kandydata → duplicate.
|
||||||
|
# Bug-report: brak Brazzers Exxtra po 15-05. Now: 500 limit + title-match priority
|
||||||
|
# ORDER, plus exact title match jako gwarantowany kandydat (CASE expression).
|
||||||
|
from sqlalchemy import case
|
||||||
|
title_match_expr = case(
|
||||||
|
(Scene.title_normalized == norm.title_normalized, 1),
|
||||||
|
else_=0,
|
||||||
|
)
|
||||||
more = (
|
more = (
|
||||||
session.execute(
|
session.execute(
|
||||||
select(Scene)
|
select(Scene)
|
||||||
.join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
|
.join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
|
||||||
.where(ScenePerformer.performer_id.in_(performer_ids))
|
.where(ScenePerformer.performer_id.in_(performer_ids))
|
||||||
.group_by(Scene.id)
|
.group_by(Scene.id)
|
||||||
.limit(50)
|
.order_by(title_match_expr.desc(), Scene.release_date.desc().nullslast())
|
||||||
|
.limit(500)
|
||||||
)
|
)
|
||||||
.scalars()
|
.scalars()
|
||||||
.all()
|
.all()
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ from app.resolve.scoring import (
|
||||||
duration_proximity,
|
duration_proximity,
|
||||||
performer_set_similarity,
|
performer_set_similarity,
|
||||||
phash_similarity,
|
phash_similarity,
|
||||||
|
series_mismatch_strength,
|
||||||
title_similarity,
|
title_similarity,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -49,6 +50,10 @@ def score_candidate(
|
||||||
else:
|
else:
|
||||||
studio_match = candidate.studio_id == studio_id
|
studio_match = candidate.studio_id == studio_id
|
||||||
|
|
||||||
|
series_mismatch = series_mismatch_strength(
|
||||||
|
candidate.title_normalized, norm.title_normalized
|
||||||
|
)
|
||||||
|
|
||||||
composite, reasons = composite_score(
|
composite, reasons = composite_score(
|
||||||
fp=fp,
|
fp=fp,
|
||||||
title=title,
|
title=title,
|
||||||
|
|
@ -57,6 +62,7 @@ def score_candidate(
|
||||||
duration_score=duration_score,
|
duration_score=duration_score,
|
||||||
studio_match=studio_match,
|
studio_match=studio_match,
|
||||||
aggregator_mode=aggregator_mode,
|
aggregator_mode=aggregator_mode,
|
||||||
|
series_mismatch=series_mismatch,
|
||||||
)
|
)
|
||||||
|
|
||||||
breakdown = ScoreBreakdown(
|
breakdown = ScoreBreakdown(
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ TPDB ma "Brazzers Exxtra" a StashDB "Brazzers" jako studio sceny).
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
@ -105,6 +106,91 @@ def date_proximity(left: date | None, right: date | None, *, window_days: int =
|
||||||
return 1.0 - delta / window_days
|
return 1.0 - delta / window_days
|
||||||
|
|
||||||
|
|
||||||
|
# Matches series-position markers in a title: "Episode 4" / "Ep 4" / "Part 2" /
# "Pt. 3" / "Vol 7" / "Volume 12" / "Scene 5" / "Chapter 9" / "Ch.3" / "#7" /
# "S9:E8" / "S9E8". Titles are expected to be normalized (lower-cased, dots
# usually stripped), but a literal "." after the keyword is still tolerated.
# The `(?<!\d)` + `(?!\d)` guards keep the capture from grabbing a fragment of a
# longer digit run, so a four-digit year such as "scene 2020" yields no position.
_SERIES_NUM_RE = re.compile(
    r"\b(?:episode|ep|part|pt|vol|volume|chapter|ch|scene|series)\b\s*\.?\s*#?\s*(?<!\d)(\d{1,3})(?!\d)"
    r"|(?<!\w)#\s*(?<!\d)(\d{1,3})(?!\d)"
    r"|\bs(?<!\d)(\d{1,2})(?!\d)\s*[:e]\s*e?(?<!\d)(\d{1,3})(?!\d)",
    re.IGNORECASE,
)

# Tags that explicitly mark a scene as a separate variant (BTS / bonus /
# unedited / trailer ...). A tag present on ONE side only means the two titles
# do not describe the same scene.
_MODIFIER_TAGS: tuple[str, ...] = (
    "behind the scenes",
    "behind-the-scenes",
    "bts",
    "bonus",
    "unedited",
    "uncut",
    "extended",
    "directors cut",
    "director's cut",
    "trailer",
    "preview",
    "teaser",
    "compilation",
)

# Spelling variants of the same modifier, mapped to one canonical tag. Used only
# when COMPARING two titles, so "bts" vs "behind the scenes" is not reported as
# a modifier mismatch. Tags missing from this map are their own canonical form.
_MODIFIER_SYNONYMS: dict[str, str] = {
    "behind the scenes": "bts",
    "behind-the-scenes": "bts",
    "director's cut": "directors cut",
}

# One pre-compiled pattern per tag. The tag must not be glued to a preceding or
# following ASCII letter/digit (so "debts" does not count as "bts"); an optional
# plural suffix ("trailers", "bonuses") is still accepted.
_MODIFIER_TAG_PATTERNS = tuple(
    (tag, re.compile(rf"(?<![a-z0-9]){re.escape(tag)}(?:e?s)?(?![a-z0-9])"))
    for tag in _MODIFIER_TAGS
)


def detect_series_positions(title_normalized: str | None) -> set[int]:
    """Return every series position (Episode/Part/Vol/Scene/Chapter/# ...) found in a title.

    The title should already be normalized (lowercase, unaccented), but the
    regex is case-insensitive and lenient: this is a signal detector, not a
    robust parser. For season/episode markers ("S9:E8") both numbers are
    returned.

    Args:
        title_normalized: Normalized title, or ``None``/empty.

    Returns:
        The set of detected position numbers; empty when nothing was found.
    """
    if not title_normalized:
        return set()
    return {
        int(group)
        for match in _SERIES_NUM_RE.finditer(title_normalized)
        # Only the capture groups of the alternative that matched are set.
        for group in match.groups()
        if group
    }


def detect_modifier_tags(title_normalized: str | None) -> set[str]:
    """Return the modifier tags (bts/bonus/unedited/...) detected in a title.

    Tags are matched as whole tokens rather than raw substrings, so an
    unrelated word that merely contains a tag ("debts" -> "bts") is ignored.
    The returned strings are the entries of ``_MODIFIER_TAGS`` as written.

    Args:
        title_normalized: Normalized title, or ``None``/empty.

    Returns:
        The set of matching tags; empty when none is present.
    """
    if not title_normalized:
        return set()
    lowered = title_normalized.lower()
    return {tag for tag, pattern in _MODIFIER_TAG_PATTERNS if pattern.search(lowered)}


def _canonical_modifiers(tags: set[str]) -> set[str]:
    """Collapse spelling variants of the same modifier into one canonical tag."""
    return {_MODIFIER_SYNONYMS.get(tag, tag) for tag in tags}


def series_mismatch_strength(
    title_a_normalized: str | None,
    title_b_normalized: str | None,
) -> float:
    """Detect a "scene variant" divergence between two titles.

    Returns a strength in [0.0, 1.0]:
        0.0 - no mismatch signal (titles are compatible).
        0.5 - modifier tags on both sides, but DIFFERENT ones (BTS vs trailer).
        0.7 - modifier tag on one side only (BTS vs regular).
        1.0 - series position mismatch (Episode 2 vs Episode 4 -> hard reject).

    Spelling variants of one modifier ("bts" / "behind the scenes") are treated
    as the same tag and therefore do not produce a mismatch.
    """
    pos_a = detect_series_positions(title_a_normalized)
    pos_b = detect_series_positions(title_b_normalized)
    # Hard mismatch when both sides carry positions and the sets differ, e.g.
    # "Vol 140 Scene 3" vs "Vol 140 Scene 4" share 140 but differ on 3/4: these
    # are separate scenes from one compilation. A side with NO position at all
    # is not a mismatch, because tube SEO titles often drop the number.
    if pos_a and pos_b and pos_a != pos_b:
        return 1.0

    mod_a = _canonical_modifiers(detect_modifier_tags(title_a_normalized))
    mod_b = _canonical_modifiers(detect_modifier_tags(title_b_normalized))
    if bool(mod_a) != bool(mod_b):
        return 0.7
    if mod_a and mod_b and mod_a.isdisjoint(mod_b):
        return 0.5
    return 0.0
|
||||||
|
|
||||||
|
|
||||||
def duration_proximity(
|
def duration_proximity(
|
||||||
left: int | None, right: int | None, *, window_sec: int = 60
|
left: int | None, right: int | None, *, window_sec: int = 60
|
||||||
) -> float | None:
|
) -> float | None:
|
||||||
|
|
@ -145,6 +231,7 @@ def composite_score(
|
||||||
duration_score: float | None = None,
|
duration_score: float | None = None,
|
||||||
studio_match: bool | None,
|
studio_match: bool | None,
|
||||||
aggregator_mode: bool = False,
|
aggregator_mode: bool = False,
|
||||||
|
series_mismatch: float | None = None,
|
||||||
) -> tuple[float, dict]:
|
) -> tuple[float, dict]:
|
||||||
"""Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons.
|
"""Łączy sub-score'y w jeden composite [0, 1] + zwraca raport reasons.
|
||||||
|
|
||||||
|
|
@ -153,9 +240,17 @@ def composite_score(
|
||||||
- aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów,
|
- aggregator_mode=True (np. tube'y typu HQPorner agregują z różnych studiów,
|
||||||
więc studio z naszej perspektywy nie jest informatywny — pomijamy hard reject
|
więc studio z naszej perspektywy nie jest informatywny — pomijamy hard reject
|
||||||
i zwiększamy wagę performers).
|
i zwiększamy wagę performers).
|
||||||
|
|
||||||
|
`series_mismatch` (≥0.0): wartość z `series_mismatch_strength()` — gdy 1.0 (Episode 2
|
||||||
|
vs Episode 4), wymusza twardy reject niezależnie od pozostałych sygnałów; gdy 0.5-0.7
|
||||||
|
(modifier mismatch: BTS/bonus/unedited po jednej stronie), nakłada cap = `1 - strength`.
|
||||||
"""
|
"""
|
||||||
reasons: dict = {}
|
reasons: dict = {}
|
||||||
|
|
||||||
|
if series_mismatch is not None and series_mismatch >= 1.0:
|
||||||
|
reasons["series_position_mismatch"] = True
|
||||||
|
return 0.0, reasons
|
||||||
|
|
||||||
if studio_match is False:
|
if studio_match is False:
|
||||||
if fp is not None and fp >= 0.95:
|
if fp is not None and fp >= 0.95:
|
||||||
reasons["studio_mismatch_overridden_by_fp"] = True
|
reasons["studio_mismatch_overridden_by_fp"] = True
|
||||||
|
|
@ -257,6 +352,16 @@ def composite_score(
|
||||||
reasons["duration_perf_strong_match_bump"] = True
|
reasons["duration_perf_strong_match_bump"] = True
|
||||||
score = max(score, 0.92)
|
score = max(score, 0.92)
|
||||||
|
|
||||||
|
# Series-modifier cap: jedna ze stron ma "BTS"/"bonus"/"unedited" a druga nie,
|
||||||
|
# albo różne tagi. Twardy mismatch (różne pozycje numeryczne) został już złapany
|
||||||
|
# wcześniej (return 0.0). Tu zostają miękkie sygnały — cap żeby nigdy nie auto-merge.
|
||||||
|
if series_mismatch is not None and 0.0 < series_mismatch < 1.0:
|
||||||
|
cap = max(0.0, 1.0 - series_mismatch)
|
||||||
|
if score > cap:
|
||||||
|
reasons["series_modifier_cap"] = cap
|
||||||
|
reasons["series_mismatch_strength"] = series_mismatch
|
||||||
|
score = cap
|
||||||
|
|
||||||
return _clamp(score), reasons
|
return _clamp(score), reasons
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ from app.resolve.scoring import (
|
||||||
hamming_distance_hex,
|
hamming_distance_hex,
|
||||||
performer_set_similarity,
|
performer_set_similarity,
|
||||||
phash_similarity,
|
phash_similarity,
|
||||||
|
series_mismatch_strength,
|
||||||
title_similarity,
|
title_similarity,
|
||||||
triage,
|
triage,
|
||||||
)
|
)
|
||||||
|
|
@ -121,6 +122,8 @@ def score_scene_pair(session: Session, a: Scene, b: Scene) -> ScoreBreakdown:
|
||||||
else:
|
else:
|
||||||
studio_match = a.studio_id == b.studio_id
|
studio_match = a.studio_id == b.studio_id
|
||||||
|
|
||||||
|
series_mismatch = series_mismatch_strength(a.title_normalized, b.title_normalized)
|
||||||
|
|
||||||
# Bulk dedup nie jest aggregator — porównujemy dwie kanoniczne sceny, studio
|
# Bulk dedup nie jest aggregator — porównujemy dwie kanoniczne sceny, studio
|
||||||
# to prawdziwe studio. Aggregator mode tylko w resolverze przy ingest z tube'a.
|
# to prawdziwe studio. Aggregator mode tylko w resolverze przy ingest z tube'a.
|
||||||
composite, reasons = composite_score(
|
composite, reasons = composite_score(
|
||||||
|
|
@ -131,6 +134,7 @@ def score_scene_pair(session: Session, a: Scene, b: Scene) -> ScoreBreakdown:
|
||||||
duration_score=duration_score,
|
duration_score=duration_score,
|
||||||
studio_match=studio_match,
|
studio_match=studio_match,
|
||||||
aggregator_mode=False,
|
aggregator_mode=False,
|
||||||
|
series_mismatch=series_mismatch,
|
||||||
)
|
)
|
||||||
|
|
||||||
return ScoreBreakdown(
|
return ScoreBreakdown(
|
||||||
|
|
|
||||||
|
|
@ -92,6 +92,29 @@ def _job_movie_ingest() -> None:
|
||||||
log.exception("[scheduler] movie ingest %s failed", name)
|
log.exception("[scheduler] movie ingest %s failed", name)
|
||||||
|
|
||||||
|
|
||||||
|
def _job_bulk_dedup_performers() -> None:
    """Scheduler job: run the bulk-dedup pass using the ``performers`` strategy.

    Acts as a safety net for duplicate scenes that ingest-time resolver scoring
    did not catch (bug report 2026-05-20, "missing Brazzers Exxtra after
    15-05": scenes scraped without a release_date ended up as new/review
    entries instead of being merged into the canonical scene).

    NOTE(review): per the original author's note, the performers strategy
    scores scene pairs per performer, auto-merges at score >= 0.92 and files a
    pending merge candidate for 0.75-0.92; those thresholds live in
    ``run_bulk_dedup`` and are not visible here, so confirm them there.

    Any exception is logged and swallowed so a failing run does not propagate
    out of the job.
    """
    log.info("[scheduler] bulk_dedup performers starting")
    try:
        # Imported here, at call time, rather than at module load.
        from app.scheduler.bulk_dedup import run_bulk_dedup

        summary = run_bulk_dedup(strategy="performers", dry_run=False)
        log.info("[scheduler] bulk_dedup performers done: %s", summary)
    except Exception:
        log.exception("[scheduler] bulk_dedup performers failed")
|
||||||
|
|
||||||
|
|
||||||
def _job_performer_continuous(refresh_after_days: int) -> None:
|
def _job_performer_continuous(refresh_after_days: int) -> None:
|
||||||
"""Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST.
|
"""Continuous worker — 1 performer per tick, ORDER BY last_searched_at NULLS FIRST.
|
||||||
|
|
||||||
|
|
@ -174,6 +197,17 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
|
||||||
cfg["browse_latest_hours"], max_pages,
|
cfg["browse_latest_hours"], max_pages,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cfg.get("bulk_dedup_hours"):
|
||||||
|
sched.add_job(
|
||||||
|
_job_bulk_dedup_performers,
|
||||||
|
IntervalTrigger(hours=cfg["bulk_dedup_hours"]),
|
||||||
|
id="bulk_dedup_performers",
|
||||||
|
replace_existing=True,
|
||||||
|
max_instances=1,
|
||||||
|
coalesce=True,
|
||||||
|
)
|
||||||
|
log.info("scheduler: bulk-dedup performers every %dh", cfg["bulk_dedup_hours"])
|
||||||
|
|
||||||
if cfg.get("movie_ingest_hours"):
|
if cfg.get("movie_ingest_hours"):
|
||||||
sched.add_job(
|
sched.add_job(
|
||||||
_job_movie_ingest,
|
_job_movie_ingest,
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,8 @@ from app.ingest import (
|
||||||
)
|
)
|
||||||
from app.models.ingest_run import IngestRun, IngestStatus
|
from app.models.ingest_run import IngestRun, IngestStatus
|
||||||
from app.models.performer import Performer, PerformerExternalRef
|
from app.models.performer import Performer, PerformerExternalRef
|
||||||
from app.models.scene import ScenePerformer
|
from app.models.playback_source import PlaybackSource
|
||||||
|
from app.models.scene import Scene, ScenePerformer
|
||||||
from app.models.source import Source, SourceKind
|
from app.models.source import Source, SourceKind
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
@ -463,14 +464,19 @@ def _claim_next_for_search(
|
||||||
"""Wybiera 1 performera z queue + UPDATE last_searched_at = now() w jednej
|
"""Wybiera 1 performera z queue + UPDATE last_searched_at = now() w jednej
|
||||||
transakcji (skip locked → safe pod konkurencyjnym workerze).
|
transakcji (skip locked → safe pod konkurencyjnym workerze).
|
||||||
|
|
||||||
Queue:
|
Queue priority (2026-05-20 update — orphan-rescue bias):
|
||||||
1. Performerzy NIGDY niesearchowani (last_searched_at IS NULL)
|
1. **Performerzy z RECENT scenes-without-playback** (last 7d, no live PS) —
|
||||||
2. Performerzy searchowani > `refresh_after` temu
|
najpilniejsi bo user widzi puste studio listings dla najnowszych scen.
|
||||||
3. Filtruj scene_count >= min_scene_count (eliminuje noise/false performerów)
|
Bug-report 2026-05-20: "brak Brazzers Exxtra po 15-05" → wszystkie nowe
|
||||||
4. Order: NULLS FIRST, potem najstarsze last_searched_at
|
TPDB sceny mają canonical metadata ale 0 playback bo continuous queue
|
||||||
|
nigdy ich nie dotyka (78k performers, 67k NULL → ~232 dni sweep).
|
||||||
|
2. Performerzy NIGDY niesearchowani (`last_searched_at IS NULL`)
|
||||||
|
3. Performerzy searchowani > `refresh_after` temu
|
||||||
|
4. Filtruj scene_count >= min_scene_count
|
||||||
"""
|
"""
|
||||||
cutoff = datetime.now(UTC) - refresh_after
|
cutoff = datetime.now(UTC) - refresh_after
|
||||||
# Subquery scene_count
|
orphan_cutoff = datetime.now(UTC) - timedelta(days=7)
|
||||||
|
|
||||||
sc_sub = (
|
sc_sub = (
|
||||||
select(
|
select(
|
||||||
ScenePerformer.performer_id.label("pid"),
|
ScenePerformer.performer_id.label("pid"),
|
||||||
|
|
@ -480,19 +486,41 @@ def _claim_next_for_search(
|
||||||
.subquery()
|
.subquery()
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: nie używamy FOR UPDATE bo PostgreSQL nie pozwala na to z GROUP BY
|
# Orphan-scene count per performer: scenes z release_date w ostatnich 7d
|
||||||
# subquery (scene_count agg). APScheduler max_instances=1 gwarantuje że tylko
|
# AND brak żywego playback source. Wysoki count = performer-z-rekordów-pustych.
|
||||||
# jeden tick runa się na raz, więc race nie jest realny.
|
orphan_sub = (
|
||||||
|
select(
|
||||||
|
ScenePerformer.performer_id.label("pid"),
|
||||||
|
func.count(ScenePerformer.scene_id).label("orphan_count"),
|
||||||
|
)
|
||||||
|
.join(Scene, Scene.id == ScenePerformer.scene_id)
|
||||||
|
.where(Scene.release_date > orphan_cutoff)
|
||||||
|
.where(
|
||||||
|
~select(PlaybackSource.id)
|
||||||
|
.where(PlaybackSource.scene_id == Scene.id)
|
||||||
|
.where(PlaybackSource.dead_at.is_(None))
|
||||||
|
.exists()
|
||||||
|
)
|
||||||
|
.group_by(ScenePerformer.performer_id)
|
||||||
|
.subquery()
|
||||||
|
)
|
||||||
|
|
||||||
row = session.execute(
|
row = session.execute(
|
||||||
select(Performer)
|
select(Performer)
|
||||||
.join(sc_sub, sc_sub.c.pid == Performer.id, isouter=False)
|
.join(sc_sub, sc_sub.c.pid == Performer.id, isouter=False)
|
||||||
|
.join(orphan_sub, orphan_sub.c.pid == Performer.id, isouter=True)
|
||||||
.where(sc_sub.c.scene_count >= min_scene_count)
|
.where(sc_sub.c.scene_count >= min_scene_count)
|
||||||
.where(
|
.where(
|
||||||
(Performer.last_searched_at.is_(None))
|
(Performer.last_searched_at.is_(None))
|
||||||
| (Performer.last_searched_at < cutoff)
|
| (Performer.last_searched_at < cutoff)
|
||||||
)
|
)
|
||||||
.order_by(
|
.order_by(
|
||||||
|
# 1. Orphan scenes (last 7d, no playback) FIRST — desc count.
|
||||||
|
# COALESCE 0 sprawia że performerzy bez orphan idą za tymi z.
|
||||||
|
func.coalesce(orphan_sub.c.orphan_count, 0).desc(),
|
||||||
|
# 2. NULL last_searched_at next
|
||||||
Performer.last_searched_at.asc().nullsfirst(),
|
Performer.last_searched_at.asc().nullsfirst(),
|
||||||
|
# 3. Highest scene_count (popular performers earlier)
|
||||||
sc_sub.c.scene_count.desc(),
|
sc_sub.c.scene_count.desc(),
|
||||||
)
|
)
|
||||||
.limit(1)
|
.limit(1)
|
||||||
|
|
|
||||||
|
|
@ -161,6 +161,9 @@ def run_forever() -> int:
|
||||||
# to bulk import jednorazowy). Bug-report 93d3c485 (2026-05-19) "brak freshporno".
|
# to bulk import jednorazowy). Bug-report 93d3c485 (2026-05-19) "brak freshporno".
|
||||||
"browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 24) or None,
|
"browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 24) or None,
|
||||||
"browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5),
|
"browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5),
|
||||||
|
# Bulk-dedup performers — safety net dla duplikatów które resolver
|
||||||
|
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
|
||||||
|
"bulk_dedup_hours": getattr(settings, "sched_bulk_dedup_hours", 12) or None,
|
||||||
}
|
}
|
||||||
sched = build_scheduler(cfg)
|
sched = build_scheduler(cfg)
|
||||||
log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))
|
log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))
|
||||||
|
|
|
||||||
|
|
@ -83,18 +83,14 @@
|
||||||
<p class="text-lg md:text-xl text-gray-400 max-w-2xl leading-relaxed mb-10">
|
<p class="text-lg md:text-xl text-gray-400 max-w-2xl leading-relaxed mb-10">
|
||||||
Goon indexes scene metadata from TPDB & StashDB, deduplicates across
|
Goon indexes scene metadata from TPDB & StashDB, deduplicates across
|
||||||
30+ public tubes, and serves a fast mobile client. Zero ads. Zero tracking.
|
30+ public tubes, and serves a fast mobile client. Zero ads. Zero tracking.
|
||||||
Your data stays on your VPS.
|
Download, open, browse — no account, no setup.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<div class="flex flex-wrap gap-3">
|
<div class="flex flex-wrap gap-3">
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon/releases/latest"
|
<a href="/goon-v0.1.9.apk"
|
||||||
class="px-6 py-4 rounded-xl bg-accent text-white font-bold hover:bg-accentDeep transition glow">
|
class="px-6 py-4 rounded-xl bg-accent text-white font-bold hover:bg-accentDeep transition glow">
|
||||||
Download APK
|
Download APK
|
||||||
</a>
|
</a>
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon"
|
|
||||||
class="px-6 py-4 rounded-xl bg-bgElevated border border-border text-gray-200 font-semibold hover:border-accent transition">
|
|
||||||
View source on GitHub
|
|
||||||
</a>
|
|
||||||
<a href="#donate"
|
<a href="#donate"
|
||||||
class="px-6 py-4 rounded-xl bg-transparent border border-border text-gray-400 font-semibold hover:text-accent hover:border-accent transition">
|
class="px-6 py-4 rounded-xl bg-transparent border border-border text-gray-400 font-semibold hover:text-accent hover:border-accent transition">
|
||||||
♥ Support project
|
♥ Support project
|
||||||
|
|
@ -102,7 +98,7 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<p class="text-xs text-gray-500 mt-6 font-mono">
|
<p class="text-xs text-gray-500 mt-6 font-mono">
|
||||||
Android only · self-hosted backend required · 18+
|
Android 7+ · no setup, no login · 18+
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
@ -165,11 +161,10 @@
|
||||||
|
|
||||||
<div class="card-hover bg-card border border-border rounded-2xl p-6">
|
<div class="card-hover bg-card border border-border rounded-2xl p-6">
|
||||||
<div class="text-accent text-2xl mb-3">⌬</div>
|
<div class="text-accent text-2xl mb-3">⌬</div>
|
||||||
<h3 class="text-lg font-bold mb-2">100% self-hosted</h3>
|
<h3 class="text-lg font-bold mb-2">Works out of the box</h3>
|
||||||
<p class="text-sm text-gray-400 leading-relaxed">
|
<p class="text-sm text-gray-400 leading-relaxed">
|
||||||
One <code class="text-accent font-mono text-xs">docker compose up -d</code>
|
Download the APK and it connects automatically — no account, no
|
||||||
and you own the API, the DB, the worker. No SaaS dependencies.
|
config. Power users can point it at their own self-hosted backend.
|
||||||
Your search history is yours.
|
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -200,26 +195,33 @@
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<!-- QUICK START -->
|
<!-- GET STARTED -->
|
||||||
<section class="px-6 py-20 bg-bgElevated/30 border-y border-border">
|
<section class="px-6 py-20 bg-bgElevated/30 border-y border-border">
|
||||||
<div class="max-w-3xl mx-auto">
|
<div class="max-w-3xl mx-auto">
|
||||||
<h2 class="text-3xl font-extrabold mb-3 tracking-tight">Quick start</h2>
|
<h2 class="text-3xl font-extrabold mb-3 tracking-tight">Get started</h2>
|
||||||
<p class="text-gray-500 mb-8">5 commands. Backend runs in 30 seconds on any Docker host.</p>
|
<p class="text-gray-500 mb-8">Three steps. No account, no server, no config.</p>
|
||||||
|
|
||||||
<div class="bg-card border border-border rounded-2xl p-6 font-mono text-sm leading-relaxed">
|
<ol class="space-y-4">
|
||||||
<div class="text-gray-500">$ <span class="text-gray-300">git clone https://github.com/REPLACE_PERSONA/goon.git</span></div>
|
<li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
|
||||||
<div class="text-gray-500">$ <span class="text-gray-300">cd goon && cp .env.example .env</span></div>
|
<span class="text-accent font-extrabold text-xl">1</span>
|
||||||
<div class="text-gray-500">$ <span class="text-gray-300"># edit .env: set TPDB_API_TOKEN, STASHDB_API_KEY, API_KEYS</span></div>
|
<span class="text-sm text-gray-300 leading-relaxed">
|
||||||
<div class="text-gray-500">$ <span class="text-gray-300">docker compose up -d</span></div>
|
<a href="/goon-v0.1.9.apk" class="text-accent font-bold hover:underline">Download the APK</a>
|
||||||
<div class="text-gray-500">$ <span class="text-gray-300">curl localhost:8000/health</span></div>
|
and open it. Allow "install from unknown sources" for your browser if Android asks.
|
||||||
<div class="text-good text-xs mt-3">{"status":"ok"}</div>
|
</span>
|
||||||
</div>
|
</li>
|
||||||
|
<li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
|
||||||
<p class="text-sm text-gray-500 mt-6">
|
<span class="text-accent font-extrabold text-xl">2</span>
|
||||||
Then download the APK above, point it at your backend, paste an API key.
|
<span class="text-sm text-gray-300 leading-relaxed">
|
||||||
Full docs in the
|
Open the app, accept the 18+ gate. It connects automatically — no login.
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon#readme" class="text-accent hover:underline">README</a>.
|
</span>
|
||||||
</p>
|
</li>
|
||||||
|
<li class="bg-card border border-border rounded-2xl p-5 flex gap-4">
|
||||||
|
<span class="text-accent font-extrabold text-xl">3</span>
|
||||||
|
<span class="text-sm text-gray-300 leading-relaxed">
|
||||||
|
Browse. That's it.
|
||||||
|
</span>
|
||||||
|
</li>
|
||||||
|
</ol>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
|
@ -259,10 +261,7 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<p class="text-xs text-gray-500 mt-6">
|
<p class="text-xs text-gray-500 mt-6">
|
||||||
Addresses are hard-coded in
|
Addresses + QR codes are shown in the app under Scenes » ♥.
|
||||||
<code class="font-mono text-accent">mobile/src/lib/donate.ts</code>
|
|
||||||
and shown in the app under Scenes » ♥. Always verify on-screen
|
|
||||||
against the repo before sending.
|
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
@ -275,19 +274,18 @@
|
||||||
<div class="w-2 h-2 rounded-full bg-accent"></div>
|
<div class="w-2 h-2 rounded-full bg-accent"></div>
|
||||||
<span class="font-bold tracking-widest uppercase">goon</span>
|
<span class="font-bold tracking-widest uppercase">goon</span>
|
||||||
</div>
|
</div>
|
||||||
<p>Self-hosted adult content metadata aggregator.</p>
|
<p>Adult content metadata aggregator. FOSS, ad-free.</p>
|
||||||
<p>MIT license. No warranty. 18+ jurisdictions only.</p>
|
<p>MIT license. No warranty. 18+ jurisdictions only.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex flex-col gap-1 text-right">
|
<div class="flex flex-col gap-1 text-right">
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon" class="hover:text-accent transition">GitHub</a>
|
<a href="/goon-v0.1.9.apk" class="hover:text-accent transition">Download APK</a>
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon/releases" class="hover:text-accent transition">Releases</a>
|
<a href="#donate" class="hover:text-accent transition">Support</a>
|
||||||
<a href="https://github.com/REPLACE_PERSONA/goon#readme" class="hover:text-accent transition">Docs</a>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<p class="max-w-5xl mx-auto mt-6 text-[10px] text-gray-600 leading-relaxed">
|
<p class="max-w-5xl mx-auto mt-6 text-[10px] text-gray-600 leading-relaxed">
|
||||||
Goon does not host, transcode, store, or distribute any media. It scrapes
|
Goon does not host, transcode, store, or distribute any media. It scrapes
|
||||||
publicly-available metadata and links out to the source. Operators are
|
publicly-available metadata and links out to the source. Users are
|
||||||
responsible for complying with local law. See README » Disclaimer.
|
responsible for complying with local law.
|
||||||
</p>
|
</p>
|
||||||
</footer>
|
</footer>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ import { ClientProvider } from './src/ClientContext';
|
||||||
import { ErrorBoundary } from './src/ErrorBoundary';
|
import { ErrorBoundary } from './src/ErrorBoundary';
|
||||||
import { isAccepted as isAgeGateAccepted } from './src/lib/agegate';
|
import { isAccepted as isAgeGateAccepted } from './src/lib/agegate';
|
||||||
import { APP_VERSION } from './src/lib/appVersion';
|
import { APP_VERSION } from './src/lib/appVersion';
|
||||||
|
import { DEFAULT_API_KEY, DEFAULT_BACKEND_URL } from './src/lib/backend';
|
||||||
import { getSettings as getLockSettings } from './src/lib/applock';
|
import { getSettings as getLockSettings } from './src/lib/applock';
|
||||||
import { AppNavigator } from './src/navigation';
|
import { AppNavigator } from './src/navigation';
|
||||||
import { AgeGateScreen } from './src/screens/AgeGateScreen';
|
import { AgeGateScreen } from './src/screens/AgeGateScreen';
|
||||||
|
|
@ -89,7 +90,13 @@ export default function App() {
|
||||||
const accepted = await isAgeGateAccepted();
|
const accepted = await isAgeGateAccepted();
|
||||||
setAgeAccepted(accepted);
|
setAgeAccepted(accepted);
|
||||||
const creds = await loadCredentials();
|
const creds = await loadCredentials();
|
||||||
if (creds) setClient(new GoonClient(creds.baseUrl, creds.apiKey));
|
if (creds) {
|
||||||
|
setClient(new GoonClient(creds.baseUrl, creds.apiKey));
|
||||||
|
} else {
|
||||||
|
// No stored credentials → auto-connect to the public instance.
|
||||||
|
// LoginScreen only appears after an explicit "Sign out".
|
||||||
|
setClient(new GoonClient(DEFAULT_BACKEND_URL, DEFAULT_API_KEY));
|
||||||
|
}
|
||||||
const lockSettings = await getLockSettings();
|
const lockSettings = await getLockSettings();
|
||||||
if (lockSettings.enabled && lockSettings.hasPin) {
|
if (lockSettings.enabled && lockSettings.hasPin) {
|
||||||
setLocked(true);
|
setLocked(true);
|
||||||
|
|
|
||||||
|
|
@ -93,8 +93,8 @@ android {
|
||||||
applicationId 'com.goon.mobile'
|
applicationId 'com.goon.mobile'
|
||||||
minSdkVersion rootProject.ext.minSdkVersion
|
minSdkVersion rootProject.ext.minSdkVersion
|
||||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||||
versionCode 6
|
versionCode 9
|
||||||
versionName "0.1.6"
|
versionName "0.1.9"
|
||||||
}
|
}
|
||||||
signingConfigs {
|
signingConfigs {
|
||||||
debug {
|
debug {
|
||||||
|
|
|
||||||
|
|
@ -16,15 +16,15 @@
|
||||||
</queries>
|
</queries>
|
||||||
<application android:name=".MainApplication" android:label="@string/app_name" android:icon="@mipmap/ic_launcher" android:roundIcon="@mipmap/ic_launcher_round" android:allowBackup="true" android:theme="@style/AppTheme" android:supportsRtl="true" android:usesCleartextTraffic="false" android:networkSecurityConfig="@xml/network_security_config">
|
<application android:name=".MainApplication" android:label="@string/app_name" android:icon="@mipmap/ic_launcher" android:roundIcon="@mipmap/ic_launcher_round" android:allowBackup="true" android:theme="@style/AppTheme" android:supportsRtl="true" android:usesCleartextTraffic="false" android:networkSecurityConfig="@xml/network_security_config">
|
||||||
<!--
|
<!--
|
||||||
Expo Updates is disabled by default in the public source tree. To enable
|
Expo Updates — ENABLED 2026-05-22 dla public release. Manifest serwowany
|
||||||
OTA updates for your fork, flip ENABLED to "true" and point EXPO_UPDATE_URL
|
przez backend `/expo-updates/manifest` (api.goon-foss.org). Nowe JS-only
|
||||||
at your backend's `/expo-updates/manifest` endpoint. See README "Quick start"
|
fixy idą OTA bez rebuilda APK; native change wymaga bumpa runtimeVersion
|
||||||
for the server-side setup.
|
+ nowego APK przez PackageInstaller.
|
||||||
-->
|
-->
|
||||||
<meta-data android:name="expo.modules.updates.ENABLED" android:value="false"/>
|
<meta-data android:name="expo.modules.updates.ENABLED" android:value="true"/>
|
||||||
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_CHECK_ON_LAUNCH" android:value="ALWAYS"/>
|
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_CHECK_ON_LAUNCH" android:value="ALWAYS"/>
|
||||||
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_LAUNCH_WAIT_MS" android:value="0"/>
|
<meta-data android:name="expo.modules.updates.EXPO_UPDATES_LAUNCH_WAIT_MS" android:value="0"/>
|
||||||
<meta-data android:name="expo.modules.updates.EXPO_UPDATE_URL" android:value="https://invalid.example.invalid/expo-updates/manifest"/>
|
<meta-data android:name="expo.modules.updates.EXPO_UPDATE_URL" android:value="https://api.goon-foss.org/expo-updates/manifest"/>
|
||||||
<meta-data android:name="expo.modules.updates.EXPO_RUNTIME_VERSION" android:value="1.0"/>
|
<meta-data android:name="expo.modules.updates.EXPO_RUNTIME_VERSION" android:value="1.0"/>
|
||||||
<activity android:name=".MainActivity" android:configChanges="keyboard|keyboardHidden|orientation|screenSize|screenLayout|uiMode" android:launchMode="singleTask" android:windowSoftInputMode="adjustResize" android:theme="@style/Theme.App.SplashScreen" android:exported="true" android:screenOrientation="portrait">
|
<activity android:name=".MainActivity" android:configChanges="keyboard|keyboardHidden|orientation|screenSize|screenLayout|uiMode" android:launchMode="singleTask" android:windowSoftInputMode="adjustResize" android:theme="@style/Theme.App.SplashScreen" android:exported="true" android:screenOrientation="portrait">
|
||||||
<intent-filter>
|
<intent-filter>
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,14 @@
|
||||||
"expo": {
|
"expo": {
|
||||||
"name": "goon",
|
"name": "goon",
|
||||||
"slug": "goon",
|
"slug": "goon",
|
||||||
"version": "0.1.8",
|
"version": "0.1.9",
|
||||||
"orientation": "portrait",
|
"orientation": "portrait",
|
||||||
"userInterfaceStyle": "automatic",
|
"userInterfaceStyle": "automatic",
|
||||||
"newArchEnabled": false,
|
"newArchEnabled": false,
|
||||||
"runtimeVersion": "1.0",
|
"runtimeVersion": "1.0",
|
||||||
"updates": {
|
"updates": {
|
||||||
"enabled": false,
|
"enabled": true,
|
||||||
"url": "https://invalid.example.invalid/expo-updates/manifest",
|
"url": "https://api.goon-foss.org/expo-updates/manifest",
|
||||||
"checkAutomatically": "ON_LOAD",
|
"checkAutomatically": "ON_LOAD",
|
||||||
"fallbackToCacheTimeout": 0
|
"fallbackToCacheTimeout": 0
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -17,4 +17,4 @@ import Constants from 'expo-constants';
|
||||||
* też nie idzie do góry, więc consistency jest zachowana.
|
* też nie idzie do góry, więc consistency jest zachowana.
|
||||||
*/
|
*/
|
||||||
export const APP_VERSION: string =
|
export const APP_VERSION: string =
|
||||||
(Constants.expoConfig?.version as string | undefined) || '0.1.8';
|
(Constants.expoConfig?.version as string | undefined) || '0.1.9';
|
||||||
|
|
|
||||||
12
mobile/src/lib/backend.ts
Normal file
12
mobile/src/lib/backend.ts
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
// Default public instance. A fresh install with no stored credentials
|
||||||
|
// auto-connects here, so the app works out-of-the-box without a login step.
|
||||||
|
//
|
||||||
|
// Power users who want their own self-hosted backend can still override:
|
||||||
|
// after "Sign out" the login screen lets them enter a different URL + key.
|
||||||
|
//
|
||||||
|
// The API key below is intentionally shipped in the APK. It is a coarse
|
||||||
|
// bot/scraper filter, not a secret — anyone can decompile the APK to read it.
|
||||||
|
// If it gets abused, rotate it: append a new key to API_KEYS on the server,
|
||||||
|
// ship an APK update, then drop the old key.
|
||||||
|
export const DEFAULT_BACKEND_URL = 'https://api.goon-foss.org';
|
||||||
|
export const DEFAULT_API_KEY = 'W20ggQgYjH_evCZCSBTWJsGgLMaJQP_7';
|
||||||
|
|
@ -754,6 +754,40 @@ const INJECTED_JS = `
|
||||||
};
|
};
|
||||||
} catch (e) {}
|
} catch (e) {}
|
||||||
|
|
||||||
|
// -- 1.5. Cookie/consent auto-dismiss --------------------------------------
|
||||||
|
// Tube'y typu hqporner mają cookie-consent gate ("Allow All / Allow Essential
|
||||||
|
// Only") który blokuje kt_player JS — player nie inicjalizuje się dopóki user
|
||||||
|
// nie kliknie. INJECTED_JS scrape \`<source>.src\` odpala się więc za wcześnie
|
||||||
|
// (DOM nie ma jeszcze video). Auto-klikamy consent żeby odblokować player.
|
||||||
|
//
|
||||||
|
// Bezpieczeństwo: klikamy TYLKO element którego tekst pasuje do consent-frazy
|
||||||
|
// ORAZ leży w kontenerze z markerem cookie/consent/gdpr (≤6 przodków). To
|
||||||
|
// wyklucza przypadkowy klik w reklamę "Continue to site".
|
||||||
|
// Button labels accepted as a consent click. Anchored at both ends, so longer
// text that merely contains "ok" or "continue" is rejected. The Polish entries
// end in .* because those labels are inflected ("Akceptuję", "Zgadzam się",
// "Wyrażam zgodę"); without the trailing .* the last alternative could never
// match a real label.
const CONSENT_TEXT_RE = /^(allow all|accept all|accept|accept & continue|accept and continue|i accept|i agree|agree|agree & continue|got it|enable all|consent|continue|ok|akceptuj.*|zgadzam.*|zgoda|rozumiem|wyra(z|ż)am zgod.*)$/i;
// Markers searched for in the class/id of an element or its ancestors.
const CONSENT_CTX_RE = /(cookie|consent|gdpr|privacy|cmp|onetrust|didomi|cookiebar|cookie-?notice)/i;
// Click every consent button that sits inside a cookie/consent/gdpr container
// and notify React Native with a 'consent_dismissed' message per click.
// Safety: an element is clicked only when its text matches CONSENT_TEXT_RE AND
// the element or one of its first 6 ancestors carries a CONSENT_CTX_RE marker,
// which keeps us from clicking an ad that says "Continue to site".
const dismissConsent = function() {
  const els = document.querySelectorAll('button, a, [role="button"], div[onclick], span[onclick], input[type="button"], input[type="submit"]');
  for (let i = 0; i < els.length; i++) {
    const el = els[i];
    const txt = ((el.textContent || el.value || '') + '').trim();
    // Real consent buttons have short labels; skip empty and long text.
    if (!txt || txt.length > 32) continue;
    if (!CONSENT_TEXT_RE.test(txt)) continue;
    // Context check: walk the element and up to 6 ancestors.
    let ctx = el, depth = 0, inCtx = false;
    while (ctx && depth < 7) {
      const cn = ctx.className;
      // className is not always a string; fall back to its baseVal when present.
      const sig = ((typeof cn === 'string' ? cn : (cn && cn.baseVal) || '') + ' ' + (ctx.id || '')).toLowerCase();
      if (CONSENT_CTX_RE.test(sig)) { inCtx = true; break; }
      ctx = ctx.parentElement; depth++;
    }
    if (!inCtx) continue;
    try {
      el.click();
      window.ReactNativeWebView.postMessage(JSON.stringify({type: 'consent_dismissed'}));
    } catch (e) {}
  }
};
|
||||||
|
|
||||||
// Niektóre hostery wstrzykują full-screen <iframe> jako ad — usuwamy periodically.
|
// Niektóre hostery wstrzykują full-screen <iframe> jako ad — usuwamy periodically.
|
||||||
// Plus iframe-ad już istniejące przed naszym patchowaniem (race condition).
|
// Plus iframe-ad już istniejące przed naszym patchowaniem (race condition).
|
||||||
const removeAdIframes = function() {
|
const removeAdIframes = function() {
|
||||||
|
|
@ -778,7 +812,13 @@ const INJECTED_JS = `
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
setInterval(removeAdIframes, 1000);
|
setInterval(function() {
|
||||||
|
removeAdIframes();
|
||||||
|
dismissConsent();
|
||||||
|
}, 1000);
|
||||||
|
// Pierwsza próba consent natychmiast (banner bywa w SSR HTML) — bez czekania
|
||||||
|
// na pierwszy tick interwału.
|
||||||
|
dismissConsent();
|
||||||
|
|
||||||
// -- 2. Auto-extract m3u8/mp4 -----------------------------------------------
|
// -- 2. Auto-extract m3u8/mp4 -----------------------------------------------
|
||||||
const VIDEO_RE = /https?:\\/\\/[^"'\\s<>]+\\.(?:m3u8|mp4|mpd)(?:\\?[^"'\\s<>]*)?/i;
|
const VIDEO_RE = /https?:\\/\\/[^"'\\s<>]+\\.(?:m3u8|mp4|mpd)(?:\\?[^"'\\s<>]*)?/i;
|
||||||
|
|
@ -821,9 +861,12 @@ const INJECTED_JS = `
|
||||||
} catch (e) {}
|
} catch (e) {}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// Jeśli mamy video URL i video się odpaliło, możemy zatrzymać polling
|
// Jeśli mamy video URL i video się odpaliło, możemy zatrzymać polling.
|
||||||
if (seen.size > 0 && ticks > 5) clearInterval(interval);
|
// Próg podniesiony 5→15: po auto-dismiss cookie consent kt_player (hqporner)
|
||||||
if (ticks > 60) clearInterval(interval);
|
// potrzebuje kilku sekund na init — zbyt wczesny stop łapał tylko preroll-ad
|
||||||
|
// URL zanim pojawił się prawdziwy <source>. 15 ticków = ~15s retry window.
|
||||||
|
if (seen.size > 0 && ticks > 15) clearInterval(interval);
|
||||||
|
if (ticks > 90) clearInterval(interval);
|
||||||
}, 1000);
|
}, 1000);
|
||||||
|
|
||||||
true;
|
true;
|
||||||
|
|
|
||||||
50
scripts/check_all_hosters.py
Normal file
50
scripts/check_all_hosters.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
"""Per-origin extractor check: dla 1 sample sceny z każdego tube origin,
|
||||||
|
wywołaj try_extract i sklasyfikuj wynik (direct mp4/m3u8 vs WebView hoster vs fail).
|
||||||
|
Uruchamiać na VPS: docker compose exec -T api python scripts/check_all_hosters.py
|
||||||
|
"""
|
||||||
|
from app.db import SessionLocal
|
||||||
|
from sqlalchemy import text
|
||||||
|
from app.extractors import try_extract
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print one extractor verdict per live ``tube:*`` origin.

    Selects a single playback source per origin (``DISTINCT ON``, ordered by
    the scene's ``created_at`` descending), runs ``try_extract`` on its page
    URL and prints a table row classifying the first returned source as
    WEBVIEW, DIRECT or OTHER. Extraction errors and empty results are reported
    as ERROR / FAIL rows instead of aborting the run.
    """
    with SessionLocal() as s:
        rows = s.execute(text("""
            SELECT DISTINCT ON (ps.origin)
                ps.origin, ps.page_url, sc.title
            FROM playback_sources ps
            JOIN scenes sc ON sc.id = ps.scene_id
            WHERE ps.dead_at IS NULL AND ps.origin LIKE 'tube:%'
              AND ps.page_url IS NOT NULL
            ORDER BY ps.origin, sc.created_at DESC
        """)).all()

    print(f"{'origin':<26} {'result':<48} verdict")
    print("-" * 95)
    for r in rows:
        sitetag = r.origin.replace("tube:", "")
        try:
            sources = try_extract(sitetag, r.page_url)
        except Exception as e:
            # Best-effort audit: one broken extractor must not stop the others.
            print(f"{r.origin:<26} EXC: {str(e)[:42]:<48} ERROR")
            continue
        if not sources:
            print(f"{r.origin:<26} {'None (no sources)':<48} FAIL")
            continue
        # Classify by the type of the first source only.
        first = sources[0]
        t = getattr(first, "type", "?")
        link = (getattr(first, "link", "") or "")[:40]
        if t == "hoster":
            verdict = "WEBVIEW (page → ad risk)"
        elif t in ("mp4", "m3u8", "hls", "mpd"):
            verdict = "DIRECT (native ExoPlayer)"
        else:
            verdict = f"OTHER({t})"
        n = len(sources)
        print(f"{r.origin:<26} {f'{t} x{n} {link}':<48} {verdict}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
29
scripts/check_series_detector.py
Normal file
29
scripts/check_series_detector.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""Quick sanity check series-mismatch detector na realnych pendingach z bazy."""
|
||||||
|
from app.resolve.scoring import (
|
||||||
|
detect_modifier_tags,
|
||||||
|
detect_series_positions,
|
||||||
|
series_mismatch_strength,
|
||||||
|
)
|
||||||
|
|
||||||
|
cases = [
|
||||||
|
("pleasureville a dp xxx parody episode 2", "pleasureville a dp xxx parody episode 4", "Episode 2/4"),
|
||||||
|
("make em sweat #7", "make em sweat #7 bts", "BTS asymmetric"),
|
||||||
|
("training ravyn", "training ravyn (bts - 1)", "BTS asymmetric"),
|
||||||
|
("women seeking women volume 140 scene 3", "women seeking women volume 140 scene 4", "Vol same scene diff"),
|
||||||
|
("women seeking women #131 scene 2", "women seeking women volume 139 scene 1", "Multi num"),
|
||||||
|
("bad bella stinky feet preparation 1080p", "bad bella stinky feet preparation (unedited) 1080p", "Unedited"),
|
||||||
|
("alexis fawx step son becomes a man part 1", "alexis fawx step son becomes a man part 2", "Part 1/2"),
|
||||||
|
("neon moonlight pt. 1", "neon moonlight pt. 2", "Pt 1/2"),
|
||||||
|
("internet outage poundage", "internet outage poundage alexis fawx", "Same scene"),
|
||||||
|
("the great heist", "the great heist", "Identical"),
|
||||||
|
("training ravyn", "training ravyn", "Identical"),
|
||||||
|
("slut hunt ep.6 ravyn", "slut hunt ep.6 ravyn full", "Same Episode 6"),
|
||||||
|
]
|
||||||
|
# Header row, then one line per case: mismatch strength followed by the
# detected series positions and modifier tags of both titles ("{}" when empty).
print(f'{"strength":>8} case')
for left, right, label in cases:
    strength = series_mismatch_strength(left, right)
    left_pos = detect_series_positions(left)
    right_pos = detect_series_positions(right)
    left_mod = detect_modifier_tags(left)
    right_mod = detect_modifier_tags(right)
    print(f'{strength:>8.2f} {label:25s} pos={left_pos or "{}"} vs {right_pos or "{}"} mod={left_mod or "{}"} vs {right_mod or "{}"}')
|
||||||
92
scripts/goon_debug_proxy.py
Normal file
92
scripts/goon_debug_proxy.py
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
"""Debug reverse-proxy: http://0.0.0.0:8099 → https://api.goon-foss.org
|
||||||
|
|
||||||
|
Emulator app (via http://10.0.2.2:8099, cleartext dozwolony w NSC dla 10.0.2.2)
|
||||||
|
uderza ten proxy → forward do prawdziwego backendu. Loguje każdy request:
|
||||||
|
method, path, headers (X-API-Key, X-App-Signature), response status.
|
||||||
|
|
||||||
|
Cel: zdiagnozować czy app fetch w ogóle działa + jakie headers wysyła.
|
||||||
|
"""
|
||||||
|
import http.server
|
||||||
|
import socketserver
|
||||||
|
import ssl
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
|
||||||
|
UPSTREAM = "https://api.goon-foss.org"
|
||||||
|
PORT = 8099
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyHandler(http.server.BaseHTTPRequestHandler):
    """Reverse-proxy handler that forwards each request to ``UPSTREAM`` and logs it.

    For every request it prints the method, the path and a fixed set of
    request headers (``X-API-Key`` and ``X-App-Signature`` are truncated),
    replays the request against ``UPSTREAM`` and relays the upstream status,
    headers and body back to the client.
    """

    protocol_version = "HTTP/1.1"

    @staticmethod
    def _error_body(exc):
        """Return the JSON body (bytes) of the 502 reply for *exc*.

        Built with ``json.dumps`` so that quotes or backslashes in the
        exception message cannot produce invalid JSON.
        """
        import json  # local import: the module does not import json at top level

        return json.dumps(
            {"detail": f"proxy error: {exc}"}, separators=(",", ":")
        ).encode()

    def _proxy(self, method):
        """Forward the current request upstream using *method* and relay the reply."""
        body_len = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(body_len) if body_len else None

        print(f"\n>>> {method} {self.path}")
        for h in ("X-API-Key", "X-App-Signature", "Authorization", "User-Agent", "Accept", "Content-Type"):
            if h in self.headers:
                val = self.headers[h]
                if h == "X-App-Signature":
                    print(f" {h}: {val[:20]}...{val[-8:]} (len={len(val)})")
                elif h == "X-API-Key":
                    print(f" {h}: {val[:8]}... (len={len(val)})")
                else:
                    print(f" {h}: {val}")

        url = UPSTREAM + self.path
        req = urllib.request.Request(url, data=body, method=method)
        for k, v in self.headers.items():
            # These headers are not copied; urllib supplies its own where needed.
            if k.lower() not in ("host", "content-length", "connection", "accept-encoding"):
                req.add_header(k, v)

        ctx = ssl.create_default_context()
        try:
            with urllib.request.urlopen(req, context=ctx, timeout=30) as resp:
                data = resp.read()
                print(f"<<< {resp.status} ({len(data)} bytes)")
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # The body is re-sent as one block with its own Content-Length.
                    if k.lower() not in ("transfer-encoding", "connection", "content-encoding", "content-length"):
                        self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except urllib.error.HTTPError as e:
            # Upstream answered with an error status: relay its status and body.
            data = e.read()
            print(f"<<< HTTP {e.code} ({len(data)} bytes): {data[:200]}")
            self.send_response(e.code)
            self.send_header("Content-Type", e.headers.get("Content-Type", "application/json"))
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception as e:
            # Anything else (DNS, TLS, timeout, ...) is reported as 502.
            print(f"<<< PROXY ERROR: {type(e).__name__}: {e}")
            msg = self._error_body(e)
            self.send_response(502)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(msg)))
            self.end_headers()
            self.wfile.write(msg)

    def do_GET(self):
        self._proxy("GET")

    def do_POST(self):
        self._proxy("POST")

    def do_PUT(self):
        self._proxy("PUT")

    def do_PATCH(self):
        self._proxy("PATCH")

    def do_DELETE(self):
        self._proxy("DELETE")

    def log_message(self, *args):
        pass  # silence default logging
|
||||||
|
|
||||||
|
|
||||||
|
class ThreadingServer(socketserver.ThreadingMixIn, http.server.HTTPServer):
    """HTTP server that handles each request in its own thread (``ThreadingMixIn``)."""

    # Handler threads are daemon threads, so they do not keep the process
    # alive when the script is interrupted.
    daemon_threads = True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print(f"debug proxy: http://0.0.0.0:{PORT} -> {UPSTREAM}")
|
||||||
|
print(f"emulator app should point to http://10.0.2.2:{PORT}")
|
||||||
|
ThreadingServer(("0.0.0.0", PORT), ProxyHandler).serve_forever()
|
||||||
59
scripts/test_porndoe_scraper.py
Normal file
59
scripts/test_porndoe_scraper.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
"""Smoke test PornDoeScraper — fetch sample + sprawdz parsing."""
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers.porndoe import PornDoeScraper
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Scrape one listing page with ``PornDoeScraper`` and print a field-coverage report.

    Prints the details of the first 5 scenes, stops after 15 scenes, and ends
    with how many scenes had a studio, performers, release date, duration,
    thumbnail and fingerprint.
    """
    scraper = PornDoeScraper()
    print(f"sitetag: {scraper.sitetag}")
    print(f"listing url p1: {scraper._listing_url(1)}")
    print(f"listing url p2: {scraper._listing_url(2)}")
    print()

    count = 0
    ok_studio = ok_perf = ok_date = ok_dur = ok_thumb = ok_phash = 0
    for scene in scraper.latest_scenes(max_pages=1):
        count += 1
        # Thumbnail of the first playback source, or None when the scene has
        # no sources; computed once so the detail print below cannot raise
        # IndexError on an empty source list.
        thumb = scene.playback_sources[0].thumbnail_url if scene.playback_sources else None
        if scene.studio:
            ok_studio += 1
        if scene.performers:
            ok_perf += 1
        if scene.release_date:
            ok_date += 1
        if scene.duration_sec:
            ok_dur += 1
        if thumb:
            ok_thumb += 1
        if scene.fingerprints:
            ok_phash += 1
        if count <= 5:
            print(f"--- scene {count} ---")
            print(f" ext_id: {scene.external_id}")
            print(f" title: {scene.title[:60]}")
            print(f" studio: {scene.studio.name if scene.studio else None}")
            print(f" perf: {[p.name for p in scene.performers]}")
            print(f" date: {scene.release_date}")
            print(f" duration: {scene.duration_sec}s")
            print(f" tags: {[t.name for t in scene.tags][:5]}")
            print(f" thumb: {(thumb or '')[:70]}")
            print(f" phash: {[f.value for f in scene.fingerprints]}")
            print()
        if count >= 15:
            break

    print("=" * 50)
    print(f"total scraped: {count}")
    if count:
        print(f" studio: {ok_studio}/{count} ({100*ok_studio//count}%)")
        print(f" performer: {ok_perf}/{count} ({100*ok_perf//count}%)")
        print(f" date: {ok_date}/{count} ({100*ok_date//count}%)")
        print(f" duration: {ok_dur}/{count} ({100*ok_dur//count}%)")
        print(f" thumbnail: {ok_thumb}/{count} ({100*ok_thumb//count}%)")
        print(f" phash: {ok_phash}/{count} ({100*ok_phash//count}%)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
29
scripts/theporndude_coverage_check.py
Normal file
29
scripts/theporndude_coverage_check.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""Coverage check: ile tube'ów z theporndude.com mamy już w bazie."""
|
||||||
|
from app.db import SessionLocal
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print per-origin playback-source counts (total / live / dead), grouped by origin kind.

    The kind is the part of the origin before the first ':'; origins without
    a ':' are grouped under "other". Within a kind, origins are listed by
    live count, highest first.
    """
    with SessionLocal() as s:
        # All distinct origins (canonical + tube: + pornapp:)
        query = text("""
            SELECT origin, COUNT(*) AS n,
                   COUNT(*) FILTER (WHERE dead_at IS NULL) AS live,
                   COUNT(*) FILTER (WHERE dead_at IS NOT NULL) AS dead
            FROM playback_sources
            GROUP BY origin
            ORDER BY origin
        """)
        rows = s.execute(query).all()
        print(f"distinct origins: {len(rows)}")

        by_kind = {}
        for row in rows:
            head, sep, _rest = row.origin.partition(":")
            kind = head if sep else "other"
            if kind not in by_kind:
                by_kind[kind] = []
            by_kind[kind].append((row.origin, row.n, row.live, row.dead))

        for kind, items in by_kind.items():
            print(f"\n=== {kind} ({len(items)} origins) ===")
            ranked = sorted(items, key=lambda entry: -entry[2])
            for origin, n, live, dead in ranked:
                print(f" {origin:<35} n={n:>7,} live={live:>7,} dead={dead:>5,}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
118
scripts/theporndude_coverage_match.py
Normal file
118
scripts/theporndude_coverage_match.py
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
"""Cross-check 166 resolved theporndude domains vs nasze 25 tube origins."""
|
||||||
|
import json
import re
from pathlib import Path
|
||||||
|
|
||||||
|
# Origins z DB (live + dead) + extractor REGISTRY w app/extractors/__init__.py
|
||||||
|
OUR_ORIGINS = [
|
||||||
|
# DB live + dead
|
||||||
|
"tube:0dayxxcom", "tube:epornercom", "tube:fpoxxx", "tube:freshpornoorg",
|
||||||
|
"tube:hqpornercom", "tube:latestpornvideocom", "tube:mypornerleakcom",
|
||||||
|
"tube:perverzijacom", "tube:porn00org", "tube:porndishcom", "tube:porndittcom",
|
||||||
|
"tube:pornhatcom", "tube:pornhubcom", "tube:porntrexcom", "tube:pornxpph",
|
||||||
|
"tube:redtubecom", "tube:sxylandcom", "tube:sxyprncom", "tube:xhamstercom",
|
||||||
|
"tube:xnxxcom", "tube:xvideoscom", "tube:youporncom", "tube:latestleaksco",
|
||||||
|
"tube:siskavideo", "tube:hdporn92com",
|
||||||
|
# REGISTRY only (extractor known, brak playback w live DB)
|
||||||
|
"tube:xmoviesforyoucom", "tube:watchporn", "tube:porn4dayspw",
|
||||||
|
"tube:paradisehillcc",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Tylko realne TLD-y. NIE "tube"/"porn"/"xxx" bo to często części nazwy (redtube, pornhub, fpoxxx).
|
||||||
|
_TLD_RE = __import__("re").compile(r"(com|net|org|tv|cc|pw|co|to|ws|me|sx|info|biz)$")
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_tld(s: str) -> str:
|
||||||
|
"""xvideoscom -> xvideos; pornhubcom -> pornhub; hdporn92com -> hdporn92"""
|
||||||
|
return _TLD_RE.sub("", s)
|
||||||
|
|
||||||
|
# Build sitetag → matching variants for fuzzy match
|
||||||
|
def origin_to_sitetag(origin: str) -> str:
    """Return the sitetag of a ``tube:<sitetag>`` origin.

    Only the leading ``tube:`` prefix is removed, so a sitetag that happens
    to contain the text "tube:" is left intact. Origins without the prefix
    are returned unchanged.
    """
    return origin.removeprefix("tube:")
|
||||||
|
|
||||||
|
|
||||||
|
def domain_to_sitetag(domain: str) -> str:
    """Lower-case *domain* and drop every '.' and '-'.

    xvideos.com -> xvideoscom, porntrex.com -> porntrexcom
    """
    drop_separators = str.maketrans("", "", ".-")
    lowered = domain.lower()
    return lowered.translate(drop_separators)
|
||||||
|
|
||||||
|
|
||||||
|
def match(slug: str, domain: str) -> str | None:
    """Return the first entry of ``OUR_ORIGINS`` matching *slug* or *domain*, else None.

    *slug* is lower-cased with '-' removed; *domain* goes through
    ``domain_to_sitetag``. A candidate matches an origin when both are equal
    after ``_strip_tld`` and the stripped candidate has at least 3 characters.
    """
    raw_candidates = []
    if slug:
        raw_candidates.append(slug.lower().replace("-", ""))
    if domain:
        raw_candidates.append(domain_to_sitetag(domain))

    # TLD-stripped candidate names that are long enough to be meaningful.
    wanted = set()
    for candidate in raw_candidates:
        stripped = _strip_tld(candidate)
        if len(stripped) >= 3:
            wanted.add(stripped)

    for origin in OUR_ORIGINS:
        if _strip_tld(origin_to_sitetag(origin)) in wanted:
            return origin
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Split ``theporndude_resolved.json`` into covered / new / errored sites.

    Each record is matched against ``OUR_ORIGINS`` via ``match``; records that
    have an ``error`` and no ``real_domain`` are reported as errors. Prints a
    coverage summary with the top entries by rank and writes the full result
    to ``theporndude_coverage.json``.
    """
    data = json.loads(Path("theporndude_resolved.json").read_text(encoding="utf-8"))
    have = []
    new = []
    error = []
    for r in data:
        if "error" in r and not r.get("real_domain"):
            error.append(r)
            continue
        domain = r.get("real_domain", "")
        our = match(r.get("slug", ""), domain)
        r["our_origin"] = our
        if our:
            have.append(r)
        else:
            new.append(r)

    print("=== Coverage ===")
    print(f"Total theporndude top-porn-tubes: {len(data)}")
    print(f" Already in our DB: {len(have)}")
    print(f" NEW (potential candidates): {len(new)}")
    print(f" Errors: {len(error)}")
    print()
    print("=== Already have (matched) — top 30 by theporndude rank ===")
    for r in sorted(have, key=lambda x: x["rank"])[:30]:
        # A record can match by slug alone, so real_domain may be missing or
        # None here; fall back to '?' like the NEW section does.
        print(
            f" #{r['rank']:>3} score={r.get('theporndude_score') or '?':>4} "
            f"{r.get('real_domain') or '?':<28} -> {r['our_origin']}"
        )
    print()
    print("=== NEW candidates (not in DB) — top 60 by theporndude rank ===")
    for r in sorted(new, key=lambda x: x["rank"])[:60]:
        print(
            f" #{r['rank']:>3} score={r.get('theporndude_score') or '?':>4} "
            f"{r.get('real_domain') or '?':<30} ({r['slug']})"
        )

    # Detailed output
    summary = {
        "total": len(data),
        "already_have": [{"rank": r["rank"], "slug": r["slug"], "domain": r.get("real_domain"),
                          "score": r.get("theporndude_score"), "our_origin": r["our_origin"]}
                         for r in sorted(have, key=lambda x: x["rank"])],
        "new_candidates": [{"rank": r["rank"], "slug": r["slug"], "domain": r.get("real_domain"),
                            "score": r.get("theporndude_score"),
                            "final_url": r.get("final_url", "")}
                           for r in sorted(new, key=lambda x: x["rank"])],
        "errors": [{"rank": r["rank"], "slug": r["slug"], "error": r.get("error")}
                   for r in error],
    }
    Path("theporndude_coverage.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print("\n-> theporndude_coverage.json")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
176
scripts/theporndude_curl_triage.py
Normal file
176
scripts/theporndude_curl_triage.py
Normal file
|
|
@ -0,0 +1,176 @@
|
||||||
|
"""Batch curl triage 144 nowych theporndude tubes:
|
||||||
|
- HEAD root domain (200/4xx/5xx/timeout?)
|
||||||
|
- GET / → check landing markers: video listing, sceny, login wall, redirect
|
||||||
|
- GET /latest, /videos, /tube/recent → check które listing path działa
|
||||||
|
- Wynik: per-slug status + landing markers + scene_url_pattern guess
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
COVERAGE_FILE = Path("theporndude_coverage.json")
|
||||||
|
OUT_FILE = Path("theporndude_triage.json")
|
||||||
|
|
||||||
|
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"
|
||||||
|
|
||||||
|
# Common candidate landing paths
|
||||||
|
LANDING_PATHS = ["/", "/latest", "/latest-videos", "/recent", "/new", "/videos", "/category/new", "/top-rated"]
|
||||||
|
|
||||||
|
# Markers w HTML
|
||||||
|
SCENE_LINK_PATTERNS = [
|
||||||
|
r'<a[^>]+href="(/(?:video|videos|watch|v|scene|scenes|stream|movie|movies|view|play|porn|tube)/[^"]+)"',
|
||||||
|
r'<a[^>]+href="((?:https?:)?//[^/"]+/(?:video|videos|watch|v|scene|scenes|stream|movie|movies|view|play|porn|tube)/[^"]+)"',
|
||||||
|
]
|
||||||
|
META_MARKERS = [
|
||||||
|
(r'jsonld|json-ld|"@type"\s*:\s*"VideoObject"', "jsonld_video"),
|
||||||
|
(r'<meta\s+property="og:type"\s+content="video', "og_video"),
|
||||||
|
(r'<meta\s+name="description"\s+content="([^"]+)"', "meta_desc"),
|
||||||
|
(r'class="[^"]*\b(?:video|scene|episode)-?(?:item|card|tile|thumb)\b', "video_card"),
|
||||||
|
(r'class="[^"]*\b(?:performer|actress|model|pornstar)\b', "performer_marker"),
|
||||||
|
(r'class="[^"]*\b(?:studio|production|brand|channel)\b', "studio_marker"),
|
||||||
|
(r'class="[^"]*\b(?:duration|runtime|length)\b|<time\s+datetime=', "duration_marker"),
|
||||||
|
(r'\b(?:HLS|hls|m3u8|application/x-mpegURL)\b', "hls_marker"),
|
||||||
|
(r'(?:hlsmanifest|videoUrl|video_url|stream_url|streamUrl)\s*[:=]\s*["\']', "stream_url_marker"),
|
||||||
|
(r'login\s*required|create\s+account|sign\s+(?:in|up)|members\s+only|join\s+now\s+to\s+watch', "auth_wall"),
|
||||||
|
(r'<title>[^<]*\b(?:404|not\s+found|gone|domain)\b[^<]*</title>', "dead_404"),
|
||||||
|
(r'<meta[^>]+http-equiv="refresh"[^>]+url=', "meta_refresh"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_one(cli: httpx.AsyncClient, url: str) -> tuple[int, str]:
    """GET *url* with the module's ``UA`` header, following redirects.

    Returns ``(status_code, text)`` with the text capped at 200,000
    characters. Failures never raise; they are reported as a negative
    pseudo-status plus a short reason:

    * ``(-1, "conn_refused")`` on ``httpx.ConnectError``
    * ``(-2, "timeout")`` on ``httpx.TimeoutException``
    * ``(-9, <first 120 chars of the error>)`` on any other exception
    """
    try:
        r = await cli.get(url, headers={"User-Agent": UA}, follow_redirects=True)
        return r.status_code, r.text[:200_000]  # cap response
    except httpx.ConnectError:
        return -1, "conn_refused"
    except httpx.TimeoutException:
        return -2, "timeout"
    except Exception as e:
        # Best-effort triage: any other failure becomes a result row.
        return -9, str(e)[:120]
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_html(html: str) -> dict:
    """Scan a page for canonical-fit markers and scene-like links.

    Returns a dict that contains:

    * ``<marker name>: True`` for every ``META_MARKERS`` regex found in
      ``html`` (case-insensitive);
    * ``"scene_link_samples"``: up to three matched hrefs, each truncated to
      120 characters;
    * ``"scene_path_prefixes"``: sorted unique first path segments of the
      matched links (for example ``"/video"``).

    The two ``scene_*`` keys are present only when at least one link matched.
    At most five links are collected, in ``SCENE_LINK_PATTERNS`` order.
    """
    # Local imports: only this function needs them, and the file's import
    # block is outside the edited span.
    from itertools import islice
    from urllib.parse import urlparse

    found = {
        name: True
        for pattern, name in META_MARKERS
        if re.search(pattern, html, re.IGNORECASE)
    }

    hits = (
        m.group(1)
        for pattern in SCENE_LINK_PATTERNS
        for m in re.finditer(pattern, html, re.IGNORECASE)
    )
    scene_links = list(islice(hits, 5))
    if not scene_links:
        return found

    found["scene_link_samples"] = [link[:120] for link in scene_links[:3]]

    prefixes = set()
    for link in scene_links:
        # Take the first segment of the URL *path*. Splitting the raw href
        # turned "https://host/video/x" into "/https:" and "//host/video/x"
        # into "/host".
        try:
            path = urlparse(link).path
        except ValueError:
            # Malformed authority (e.g. a stray "["): skip this link.
            continue
        first_segment = path.lstrip("/").split("/", 1)[0]
        if first_segment:
            prefixes.add("/" + first_segment)
    found["scene_path_prefixes"] = sorted(prefixes)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
async def audit_one(cli: httpx.AsyncClient, slug: str, domain: str) -> dict:
    """Audit a single tube site by fetching ``https://<domain>/``.

    The returned dict always has ``slug`` and ``domain``. It then holds either
    ``error`` (domain failed the syntax check), ``root_status`` plus
    ``root_error`` (status not 200/301/302), or ``root_status``,
    ``root_findings``, ``heuristic_score`` and ``reasons``.
    """
    out = {"slug": slug, "domain": domain}

    # Try the https://<domain>/ root, but only for plausible host names.
    if not (domain and re.match(r"^[\w\.-]+\.\w+$", domain)):
        out["error"] = "no_valid_domain"
        return out

    status, html = await fetch_one(cli, f"https://{domain}/")
    out["root_status"] = status
    if status not in (200, 301, 302):
        out["root_error"] = html[:80] if isinstance(html, str) else None
        return out

    findings = analyze_html(html)
    out["root_findings"] = findings

    # Heuristic score. Each rule is (finding keys, delta, reason label); a rule
    # fires when any of its keys is truthy. A label of None adds score without
    # recording a reason.
    bonuses = (
        (("jsonld_video",), 1, "jsonld_video"),
        (("og_video",), 1, "og_video"),
        (("video_card",), 1, "video_card"),
        (("performer_marker",), 1, "performer_marker"),
        (("studio_marker",), 1, "studio_marker"),
        (("duration_marker",), 0.5, None),
        (("hls_marker", "stream_url_marker"), 0.5, None),
    )
    penalties = (
        ("auth_wall", 2),
        ("dead_404", 5),
        ("meta_refresh", 1),
    )

    score = 0
    reasons = []
    for keys, delta, label in bonuses:
        if any(findings.get(key) for key in keys):
            score += delta
            if label is not None:
                reasons.append(label)

    if findings.get("scene_path_prefixes"):
        score += 1
        reasons.append(f"scene_paths={findings['scene_path_prefixes']}")

    for key, penalty in penalties:
        if findings.get(key):
            score -= penalty
            reasons.append(key)

    out["heuristic_score"] = round(score, 1)
    out["reasons"] = reasons
    return out
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Audit every ``new_candidates`` entry of COVERAGE_FILE and save the results.

    Up to 12 audits run at once. Each result is the candidate record merged
    with the ``audit_one`` output. All results are written to OUT_FILE as JSON,
    then a score-bucket histogram is printed.
    """
    candidates = json.loads(COVERAGE_FILE.read_text())["new_candidates"]
    print(f"audytuję {len(candidates)} nowych kandydatów…")

    gate = asyncio.Semaphore(12)
    client_options = {
        "timeout": httpx.Timeout(15.0, connect=8.0),
        "limits": httpx.Limits(max_keepalive_connections=20, max_connections=50),
        "http2": False,
    }
    async with httpx.AsyncClient(**client_options) as cli:

        async def worker(r):
            async with gate:
                # Use the recorded domain. When it is missing or is the
                # porndudecams interstitial, guess <slug>.com instead.
                domain = r.get("domain") or ""
                if (not domain) or ("porndudecams" in domain):
                    domain = f"{r['slug'].lower()}.com"
                audit = await audit_one(cli, r["slug"], domain)
                return {**r, **audit}

        results = await asyncio.gather(*(worker(r) for r in candidates))

    OUT_FILE.write_text(json.dumps(results, indent=2))

    # Stats: histogram of heuristic scores (a missing score counts as 0).
    def bucket_for(score):
        if score >= 5:
            return "5+"
        if score >= 3:
            return "3-5"
        if score >= 1:
            return "1-3"
        return "<1"

    by_score = {}
    for r in results:
        by_score.setdefault(bucket_for(r.get("heuristic_score", 0)), []).append(r)

    print("\n=== Heurystyczny rozkład (canonical-fit) ===")
    for label in ("5+", "3-5", "1-3", "<1"):
        if label in by_score:
            print(f" {label:<5} {len(by_score[label])} tubów")
    print(f"\n-> {OUT_FILE}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
234
scripts/theporndude_movies_pipeline.py
Normal file
234
scripts/theporndude_movies_pipeline.py
Normal file
|
|
@ -0,0 +1,234 @@
|
||||||
|
"""Pełny pipeline dla theporndude /full-porn-movies-sites (94 tubes):
|
||||||
|
1. Resolve real domains (pdude.link follow, ale follow only 1 hop)
|
||||||
|
2. Coverage match vs nasze 25+ origins
|
||||||
|
3. Curl triage HTML markers
|
||||||
|
4. Per-tube scorecard
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"
|
||||||
|
|
||||||
|
OUR_ORIGINS = [
|
||||||
|
"tube:0dayxxcom", "tube:epornercom", "tube:fpoxxx", "tube:freshpornoorg",
|
||||||
|
"tube:hqpornercom", "tube:latestpornvideocom", "tube:mypornerleakcom",
|
||||||
|
"tube:perverzijacom", "tube:porn00org", "tube:porndishcom", "tube:porndittcom",
|
||||||
|
"tube:pornhatcom", "tube:pornhubcom", "tube:porntrexcom", "tube:pornxpph",
|
||||||
|
"tube:redtubecom", "tube:sxylandcom", "tube:sxyprncom", "tube:xhamstercom",
|
||||||
|
"tube:xnxxcom", "tube:xvideoscom", "tube:youporncom", "tube:latestleaksco",
|
||||||
|
"tube:siskavideo", "tube:hdporn92com",
|
||||||
|
"tube:xmoviesforyoucom", "tube:watchporn", "tube:porn4dayspw",
|
||||||
|
"tube:paradisehillcc",
|
||||||
|
]
|
||||||
|
|
||||||
|
_TLD_RE = re.compile(r"(com|net|org|tv|cc|pw|co|to|ws|me|sx|info|biz)$")
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_tld(s: str) -> str:
|
||||||
|
return _TLD_RE.sub("", s)
|
||||||
|
|
||||||
|
|
||||||
|
def match(slug: str, domain: str) -> str | None:
|
||||||
|
candidates = []
|
||||||
|
if slug:
|
||||||
|
candidates.append(slug.lower().replace("-", ""))
|
||||||
|
if domain:
|
||||||
|
candidates.append(domain.lower().replace(".", "").replace("-", ""))
|
||||||
|
for o in OUR_ORIGINS:
|
||||||
|
st = o.replace("tube:", "")
|
||||||
|
st_no_tld = _strip_tld(st)
|
||||||
|
for c in candidates:
|
||||||
|
c_no_tld = _strip_tld(c)
|
||||||
|
if c_no_tld == st_no_tld and len(c_no_tld) >= 3:
|
||||||
|
return o
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Captures (group 1) the href of an <a> tag that contains a scene-like path
# segment. The href may be root-relative ("/video/x"), protocol-relative
# ("//host/video/x") or absolute ("https://host/en/video/x").
# The earlier pattern required a slash before the filler and another before
# the keyword, so the commonest form -- href="/video/..." -- never matched.
SCENE_PATH_RE = re.compile(
    r'<a[^>]+href="((?:(?:https?:)?//[^/"]+)?(?:/[^"]*?)?/(?:video|videos|watch|v|scene|movie|movies|play|view|stream)/[^"]+)"',
    re.IGNORECASE,
)

# (regex, finding name) pairs; analyze_html() searches each one
# case-insensitively and records the name on a match.
META_MARKERS = [
    (r'"@type"\s*:\s*"VideoObject"', "jsonld_video"),
    (r'<meta\s+property="og:type"\s+content="video', "og_video"),
    (r'class="[^"]*\b(?:video|scene|movie|episode)-?(?:item|card|tile|thumb|block)\b', "video_card"),
    (r'class="[^"]*\b(?:performer|actress|model|pornstar|cast)\b|href="[^"]*/pornstar', "performer_marker"),
    (r'class="[^"]*\b(?:studio|production|brand|channel|network)\b|href="[^"]*/studio', "studio_marker"),
    (r'class="[^"]*\b(?:duration|runtime|length)\b|itemprop="duration"', "duration_marker"),
    (r'\b(?:HLS|m3u8|application/x-mpegURL)\b', "hls_marker"),
    (r'(?:videoUrl|video_url|stream_url|streamUrl)\s*[:=]\s*["\']', "stream_url_marker"),
    (r'(?:login\s+required|create\s+account|members\s+only|join\s+now)', "auth_wall"),
    (r'<title>[^<]*\b(?:404|not\s+found|domain\s+for\s+sale|gone)\b', "dead_404"),
]
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_one(cli: httpx.AsyncClient, url: str, *, max_redirects: int = 5) -> tuple[int, str, str]:
    """GET ``url``, following redirects by hand, and report where it led.

    Up to ``max_redirects`` 3xx hops are followed. The loop does not stop at
    the first off-site hop; it only remembers the first redirect host that is
    neither ``pdude.link`` nor ``theporndude.com``.

    Returns ``(status, text, domain)``: the status code of the last response,
    at most the first 200 000 characters of its text, and the first external
    host seen (or the host of the last URL requested), without a leading
    ``www.``. Failures never raise; they return ``(-1, "conn_refused", "")``,
    ``(-2, "timeout", "")`` or ``(-9, <message[:120]>, "")``.
    """
    # Local import: the file-level import only brings in urlparse.
    from urllib.parse import urljoin

    redirect_codes = (301, 302, 303, 307, 308)
    headers = {"User-Agent": UA}
    try:
        cur = await cli.get(url, headers=headers, follow_redirects=False)
        cur_url = url
        hops = 0
        first_external_domain = None
        while cur.status_code in redirect_codes and hops < max_redirects:
            loc = cur.headers.get("location")
            if not loc:
                break
            # Location may be any relative reference. urljoin handles
            # "/path", "//host/path" and "page.html"; the previous
            # startswith("/") special case mangled the last two.
            cur_url = urljoin(cur_url, loc)
            hops += 1
            # Track the first external (non-pdude, non-theporndude) host.
            host = urlparse(cur_url).hostname or ""
            if (
                first_external_domain is None
                and not host.endswith("pdude.link")
                and not host.endswith("theporndude.com")
            ):
                # removeprefix, not replace: only a leading "www." is noise.
                first_external_domain = host.removeprefix("www.")
            cur = await cli.get(cur_url, headers=headers, follow_redirects=False)
        final_host = (urlparse(cur_url).hostname or "").removeprefix("www.")
        return cur.status_code, cur.text[:200_000], first_external_domain or final_host
    except httpx.ConnectError:
        return -1, "conn_refused", ""
    except httpx.TimeoutException:
        return -2, "timeout", ""
    except Exception as e:
        return -9, str(e)[:120], ""
|
||||||
|
|
||||||
|
|
||||||
|
async def resolve_domain(cli: httpx.AsyncClient, slug: str) -> str:
    """Resolve a slug to its site host via the first ``pdude.link`` redirect.

    Requests ``https://pdude.link/<slug>`` without following redirects and
    returns the host of the ``Location`` header, minus a leading ``www.``.
    Returns ``""`` when there is no ``Location``, when the target is a known
    affiliate or interstitial host, or when the request fails for any reason.
    The caller then falls back to guessing ``<slug>.com``.
    """
    affiliate_markers = ("anexo.link", "awejmp.com", "porndudecams")
    try:
        r = await cli.get(
            f"https://pdude.link/{slug}",
            headers={"User-Agent": UA},
            follow_redirects=False,
        )
        loc = r.headers.get("location", "")
        if loc:
            # Strip only a leading "www."; replace() would also eat a "www."
            # that appears in the middle of the host name.
            host = (urlparse(loc).hostname or "").removeprefix("www.")
            if any(marker in host for marker in affiliate_markers):
                # Affiliate redirect: the real domain is unknown.
                return ""
            return host
    except Exception:
        # Deliberately best-effort: any failure means "unresolved".
        pass
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_html(html: str) -> dict:
    """Scan a page for canonical-fit markers and scene-like links.

    Returns a dict that contains:

    * ``<marker name>: True`` for every ``META_MARKERS`` regex found in
      ``html`` (case-insensitive);
    * ``"scene_path_prefixes"``: sorted unique first path segments of the
      links matched by ``SCENE_PATH_RE``, present only when non-empty;
    * ``"scene_link_samples"``: up to three matched hrefs, each truncated to
      100 characters, present only when at least one link matched.

    At most five links are inspected.
    """
    found = {
        name: True
        for pattern, name in META_MARKERS
        if re.search(pattern, html, re.IGNORECASE)
    }

    prefixes = set()
    sample = []
    for m in SCENE_PATH_RE.finditer(html):
        link = m.group(1)
        sample.append(link[:100])
        # Take the first segment of the URL path for every link form. The old
        # code handled only "/..." and "//host/...", so "https://host/video/x"
        # links were sampled but never produced a prefix.
        try:
            path = urlparse(link).path
        except ValueError:
            path = ""  # malformed authority: no usable prefix
        first_segment = path.lstrip("/").split("/", 1)[0]
        if first_segment:
            prefixes.add("/" + first_segment)
        if len(sample) >= 5:
            break

    if prefixes:
        found["scene_path_prefixes"] = sorted(prefixes)
    if sample:
        found["scene_link_samples"] = sample[:3]
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def score_findings(f: dict) -> tuple[float, list]:
    """Turn ``analyze_html`` findings into ``(score, reasons)``.

    Positive markers add to the score and negative ones (``auth_wall``,
    ``dead_404``) subtract. ``reasons`` lists the contributing markers in
    evaluation order. The HLS / stream-URL bonus adds score without a reason
    entry. The score is rounded to one decimal place.
    """
    bonuses = (
        ("jsonld_video", 1.5),
        ("og_video", 0.5),
        ("video_card", 1),
        ("performer_marker", 1),
        ("studio_marker", 1),
        ("duration_marker", 0.5),
    )
    penalties = (
        ("auth_wall", 2),
        ("dead_404", 5),
    )

    score = 0.0
    reasons = []

    for key, weight in bonuses:
        if f.get(key):
            score += weight
            reasons.append(key)

    # Either streaming hint is worth the same half point, counted once.
    if f.get("hls_marker") or f.get("stream_url_marker"):
        score += 0.5

    if f.get("scene_path_prefixes"):
        score += 1
        reasons.append(f"paths={f['scene_path_prefixes']}")

    for key, penalty in penalties:
        if f.get(key):
            score -= penalty
            reasons.append(key)

    return round(score, 1), reasons
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Audit every tube in ``theporndude_movies.json`` and print a scorecard.

    For each entry: resolve the real domain (falling back to ``<slug>.com``),
    fetch the site root, score the HTML findings and check whether the site is
    already one of our origins. Up to 12 sites are processed at once. All
    per-site results are written to ``theporndude_movies_scorecard.json``.

    The printed summary splits sites we do not have yet into mutually
    exclusive buckets, so the counts add up to the total.
    """
    movies = json.loads(Path("theporndude_movies.json").read_text())["all"]
    print(f"audyt {len(movies)} tubów z full-porn-movies-sites…")

    timeout = httpx.Timeout(15.0, connect=8.0)
    async with httpx.AsyncClient(timeout=timeout, http2=False) as cli:
        sem = asyncio.Semaphore(12)

        async def worker(r):
            async with sem:
                slug = r["slug"]
                # Resolve the real domain from the first pdude.link hop.
                domain = await resolve_domain(cli, slug)
                if not domain or any(x in domain for x in ["anexo.link", "awejmp.com", "porndudecams"]):
                    domain = f"{slug.lower()}.com"
                # Fetch the root page and apply the scene-path heuristics.
                status, html, _ = await fetch_one(cli, f"https://{domain}/")
                findings = analyze_html(html) if status == 200 else {}
                score, reasons = score_findings(findings)
                our = match(slug, domain)
                return {
                    **r,
                    "domain": domain,
                    "root_status": status,
                    "findings": findings,
                    "score": score,
                    "reasons": reasons,
                    "our_origin": our,
                }

        results = await asyncio.gather(*[worker(r) for r in movies])

    # Aggregate. Findings are only collected for HTTP 200, so a non-zero score
    # implies status 200 and the buckets below cannot overlap.
    have = [r for r in results if r["our_origin"]]
    new = [r for r in results if not r["our_origin"]]
    new_promising = [r for r in new if r["score"] >= 2.5]
    new_low = [r for r in new if 1 <= r["score"] < 2.5]
    # Previously computed as "new_zero" but never reported.
    new_weak = [r for r in new if 0 < r["score"] < 1]
    new_no_signal = [r for r in new if r["score"] == 0 and r["root_status"] == 200]
    new_dead = [r for r in new if r["root_status"] <= 0 or r["score"] < 0]
    # Reachable but not 200 (403, 404, 5xx, unfinished redirect chain). These
    # used to fall into no bucket at all.
    new_http_error = [
        r for r in new
        if r["root_status"] > 0 and r["root_status"] != 200 and r["score"] >= 0
    ]

    print(f"\n=== Coverage /full-porn-movies-sites ({len(results)} tubes) ===")
    print(f" already have: {len(have):>3}")
    print(f" promising: {len(new_promising):>3}")
    print(f" low value: {len(new_low):>3}")
    print(f" weak signal: {len(new_weak):>3}")
    print(f" no signal: {len(new_no_signal):>3}")
    print(f" http error: {len(new_http_error):>3}")
    print(f" dead: {len(new_dead):>3}")
    print()
    print("ALREADY HAVE:")
    for r in have:
        print(f" {r['slug']:<20} -> {r['our_origin']}")
    print()
    print("PROMISING (score >= 2.5):")
    for r in sorted(new_promising, key=lambda x: -x["score"]):
        print(f" score={r['score']:>4} {r['domain']:<25} ({r['slug']:<20}) reasons={','.join(r['reasons'])[:60]}")
    print()
    print("LOW VALUE (1-2.5):")
    for r in sorted(new_low, key=lambda x: -x["score"]):
        print(f" score={r['score']:>4} {r['domain']:<25} ({r['slug']:<20}) reasons={','.join(r['reasons'])[:60]}")

    Path("theporndude_movies_scorecard.json").write_text(json.dumps(results, indent=2))


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
87
scripts/theporndude_resolve_domains.py
Normal file
87
scripts/theporndude_resolve_domains.py
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
"""Per 166 review slugs z top-porn-tube-sites:
|
||||||
|
1. Fetch review page → extract pdude.link Visit URL + rating + score badges
|
||||||
|
2. Follow pdude.link → real tube domain
|
||||||
|
3. Cross-check vs nasze 25 tube origins
|
||||||
|
4. Output JSON: { slug, name, theporndude_rank, theporndude_score, real_domain, in_our_db, our_origin }
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
REVIEWS_FILE = Path("theporndude_free_tubes.json")
OUT_FILE = Path("theporndude_resolved.json")

UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36"


async def fetch_review(cli: httpx.AsyncClient, review: dict, rank: int) -> dict:
    """Fetch one theporndude review page and resolve the reviewed site's domain.

    ``review`` must contain ``id`` and ``slug``. The result is ``review`` plus
    ``rank``, ``theporndude_score`` (float or None), ``page_title`` (max 120
    chars) and ``page_desc`` (max 200 chars). When the pdude.link visit URL
    can be followed, it also has ``real_domain`` and ``final_url`` (max 200
    chars). Failures are reported in an ``error`` key rather than raised.
    """
    url = f"https://theporndude.com/{review['id']}/{review['slug']}"
    try:
        r = await cli.get(url, headers={"User-Agent": UA})
        html = r.text
    except Exception as e:
        return {**review, "rank": rank, "error": f"fetch_review: {e}"}

    # Extract the score badge.
    score_m = re.search(r'class="rate__num">\s*(\d+(?:\.\d+)?)\s*<', html)
    # Extract the pdude.link visit URL.
    pdude_m = re.search(r'href="(https://pdude\.link/[\w\-\.]+)"', html)
    # Extract <title> and the meta description.
    title_m = re.search(r"<title>([^<]+)</title>", html)
    desc_m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html)

    out = {
        **review,
        "rank": rank,
        "theporndude_score": float(score_m.group(1)) if score_m else None,
        "page_title": (title_m.group(1) if title_m else "")[:120],
        "page_desc": (desc_m.group(1) if desc_m else "")[:200],
    }
    if not pdude_m:
        out["error"] = "no_pdude_link"
        return out
    pdude_url = pdude_m.group(1)

    # Follow pdude.link. The final URL is only meaningful if the caller's
    # client follows redirects (main() creates it with follow_redirects=True).
    try:
        r2 = await cli.get(pdude_url, headers={"User-Agent": UA})
        final_url = str(r2.url)
        host = urlparse(final_url).hostname or ""
        # Drop only a leading "www."; replace() would also remove a "www."
        # that appears in the middle of the host name.
        if host.startswith("www."):
            host = host[len("www."):]
        out["real_domain"] = host
        out["final_url"] = final_url[:200]
    except Exception as e:
        out["error"] = f"pdude_follow: {e}"
    return out
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Resolve every review in REVIEWS_FILE and write the results to OUT_FILE.

    Reviews are processed at most 8 at a time. Rank is the 1-based position in
    the input list. A one-line summary of how many reviews got a
    ``real_domain`` is printed at the end.
    """
    reviews = json.loads(REVIEWS_FILE.read_text())["reviews"]

    timeout = httpx.Timeout(20.0, connect=10.0)
    limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
    async with httpx.AsyncClient(
        timeout=timeout, limits=limits, follow_redirects=True, http2=False
    ) as cli:
        sem = asyncio.Semaphore(8)

        async def worker(rev, rank):
            async with sem:
                return await fetch_review(cli, rev, rank)

        tasks = [worker(r, i) for i, r in enumerate(reviews, start=1)]
        results = await asyncio.gather(*tasks)

    OUT_FILE.write_text(json.dumps(results, indent=2))
    ok = sum(1 for r in results if r.get("real_domain"))
    # An empty review list used to raise ZeroDivisionError here.
    pct = ok * 100 / len(results) if results else 0
    print(f"resolved {ok}/{len(results)} ({pct:.0f}%)")
    print(f"out -> {OUT_FILE}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
104
scripts/theporndude_scorecard.py
Normal file
104
scripts/theporndude_scorecard.py
Normal file
|
|
@ -0,0 +1,104 @@
|
||||||
|
"""Generuje końcowy scorecard JSON dla wszystkich 166 theporndude top-porn-tube-sites:
|
||||||
|
- coverage status (already_have/new/dead/low_value)
|
||||||
|
- canonical_value_score 0-5 (heurystyka + nasz ranking)
|
||||||
|
- recommendation: skip / consider / pilot / integrate
|
||||||
|
|
||||||
|
Plus markdown summary dla człowieka.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Inputs produced by the earlier pipeline steps; loaded at import time.
COVERAGE = json.loads(Path("theporndude_coverage.json").read_text())
TRIAGE = json.loads(Path("theporndude_triage.json").read_text())


def main():
    """Merge coverage and triage data into ``theporndude_scorecard.json``.

    Sites in ``COVERAGE["already_have"]`` are marked ``already_have``. Each
    ``COVERAGE["new_candidates"]`` entry is joined with its triage record by
    slug and classified as ``dead``, ``auth_wall``, ``promising``,
    ``low_value`` or ``no_value``. A candidate with no triage record has
    ``root_status`` 0 and is therefore ``dead``. A human-readable summary is
    printed after the JSON is written.
    """
    # Local import: the file-level imports are only json and Path.
    from collections import Counter

    triage_by_slug = {r["slug"]: r for r in TRIAGE}

    scorecards = []
    for r in COVERAGE["already_have"]:
        scorecards.append({
            "rank": r["rank"],
            "slug": r["slug"],
            "domain": r["domain"],
            "status": "already_have",
            "our_origin": r["our_origin"],
            "canonical_value_score": None,
            "recommendation": "skip — already integrated",
        })

    for r in COVERAGE["new_candidates"]:
        t = triage_by_slug.get(r["slug"], {})
        score = t.get("heuristic_score", 0)
        findings = t.get("root_findings", {})
        reasons = t.get("reasons", [])
        root_status = t.get("root_status", 0)
        domain = t.get("domain") or r.get("domain") or f"{r['slug']}.com"

        # Order matters: dead beats auth wall, which beats the score tiers.
        if root_status <= 0 or findings.get("dead_404"):
            status = "dead"
            rec = "skip — dead/unreachable"
        elif findings.get("auth_wall") and score < 2:
            status = "auth_wall"
            rec = "skip — login required, no public scenes"
        elif score >= 2.5:
            status = "promising"
            rec = "pilot — deep audit + write extractor"
        elif score >= 1:
            status = "low_value"
            rec = "consider — basic metadata only, low priority"
        else:
            status = "no_value"
            rec = "skip — no canonical-fit signal in HTML"

        scorecards.append({
            "rank": r["rank"],
            "slug": r["slug"],
            "domain": domain,
            "status": status,
            "our_origin": None,
            "canonical_value_score": score,
            "heuristic_reasons": reasons,
            "findings": findings,
            "recommendation": rec,
        })

    scorecards.sort(key=lambda x: x["rank"])

    # One counting pass instead of one scan of the list per status.
    counts = Counter(s["status"] for s in scorecards)
    summary_keys = ("already_have", "promising", "low_value", "no_value", "auth_wall", "dead")

    out = {
        "source": "theporndude.com/top-porn-tube-sites",
        "fetched_at": "2026-05-20",
        "total": len(scorecards),
        "summary": {key: counts[key] for key in summary_keys},
        "scorecards": scorecards,
    }
    Path("theporndude_scorecard.json").write_text(json.dumps(out, indent=2))

    # Pretty-print the summary.
    total = out["total"]
    print("=" * 70)
    print(f"THEPORNDUDE.COM CANONICAL-FIT SCORECARD ({total} tubes)")
    print("=" * 70)
    for k, v in out["summary"].items():
        # Guard the percentage: empty inputs used to raise ZeroDivisionError.
        pct = 100 * v / total if total else 0
        print(f" {k:<15} {v:>4} ({pct:.0f}%)")
    print()
    print("PROMISING (score >= 2.5) — pilot candidates:")
    for s in scorecards:
        if s["status"] == "promising":
            r = ",".join(s.get("heuristic_reasons", []))[:60]
            print(f" #{s['rank']:>3} score={s['canonical_value_score']:>4} {s['domain']:<25} ({s['slug']}) {r}")
    print()
    print("LOW_VALUE (1-2.5) — defer:")
    for s in scorecards:
        if s["status"] == "low_value":
            r = ",".join(s.get("heuristic_reasons", []))[:50]
            print(f" #{s['rank']:>3} score={s['canonical_value_score']:>4} {s['domain']:<25} ({s['slug']}) {r}")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -9,9 +9,12 @@ import pytest
|
||||||
from app.resolve.scoring import (
|
from app.resolve.scoring import (
|
||||||
composite_score,
|
composite_score,
|
||||||
date_proximity,
|
date_proximity,
|
||||||
|
detect_modifier_tags,
|
||||||
|
detect_series_positions,
|
||||||
hamming_distance_hex,
|
hamming_distance_hex,
|
||||||
performer_set_similarity,
|
performer_set_similarity,
|
||||||
phash_similarity,
|
phash_similarity,
|
||||||
|
series_mismatch_strength,
|
||||||
title_similarity,
|
title_similarity,
|
||||||
triage,
|
triage,
|
||||||
)
|
)
|
||||||
|
|
@ -146,6 +149,112 @@ def test_composite_clamps_to_unit() -> None:
|
||||||
assert score == 1.0
|
assert score == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
# ---- triage --------------------------------------------------------------
|
||||||
|
|
||||||
|
# ---- series position / modifier detector ---------------------------------
|
||||||
|
|
||||||
|
def test_detect_series_positions_episode() -> None:
    # "episode 4" at the end of a title yields the position set {4}.
    assert detect_series_positions("pleasureville a dp xxx parody episode 4") == {4}
|
||||||
|
|
||||||
|
def test_detect_series_positions_part_with_dot() -> None:
    # The abbreviated "pt." form followed by a number is recognised.
    assert detect_series_positions("neon moonlight pt. 2") == {2}
|
||||||
|
|
||||||
|
def test_detect_series_positions_hash_only() -> None:
    # A bare "#131" and a "scene 2" in the same title both contribute.
    assert detect_series_positions("women seeking women #131 scene 2") == {131, 2}
|
||||||
|
|
||||||
|
def test_detect_series_positions_volume() -> None:
    # "volume N" and "scene M" are both collected into the same set.
    assert detect_series_positions("women seeking women volume 140 scene 3") == {140, 3}
|
||||||
|
|
||||||
|
def test_detect_series_positions_s_e_style() -> None:
    # Season/episode shorthand "s9 e8" yields both numbers.
    assert detect_series_positions("can you handle a woman like me s9 e8") == {9, 8}
|
||||||
|
|
||||||
|
def test_detect_series_positions_empty() -> None:
    # None and the empty string both produce an empty set.
    assert detect_series_positions(None) == set()
    assert detect_series_positions("") == set()
|
||||||
|
|
||||||
|
def test_detect_modifier_tags_bts() -> None:
    # The abbreviation "bts" inside parentheses is reported as a modifier tag.
    assert "bts" in detect_modifier_tags("training ravyn (bts - 1)")
|
||||||
|
|
||||||
|
def test_detect_modifier_tags_behind_the_scenes() -> None:
    # The spelled-out phrase is reported under its own tag.
    assert "behind the scenes" in detect_modifier_tags(
        "behind the scenes - two pairs of suckable melons"
    )
|
||||||
|
|
||||||
|
def test_detect_modifier_tags_unedited() -> None:
    # "(unedited)" is reported as a modifier tag.
    assert "unedited" in detect_modifier_tags("bad bella stinky feet prep (unedited)")
|
||||||
|
|
||||||
|
def test_series_mismatch_episode_2_vs_4_hard() -> None:
    # Episode 2 vs 4 -> hard mismatch (1.0)
    s = series_mismatch_strength(
        "pleasureville a dp xxx parody episode 2",
        "pleasureville a dp xxx parody episode 4",
    )
    assert s == 1.0
|
||||||
|
|
||||||
|
def test_series_mismatch_intersection_is_no_mismatch() -> None:
    # Both titles carry {7} (Make'em Sweat #7) -> NO position mismatch,
    # but the BTS asymmetry -> 0.7
    s = series_mismatch_strength("make'em sweat #7", "make'em sweat #7 bts")
    assert s == pytest.approx(0.7)
|
||||||
|
|
||||||
|
def test_series_mismatch_partial_overlap_is_still_hard() -> None:
    # "Volume 140 Scene 3" vs "Volume 140 Scene 4" -- 140 is shared but 3/4
    # differ; these are separate scenes from the same compilation -> hard split.
    s = series_mismatch_strength(
        "women seeking women volume 140 scene 3",
        "women seeking women volume 140 scene 4",
    )
    assert s == 1.0
|
||||||
|
|
||||||
|
def test_series_mismatch_no_year_false_positive() -> None:
    # "scene from 2020" must not produce a false position from the year.
    pos = detect_series_positions("scene from 2020")
    # Could this be {2020}? No -- per the original author's note the detector
    # uses \d{1,3} with a boundary guard, so a 4-digit number is not captured
    # (detector implementation is not visible in this file).
    assert pos == set()
|
||||||
|
|
||||||
|
def test_series_mismatch_bts_asymmetric() -> None:
    # Titles: Training Ravyn vs Training Ravyn (BTS - 1)
    # positions: {} vs {1} -> no common position, but one side is empty,
    # so this is not a hard split.
    # BTS on one side only -> 0.7
    s = series_mismatch_strength("training ravyn", "training ravyn (bts - 1)")
    assert s == pytest.approx(0.7)
|
||||||
|
|
||||||
|
def test_series_mismatch_no_signal() -> None:
    # Identical titles without positions or modifiers -> strength 0.0.
    s = series_mismatch_strength("the great heist", "the great heist")
    assert s == 0.0
|
||||||
|
|
||||||
|
def test_composite_series_position_hard_reject() -> None:
    # Despite every other signal being strong (fp/title/performers/date at 1.0),
    # a series mismatch of 1.0 forces a hard reject. This guarantees that
    # "Episode 2 vs Episode 4" with the same phash (a studio reusing cover art)
    # do NOT auto-merge.
    score, reasons = composite_score(
        fp=1.0, title=1.0, performers=1.0, date_score=1.0,
        studio_match=True, series_mismatch=1.0,
    )
    assert score == 0.0
    assert reasons.get("series_position_mismatch")
|
||||||
|
|
||||||
|
def test_composite_series_modifier_cap_07() -> None:
    # Modifier mismatch (BTS on one side only) -> cap = 1 - 0.7 = 0.3
    score, reasons = composite_score(
        fp=1.0, title=1.0, performers=1.0, date_score=1.0,
        studio_match=True, series_mismatch=0.7,
    )
    assert score == pytest.approx(0.3)
    assert reasons.get("series_modifier_cap") == pytest.approx(0.3)
|
||||||
|
|
||||||
|
def test_composite_series_zero_no_effect() -> None:
    # series_mismatch=0.0 and series_mismatch=None must score identically,
    # and neither lowers an otherwise perfect score.
    score_a, _ = composite_score(
        fp=1.0, title=1.0, performers=1.0, date_score=1.0,
        studio_match=True, series_mismatch=0.0,
    )
    score_b, _ = composite_score(
        fp=1.0, title=1.0, performers=1.0, date_score=1.0,
        studio_match=True, series_mismatch=None,
    )
    assert score_a == score_b == pytest.approx(1.0)
|
||||||
|
|
||||||
|
|
||||||
# ---- triage --------------------------------------------------------------
|
# ---- triage --------------------------------------------------------------
|
||||||
|
|
||||||
def test_triage_thresholds() -> None:
|
def test_triage_thresholds() -> None:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue