Compare commits
10 commits
2f3e57c0ac
...
05a35955ad
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
05a35955ad | ||
|
|
813bf741b9 | ||
|
|
585e5d59f5 | ||
|
|
9a789a8551 | ||
|
|
1ca503b7be | ||
|
|
2051fc1ded | ||
|
|
55612e262b | ||
|
|
a10c51aebf | ||
|
|
b3ecf7141a | ||
|
|
cbb2390a2a |
25 changed files with 907 additions and 761 deletions
|
|
@ -167,9 +167,18 @@ def list_scenes(
|
||||||
if q:
|
if q:
|
||||||
base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))
|
base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))
|
||||||
|
|
||||||
|
# Cap rozmiarów filtrów. Bez tego pojedynczy request z setkami studio_slugs +
|
||||||
|
# dziesiątkami tagów (każdy tag = osobny correlated EXISTS) + ILIKE budował zapytanie,
|
||||||
|
# które OOM-killer ubijał → PG crash-recovery = ~1s globalnej przerwy (GOON-1M,
|
||||||
|
# 2026-06-26: 194 studios + 23 tagi). Realny UI nigdy nie wysyła tylu. 422 zamiast
|
||||||
|
# wywalania bazy. Limity hojne (>> normalne użycie), ale ograniczają złożoność query.
|
||||||
|
_MAX_STUDIOS, _MAX_TAGS, _MAX_PERFORMERS = 50, 15, 15
|
||||||
|
|
||||||
studio_slug_list = _split_csv(studio_slugs)
|
studio_slug_list = _split_csv(studio_slugs)
|
||||||
if studio_slug:
|
if studio_slug:
|
||||||
studio_slug_list.append(studio_slug)
|
studio_slug_list.append(studio_slug)
|
||||||
|
if len(studio_slug_list) > _MAX_STUDIOS:
|
||||||
|
raise HTTPException(status_code=422, detail=f"too many studio filters (max {_MAX_STUDIOS})")
|
||||||
if studio_slug_list:
|
if studio_slug_list:
|
||||||
base = base.where(
|
base = base.where(
|
||||||
Scene.studio_id.in_(
|
Scene.studio_id.in_(
|
||||||
|
|
@ -178,6 +187,8 @@ def list_scenes(
|
||||||
)
|
)
|
||||||
|
|
||||||
tag_slug_list = _split_csv(tags)
|
tag_slug_list = _split_csv(tags)
|
||||||
|
if len(tag_slug_list) > _MAX_TAGS:
|
||||||
|
raise HTTPException(status_code=422, detail=f"too many tag filters (max {_MAX_TAGS})")
|
||||||
# AND między tagami: scena musi mieć WSZYSTKIE zaznaczone tagi. Każdy slug → osobny
|
# AND między tagami: scena musi mieć WSZYSTKIE zaznaczone tagi. Każdy slug → osobny
|
||||||
# exists() — zaznaczanie kolejnych filtrów zawęża wyniki, jak intuicja użytkownika.
|
# exists() — zaznaczanie kolejnych filtrów zawęża wyniki, jak intuicja użytkownika.
|
||||||
#
|
#
|
||||||
|
|
@ -207,6 +218,8 @@ def list_scenes(
|
||||||
)
|
)
|
||||||
|
|
||||||
perf_id_strings = _split_csv(performer_ids)
|
perf_id_strings = _split_csv(performer_ids)
|
||||||
|
if len(perf_id_strings) > _MAX_PERFORMERS:
|
||||||
|
raise HTTPException(status_code=422, detail=f"too many performer filters (max {_MAX_PERFORMERS})")
|
||||||
if perf_id_strings:
|
if perf_id_strings:
|
||||||
try:
|
try:
|
||||||
perf_ids = [uuid.UUID(s) for s in perf_id_strings]
|
perf_ids = [uuid.UUID(s) for s in perf_id_strings]
|
||||||
|
|
@ -969,7 +982,7 @@ def enrich_tags_from_tube(
|
||||||
|
|
||||||
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
# Priority: mainstream tubes (bogate metadane) > niche (mniej tagów albo garbage).
|
||||||
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
PRIORITY = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "pornhatcom"]
|
"xvideoscom", "xnxxcom"]
|
||||||
sources = session.execute(
|
sources = session.execute(
|
||||||
select(PlaybackSource).where(
|
select(PlaybackSource).where(
|
||||||
PlaybackSource.scene_id == scene_id,
|
PlaybackSource.scene_id == scene_id,
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,6 @@ _DISPLAY_OVERRIDES: dict[str, str] = {
|
||||||
"porn00org": "porn00.org",
|
"porn00org": "porn00.org",
|
||||||
"freshpornoorg": "freshporno.org",
|
"freshpornoorg": "freshporno.org",
|
||||||
"pornxpph": "pornxp.ph",
|
"pornxpph": "pornxp.ph",
|
||||||
"0dayxxcom": "0dayxx.com",
|
|
||||||
"shyfapnet": "shyfap.net",
|
"shyfapnet": "shyfap.net",
|
||||||
"hdporngg": "hdporn.gg",
|
"hdporngg": "hdporn.gg",
|
||||||
"fullmoviesxxx": "fullmovies.xxx",
|
"fullmoviesxxx": "fullmovies.xxx",
|
||||||
|
|
|
||||||
|
|
@ -349,7 +349,7 @@ _TAG_RESCRAPE_THRESHOLD = 3
|
||||||
# Mainstream tubes priority dla tagów — bogate metadane.
|
# Mainstream tubes priority dla tagów — bogate metadane.
|
||||||
_TAG_PRIORITY = [
|
_TAG_PRIORITY = [
|
||||||
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
"xhamstercom", "porntrexcom", "epornercom", "youporncom",
|
||||||
"xvideoscom", "xnxxcom", "pornhatcom",
|
"xvideoscom", "xnxxcom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -32,12 +32,14 @@ from app.connectors.direct_scrapers.hqporner import HQPornerScraper
|
||||||
from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper
|
from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper
|
||||||
from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper
|
from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper
|
||||||
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
||||||
|
from app.connectors.direct_scrapers.mypornerleak_browse import MyPornerLeakBrowseScraper
|
||||||
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
||||||
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||||
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
|
||||||
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||||
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
|
||||||
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||||
|
from app.connectors.direct_scrapers.porntrex_browse import PornTrexBrowseScraper
|
||||||
|
from app.connectors.direct_scrapers.xnxx_browse import XnxxBrowseScraper
|
||||||
|
from app.connectors.direct_scrapers.youporn_browse import YouPornBrowseScraper
|
||||||
from app.connectors.direct_scrapers.siska import SiskaScraper
|
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||||
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||||
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||||
|
|
@ -48,7 +50,6 @@ from app.connectors.direct_scrapers.xnxx import XnxxScraper
|
||||||
from app.connectors.direct_scrapers.xvideos import XVideosScraper
|
from app.connectors.direct_scrapers.xvideos import XVideosScraper
|
||||||
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
|
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
|
||||||
from app.connectors.direct_scrapers.youporn import YouPornScraper
|
from app.connectors.direct_scrapers.youporn import YouPornScraper
|
||||||
from app.connectors.direct_scrapers.zerodayxx import ZeroDayXXScraper
|
|
||||||
|
|
||||||
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# Existing 4 (verified, in production)
|
# Existing 4 (verified, in production)
|
||||||
|
|
@ -58,12 +59,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# popunder redirect. Mobile WebView page-as-hoster pokazuje ad redirect zamiast video.
|
# popunder redirect. Mobile WebView page-as-hoster pokazuje ad redirect zamiast video.
|
||||||
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
|
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
|
||||||
SxyLandScraper,
|
SxyLandScraper,
|
||||||
# ZeroDayXXScraper — wyłączony 2026-05-12 (source quality report): 25,596 scen, 0.1% canonical
|
# ZeroDayXXScraper (0dayxx) — USUNIĘTY CAŁKOWICIE 2026-06-22 (user request). Orphan
|
||||||
# match. Slug-concat tytuły (`bella reese big butt ready to be filled with cum analized`) bez
|
# factory (0.1% canonical), zastępujemy lepszymi źródłami. Dane/pliki/extractor skasowane.
|
||||||
# `[Studio]` lub `Studio - Perf - Title` prefixu (parse rate 3%) → resolver nie ma żadnego
|
|
||||||
# signalu do matchu. Wraps watchporn ale dziedziczy stripped metadata. Solo orphany usunięte
|
|
||||||
# (~21k scen) — plik scrapera + extractor zostają (istniejące playback_sources nadal się
|
|
||||||
# resolvują).
|
|
||||||
# Mainstream (URL templates well-known)
|
# Mainstream (URL templates well-known)
|
||||||
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
# PornHub + RedTube — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request). Disabled od
|
||||||
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
# 2026-05-12 (0.4% canonical match), zamrożone dane skasowane z DB, pliki scraperów
|
||||||
|
|
@ -91,7 +88,9 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL =
|
# only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL =
|
||||||
# streamtape, brak SERVER2/SERVER3 backup. Porn-app sam olewa porn4days. 10,346
|
# streamtape, brak SERVER2/SERVER3 backup. Porn-app sam olewa porn4days. 10,346
|
||||||
# solo-orphan scen.
|
# solo-orphan scen.
|
||||||
PornDishScraper,
|
# PornDishScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-24,
|
||||||
|
# watchdog GOON-16: search `?s=` zamarzł 2026-05-07). WordPress → browse przez WP REST
|
||||||
|
# API (/wp-json/wp/v2/posts) jak perverzija: tytuł/data/thumb/studio(category)/tagi.
|
||||||
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
|
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
|
||||||
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
|
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
|
||||||
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
|
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
|
||||||
|
|
@ -105,22 +104,14 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# zwraca consistent search results. KVS engine, slug-aware scene URLs. Mostly
|
# zwraca consistent search results. KVS engine, slug-aware scene URLs. Mostly
|
||||||
# orphan ingest (auto-screenshots, no canonical phash match — sprawdzone), ale
|
# orphan ingest (auto-screenshots, no canonical phash match — sprawdzone), ale
|
||||||
# może łapać sceny popularnych performerów których jeszcze nie mamy w TPDB.
|
# może łapać sceny popularnych performerów których jeszcze nie mamy w TPDB.
|
||||||
# PornHatScraper — wyłączony 2026-05-18. 9,799 scen, 0.2% canonical match, 100% solo-orphan.
|
# PornHat (pornhatcom) + PornDitt (porndittcom) — USUNIĘTE CAŁKOWICIE 2026-06-22
|
||||||
# Pure orphan factory — auto-screenshot thumbs nie matchują phash do canonical, slug tytuły
|
# (user request). Orphan factories (0.2% / weak-signal canonical match), zastępujemy
|
||||||
# nie matchują rapidfuzz, brak duration/date signals. KEEP `pornhatcom` extractor i istniejące
|
# lepszymi źródłami. Dane/pliki scraperów/extractory skasowane.
|
||||||
# playback_sources żywe — mobile może je odtwarzać; disable tylko future ingest.
|
|
||||||
# PornDittScraper — wyłączony 2026-05-12 (bug-report 64356e9b). Każdy link
|
|
||||||
# produkował nową Scene row zamiast matchować do istniejącej kanonicznej
|
|
||||||
# (TPDB/StashDB) bo pornditt ma weak signal: title + cz. performera, brak
|
|
||||||
# fingerprintu/duration/date → composite_score zawsze poniżej auto_merge
|
|
||||||
# threshold (0.92). Plik scrapera + extractor zostają (istniejące playback_sources
|
|
||||||
# nadal się resolvują, _REGISTRY w app/extractors/__init__.py odpala
|
|
||||||
# `porndittcom` → _embed_iframe.extract). Re-enable wymaga albo
|
|
||||||
# "alternative-source mode" w resolverze (match-only, never create new),
|
|
||||||
# albo bogatszej extracji metadanych (duration + fingerprint).
|
|
||||||
# Special
|
# Special
|
||||||
SxyPrnScraper,
|
SxyPrnScraper,
|
||||||
PerverzijaScraper,
|
# PerverzijaScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
|
||||||
|
# user request). Search `?s=` → 429, homepage JS-renderowane; browse przez WP REST API
|
||||||
|
# (/wp-json/wp/v2/posts) daje tytuł/datę/thumb/studio(category)/tagi. Playback embed-iframe.
|
||||||
# FpoxxxScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
|
# FpoxxxScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
|
||||||
# user request). fpo.xxx to KVS, nie WordPress → search `?s=` zwracał 0; browse z
|
# user request). fpo.xxx to KVS, nie WordPress → search `?s=` zwracał 0; browse z
|
||||||
# `/new-<n>/` daje listing tile (tytuł/thumb/duration). Playback i tak phone-side (KVS).
|
# `/new-<n>/` daje listing tile (tytuł/thumb/duration). Playback i tak phone-side (KVS).
|
||||||
|
|
@ -145,14 +136,21 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F
|
||||||
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
|
|
||||||
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
|
|
||||||
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
|
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper # noqa: E402
|
||||||
|
|
||||||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
|
PerverzijaScraper,
|
||||||
|
PornDishScraper,
|
||||||
|
# Browse równolegle do istniejącego search scrapera (wzorzec xvideos/eporner):
|
||||||
|
# search zostaje (pokrycie back-catalogu performerów), browse gwarantuje świeżość
|
||||||
|
# wprost z feedu (watchdog 48h zamiast 168h). Konwersja 2026-06-24 (user request).
|
||||||
|
PornTrexBrowseScraper,
|
||||||
|
MyPornerLeakBrowseScraper,
|
||||||
|
YouPornBrowseScraper,
|
||||||
|
XnxxBrowseScraper,
|
||||||
FreshpornoScraper,
|
FreshpornoScraper,
|
||||||
FpoxxxScraper,
|
FpoxxxScraper,
|
||||||
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||||
|
|
@ -211,17 +209,12 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
|
# Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
|
||||||
# full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
|
# full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
|
||||||
XVideosBrowseScraper,
|
XVideosBrowseScraper,
|
||||||
# HQFapScraper / FourK69Scraper — WYŁĄCZONE 2026-06-22 (user request, na razie).
|
# HQFapScraper / FourK69Scraper — USUNIĘTE CAŁKOWICIE 2026-06-25. Oba PlayTube CMS;
|
||||||
# Oba na PlayTube CMS, ingestowały świeżo i wyglądały żywo, ALE playback w obu padł:
|
# disabled 2026-06-22 gdy playback padł, re-check 2026-06-25 potwierdził że CAŁA
|
||||||
# - hqfap: hosting migrował na `/upload/videos/video_down.mp4` = STAŁY ~3MB stub
|
# biblioteka CDN znikła: wide-sample przez pełny zakres id (hqfap 0/80 real, 4k69
|
||||||
# "server down" dla KAŻDEJ sceny (extractor go odrzuca → None),
|
# 0/40 real) — każda scena serwuje stały `/upload/videos/video_down.mp4` "server
|
||||||
# - 4k69: get_file nie zwraca już grywalnego URL (extractor resolves nothing → None).
|
# down" stub, nie realny plik. Dane (28k solo-orphan scen + 46k sources) skasowane
|
||||||
# Scena bez grywalnego źródła = śmieciowy wpis, więc nie ingestujemy nowych. Istniejące
|
# z DB, pliki scraperów/extractorów i wpisy w _REGISTRY usunięte.
|
||||||
# live playback_sources oznaczone dead na prodzie (znikają z /sources + has_playback).
|
|
||||||
# Reversible: odkomentuj + odżyw sources gdy hosting wróci. Extractory zostają w
|
|
||||||
# _REGISTRY (hqfapcom/4k69com) — gotowe gdyby content wrócił.
|
|
||||||
# HQFapScraper,
|
|
||||||
# FourK69Scraper,
|
|
||||||
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
|
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
|
||||||
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
|
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
|
||||||
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa
|
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa
|
||||||
|
|
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
"""4k69.com — latest-vids browse scraper (PlayTube CMS, patrz _playtube.py).
|
|
||||||
|
|
||||||
Dołączony 2026-06-10 (user request; probe 2026-06-01 odrzucił po stronie głównej
|
|
||||||
"JS-rendered" — błędnie, scene pages mają pełny SSR + JSON-LD). 7 video sitemapów
|
|
||||||
≈ ~65k scen, content w dużej mierze studyjny (paysite re-upload, 4K).
|
|
||||||
|
|
||||||
Specyfika vs baza: studio NIE ma własnego pola na scenie — nazwy studiów występują
|
|
||||||
jako kategorie ("21 Sextury", "Adult Time") obok zwykłych ("Anal", "4K").
|
|
||||||
Klasyfikacja: lista wszystkich studiów z `/studios` (fetch raz per instancję,
|
|
||||||
match po znormalizowanej nazwie alfanumerycznej — pill "Adult Time" vs slug
|
|
||||||
"AdultTime"). Studio bywa też w prefiksie tytułu, ale kategoria jest pewniejsza.
|
|
||||||
|
|
||||||
Playback: JSON-LD contentUrl + dwa dodatkowe get_file w HTML (2160m/720m/480m,
|
|
||||||
www.4kporno.xxx) — ta sama platforma co fullmovies/hdporngg: get_file binduje CDN
|
|
||||||
do IP fetchera, więc oddajemy NIEZRESOLWOWANE (mobile_direct), telefon follow-uje
|
|
||||||
302 z własnym IP. Extractor `4k69com` pomija 2160p (CDN time-out, jak fpvcdn).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._playtube import BasePlayTubeScraper
|
|
||||||
from app.extractors import browser_get
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_STUDIO_LINK_RE = re.compile(r"href=['\"][^'\"]*/videos/studio/([^'\"]+)['\"]", re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def _norm(name: str) -> str:
|
|
||||||
"""`Adult Time` / `AdultTime` → `adulttime` (porównanie pill vs studio slug)."""
|
|
||||||
return re.sub(r"[^a-z0-9]", "", name.lower())
|
|
||||||
|
|
||||||
|
|
||||||
class FourK69Scraper(BasePlayTubeScraper):
    """PlayTube-based scraper for 4k69.com that picks the studio out of category names.

    The site lists studio names among ordinary categories, so a category is
    treated as the studio when its normalized name appears in the studio list
    scraped from the ``/studios`` page.
    """

    sitetag = "4k69com"
    base_url = "https://4k69.com"

    def __init__(self) -> None:
        super().__init__()
        # None  -> studio list not fetched yet
        # set() -> fetch was attempted (an empty set also marks a failed fetch)
        self._studio_set: set[str] | None = None

    def _load_studio_set(self) -> set[str]:
        """Return the normalized studio names from ``/studios``, fetching them once.

        The result is cached on the instance. Any exception during the fetch is
        logged and cached as an empty set, so callers degrade to "no studio"
        instead of failing.
        """
        if self._studio_set is None:
            try:
                response = browser_get(f"{self.base_url}/studios", timeout=self._timeout)
                response.raise_for_status()
                normalized = (_norm(slug) for slug in _STUDIO_LINK_RE.findall(response.text))
                # Drop slugs that normalize to an empty string.
                self._studio_set = {key for key in normalized if key}
                log.info("4k69: studio list loaded — %d studios", len(self._studio_set))
            except Exception as e:
                log.warning("4k69: studios page fetch failed: %s", e)
                self._studio_set = set()
        return self._studio_set

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Return the first category whose normalized name is a known studio, else None."""
        known = self._load_studio_set()
        if not known:
            return None
        return next((label for label in category_names if _norm(label) in known), None)
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
"""hqfap.com — latest-vids browse scraper (PlayTube CMS, patrz _playtube.py).
|
|
||||||
|
|
||||||
Dołączony 2026-06-10 (user request). Re-uploader katalogu pornhd.pet (~120k scen,
|
|
||||||
thumbnaile to base64-encoded oryginalne URL-e w `/uploads/images/`).
|
|
||||||
|
|
||||||
Specyfika vs baza: studio siedzi w kategoriach z suffixem " Clips"
|
|
||||||
("Filthy Kings Clips" → studio "Filthy Kings"); reszta kategorii → tagi.
|
|
||||||
Playback: direct mp4 z JSON-LD contentUrl (cdnde.com nowsze / okcdn.ru starsze),
|
|
||||||
tokeny time-bound i portable cross-IP → natywny extractor `hqfapcom`.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._playtube import BasePlayTubeScraper
|
|
||||||
|
|
||||||
|
|
||||||
class HQFapScraper(BasePlayTubeScraper):
    """PlayTube-based scraper for hqfap.com; the studio is a category ending in " Clips"."""

    sitetag = "hqfapcom"
    base_url = "https://hqfap.com"

    def _pick_studio(self, category_names: list[str]) -> str | None:
        """Return the studio name derived from the first ``"<Studio> Clips"`` category.

        The match on the suffix is case-insensitive; the suffix is cut off and the
        remainder stripped. Categories that leave an empty remainder are skipped.
        Returns None when no category qualifies.
        """
        suffix = " clips"
        for label in category_names:
            if not label.lower().endswith(suffix):
                continue
            candidate = label[: -len(suffix)].strip()
            if candidate:
                return candidate
        return None
|
|
||||||
150
app/connectors/direct_scrapers/mypornerleak_browse.py
Normal file
150
app/connectors/direct_scrapers/mypornerleak_browse.py
Normal file
|
|
@ -0,0 +1,150 @@
|
||||||
|
"""mypornerleak.com — latest BROWSE scraper via WordPress REST API, obok search scrapera.
|
||||||
|
|
||||||
|
MyPornerLeakScraper (search) zostaje w ALL_DIRECT_SCRAPERS; ten browse dokłada
|
||||||
|
świeżość wprost z WP REST (`/wp-json/wp/v2/posts?_embed=1`). W odróżnieniu od
|
||||||
|
perverzija/porndish, mypornerleak WYSTAWIA custom taksonomię `actors` w REST →
|
||||||
|
mamy też performerów (nie tylko studio z `category` + tagi z `post_tag`).
|
||||||
|
|
||||||
|
Playback: post page embeduje hoster iframe → extractor `mypornerleakcom` →
|
||||||
|
`_embed_iframe`, resolwowany phone-side (bez zmian).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://mypornerleak.com"
|
||||||
|
_PER_PAGE = 20
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class MyPornerLeakBrowseScraper(BaseBrowseScraper):
    """Latest-posts browse scraper for mypornerleak.com using the WordPress REST API.

    ``crawl_page`` is overridden to read ``/wp-json/wp/v2/posts`` JSON directly,
    so the HTML hooks ``_extract_scene_urls`` and ``_parse_detail`` are stubs
    that this class itself never calls.
    """

    sitetag = "mypornerleakcom"

    def _listing_url(self, page: int) -> str:
        """Build the REST listing URL for *page* (``_PER_PAGE`` posts, embeds enabled)."""
        return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Stub: listings are parsed from JSON in ``crawl_page``, not from HTML."""
        return []

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Stub: no detail page is parsed; all data comes from the REST payload."""
        return None

    @staticmethod
    def _clean_text(value: object) -> str:
        """Return *value* stripped when it is a string, otherwise an empty string."""
        return value.strip() if isinstance(value, str) else ""

    def _scene_from_rest_post(self, post: dict) -> RawScene | None:
        """Convert one REST post object into a RawScene.

        Returns None when the post has no usable link or title. Embedded data
        that does not have the expected shape (non-dict ``_embedded``, non-list
        term groups, non-dict terms, non-string names) is ignored instead of
        raising, so one malformed post cannot abort the whole page.
        """
        link = self._clean_text(post.get("link"))
        title_obj = post.get("title")
        rendered = title_obj.get("rendered") if isinstance(title_obj, dict) else None
        title = html.unescape(rendered).strip() if isinstance(rendered, str) else ""
        if not link or not title:
            return None

        raw_date = post.get("date")
        release_date = _parse_date(raw_date) if isinstance(raw_date, str) else None

        emb = post.get("_embedded")
        if not isinstance(emb, dict):
            emb = {}

        thumb: str | None = None
        media = emb.get("wp:featuredmedia")
        if isinstance(media, list) and media and isinstance(media[0], dict):
            src = media[0].get("source_url")
            if isinstance(src, str) and src:
                thumb = src

        studio: RawStudio | None = None
        tags: list[RawTag] = []
        performers: list[RawPerformer] = []
        seen_tag: set[str] = set()
        seen_perf: set[str] = set()

        term_groups = emb.get("wp:term")
        if not isinstance(term_groups, list):
            term_groups = []
        for group in term_groups:
            # A group that is not a list (e.g. an error object) carries no terms.
            if not isinstance(group, list):
                continue
            terms = [t for t in group if isinstance(t, dict)]
            if not terms:
                continue
            # All terms of one group share a taxonomy; the first one identifies it.
            tax = terms[0].get("taxonomy")
            if tax == "category" and studio is None:
                # Only the first category is used as the studio.
                sname = self._clean_text(terms[0].get("name"))
                if sname:
                    studio = RawStudio(
                        external_id=f"{self.sitetag}:studio:{slugify(sname)}",
                        name=sname, slug=slugify(sname),
                    )
            elif tax == "actors":
                for g in terms:
                    name = self._clean_text(g.get("name"))
                    sl = slugify(name)
                    if not name or sl in seen_perf:
                        continue
                    seen_perf.add(sl)
                    performers.append(
                        RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=name)
                    )
            elif tax == "post_tag":
                for g in terms:
                    name = self._clean_text(g.get("name"))
                    # Prefer the slug supplied by the API; fall back to our own slugify.
                    sl = self._clean_text(g.get("slug")) or slugify(name).strip()
                    if not name or sl in seen_tag:
                        continue
                    seen_tag.add(sl)
                    tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{link}",
            title=title,
            release_date=release_date,
            url=link,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[
                RawPlaybackSource(
                    origin=f"tube:{self.sitetag}",
                    page_url=link,
                    thumbnail_url=thumb,
                )
            ],
        )

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one REST listing page and convert its posts to RawScene objects.

        Returns:
            None when the request raises or the body is not valid JSON;
            an empty list when the status is not 200 or the payload is not a
            non-empty list; otherwise the scenes built from the page's posts.
        """
        url = self._listing_url(page)
        try:
            res = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("mypornerleak REST fetch failed (page %d): %s", page, e)
            return None
        # NOTE(review): every non-200 status (including 429/5xx) yields the same
        # empty result as a page past the end — confirm that is intended.
        if res.status_code != 200:
            return []
        try:
            posts = json.loads(res.text)
        except (json.JSONDecodeError, ValueError):
            log.warning("mypornerleak REST: bad JSON page %d", page)
            return None
        if not isinstance(posts, list) or not posts:
            return []

        out: list[RawScene] = []
        for p in posts:
            # Skip entries that are not JSON objects instead of failing the page.
            if not isinstance(p, dict):
                continue
            scene = self._scene_from_rest_post(p)
            if scene is not None:
                out.append(scene)
        log.info("mypornerleak REST page %d: %d scenes", page, len(out))
        return out
|
||||||
|
|
@ -1,21 +1,148 @@
|
||||||
"""perverzija.com — direct HTML scrape search results.
|
"""perverzija.com — latest browse scraper via WordPress REST API.
|
||||||
|
|
||||||
Search: `https://www.perverzija.com/page/<n>/?s=<q>` (WordPress + Cloudflare).
|
Historia: dawniej search scraper (`?s=`), ale 2026-06 perverzija rate-limituje search
|
||||||
Scene URL: `https://www.perverzija.com/<slug>/`.
|
(429) a homepage jest JS-renderowane (brak linków postów w surowym HTML) → search
|
||||||
|
zwracał 0. To WordPress, więc czysty kanał to REST API: `/wp-json/wp/v2/posts` daje
|
||||||
|
ustrukturyzowany JSON (link, date, title, featured thumb, taksonomie) jednym requestem
|
||||||
|
na stronę. VPS dociera (curl_cffi bypassuje JA3; 200 nie 403). Przerobione na browse
|
||||||
|
2026-06-22 (user request).
|
||||||
|
|
||||||
CF-protected: `browser_get` (curl_cffi) bypassuje JA3 fingerprint blocks.
|
Z REST `?_embed=1` bierzemy: tytuł, datę, miniaturę (featured_media), STUDIO
|
||||||
|
(taksonomia `category` — np. "DadCrush"/"TeamSkeet", to studyjny re-up) i tagi
|
||||||
|
(`post_tag`). Performerów REST nie wystawia (custom taksonomia `stars` bez show_in_rest)
|
||||||
|
→ puste, dorabia canonical-merge (content studyjny dobrze matchuje TPDB/StashDB; tytuł
|
||||||
|
i tak ma nazwiska).
|
||||||
|
|
||||||
|
Playback: post page (tube.perverzija.com/<slug>/) embeduje xtremestream iframe →
|
||||||
|
extractor `perverzijacom` → `_embed_iframe` → hoster resolwowany phone-side.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import html
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://www.perverzija.com"
|
||||||
|
_PER_PAGE = 20
|
||||||
|
|
||||||
|
|
||||||
class PerverzijaScraper(BaseSearchScraper):
|
def _parse_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class PerverzijaScraper(BaseBrowseScraper):
|
||||||
sitetag = "perverzijacom"
|
sitetag = "perverzijacom"
|
||||||
_search_url_template = "https://www.perverzija.com/page/{page}/?s={query}"
|
|
||||||
_scene_url_re = re.compile(
|
def _listing_url(self, page: int) -> str:
|
||||||
r'href="(?P<url>https://www\.perverzija\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
|
return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
# crawl_page nadpisany (REST JSON, nie HTML) → abstrakcje nieużywane.
|
||||||
|
def _extract_scene_urls(self, listing_html: str) -> list[str]:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def crawl_page(self, page: int) -> list[RawScene] | None:
    """Fetch one page of the WordPress REST post feed and convert it to scenes.

    Args:
        page: Page number handed to ``_listing_url``.

    Returns:
        The scenes built from the page's posts; ``[]`` when the feed is
        exhausted (HTTP 400 past the last page, or an empty / non-list
        payload); ``None`` when the page could not be fetched or decoded.
    """
    url = self._listing_url(page)
    try:
        res = browser_get(url, timeout=self._timeout)
    except Exception as e:
        log.warning("perverzija REST fetch failed (page %d): %s", page, e)
        return None
    # WordPress answers 400 (rest_post_invalid_page_number) once `page` is past
    # the last page -> the feed is exhausted.
    if res.status_code == 400:
        return []
    if res.status_code != 200:
        # Any other non-200 (block page, rate limit, server error) is a failed
        # fetch, not the end of the feed; report it like the other failures
        # instead of silently pretending there is nothing left to crawl.
        log.warning("perverzija REST: HTTP %s (page %d)", res.status_code, page)
        return None
    try:
        posts = json.loads(res.text)
    except (json.JSONDecodeError, ValueError):
        log.warning("perverzija REST: bad JSON page %d", page)
        return None
    if not isinstance(posts, list) or not posts:
        return []

    out: list[RawScene] = []
    for p in posts:
        # The payload is untrusted: skip a malformed entry rather than let it
        # raise and discard every other post on the page.
        if not isinstance(p, dict):
            continue
        link = (p.get("link") or "").strip()
        title_obj = p.get("title")
        rendered = title_obj.get("rendered") if isinstance(title_obj, dict) else None
        title = html.unescape(rendered or "").strip()
        if not link or not title:
            continue
        raw_date = p.get("date")
        release_date = _parse_date(raw_date if isinstance(raw_date, str) else None)

        emb = p.get("_embedded")
        if not isinstance(emb, dict):
            emb = {}
        fm = emb.get("wp:featuredmedia") or []
        first_media = fm[0] if isinstance(fm, list) and fm else None
        thumb = (first_media.get("source_url") if isinstance(first_media, dict) else None) or None

        studio: RawStudio | None = None
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for group in emb.get("wp:term") or []:
            # Each group is expected to be a list of term objects sharing one
            # taxonomy; anything else is ignored.
            terms = [t for t in group if isinstance(t, dict)] if isinstance(group, list) else []
            if not terms:
                continue
            tax = terms[0].get("taxonomy")
            if tax == "category" and studio is None:
                # The first category term is used as the studio.
                sname = (terms[0].get("name") or "").strip()
                if sname:
                    studio_slug = slugify(sname)
                    studio = RawStudio(
                        external_id=f"{self.sitetag}:studio:{studio_slug}",
                        name=sname, slug=studio_slug,
                    )
            elif tax == "post_tag":
                for g in terms:
                    name = (g.get("name") or "").strip()
                    sl = (g.get("slug") or slugify(name)).strip()
                    if not name or sl in seen_tag:
                        continue
                    seen_tag.add(sl)
                    tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))

        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        out.append(
            RawScene(
                external_id=f"{self.sitetag}:{link}",
                title=title,
                release_date=release_date,
                url=link,
                studio=studio,
                performers=[],
                tags=tags,
                fingerprints=fingerprints,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=link,
                        thumbnail_url=thumb,
                    )
                ],
            )
        )

    log.info("perverzija REST page %d: %d scenes", page, len(out))
    return out
|
||||||
|
|
|
||||||
|
|
@ -1,116 +1,147 @@
|
||||||
"""porndish.com — direct HTML scrape.
|
"""porndish.com — latest browse scraper via WordPress REST API.
|
||||||
|
|
||||||
Search: `https://porndish.com/page/<n>/?s=<q>`.
|
Historia: dawniej search scraper (`?s=`), zamarzł 2026-05-07 (search przestał dawać
|
||||||
Scene URL: `https://porndish.com/<slug>/`.
|
nowe sceny — 1151h cisza, watchdog GOON-16). To WordPress (g1/bimber theme), VPS
|
||||||
|
dociera, więc czysty kanał to REST API: `/wp-json/wp/v2/posts?_embed=1` daje
|
||||||
|
ustrukturyzowany JSON jednym requestem na stronę. Przerobione na browse 2026-06-24
|
||||||
|
(ten sam wzorzec co perverzija).
|
||||||
|
|
||||||
Scene detail page (g1/bimber WordPress theme) zawiera:
|
Z REST `_embed`: tytuł, data, miniatura (featured_media), STUDIO (taksonomia
|
||||||
- `<p class="entry-tags"><a class="entry-tag entry-tag-N" href=".../video2/<slug>/">Name</a>…`
|
`category` — np. "Freeuse Fantasy", content studyjny) i tagi (`post_tag` — porndish
|
||||||
— lista tagów (kategorie + performerzy wymieszani, tak jak porndish je pokazuje
|
miesza w nich performerów z gatunkami, bierzemy jak jest; canonical-merge i tak
|
||||||
jako „#" hashtagi). Bierzemy wszystkie jako RawTag (resolver dedupuje; performer
|
dorabia performerów z TPDB/StashDB, a tytuł ma nazwiska). Performerów osobno nie
|
||||||
z query i tak dochodzi osobno).
|
wyciągamy (post_tag ich nie rozdziela od gatunków bez listy known-performers).
|
||||||
- prozę opisu w `<p>` wewnątrz `.entry-content` (przed `entry-tags`, po embed-JS).
|
|
||||||
Bez `_fetch_scene_metadata` overrides scena z samego porndish miała 0 tagów i brak
|
Playback: post page embeduje hoster iframe → extractor `porndishcom` → `_embed_iframe`
|
||||||
description (bug-report od Jana 2026-06-06: „nie ma tagów (# na stronie) ani description").
|
→ resolwowany phone-side.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import html as html_mod
|
import html
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
from datetime import date, datetime
|
||||||
|
|
||||||
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
from app.connectors.base import (
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
RawFingerprint,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
from app.extractors import browser_get
|
from app.extractors import browser_get
|
||||||
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
_ENTRY_TAG_RE = re.compile(
|
_BASE = "https://www.porndish.com"
|
||||||
r'<a[^>]+href="[^"]*/video2/(?P<slug>[^"/]+)/"[^>]*class="[^"]*entry-tag[^"]*"[^>]*>'
|
_PER_PAGE = 20
|
||||||
r'(?P<name>[^<]+)</a>',
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
_ENTRY_CONTENT_RE = re.compile(
|
|
||||||
r'<div[^>]*class="[^"]*entry-content[^"]*"[^>]*>(?P<body>.*?)</article>',
|
|
||||||
re.IGNORECASE | re.DOTALL,
|
|
||||||
)
|
|
||||||
_SCRIPT_STYLE_RE = re.compile(r"<script\b.*?</script>|<style\b.*?</style>", re.IGNORECASE | re.DOTALL)
|
|
||||||
_P_RE = re.compile(r"<p\b[^>]*>(?P<inner>.*?)</p>", re.IGNORECASE | re.DOTALL)
|
|
||||||
_TAG_STRIP_RE = re.compile(r"<[^>]+>")
|
|
||||||
_WS_RE = re.compile(r"\s+")
|
|
||||||
_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
|
||||||
|
|
||||||
|
|
||||||
def _slugify(name: str) -> str:
|
def _parse_date(value: str | None) -> date | None:
|
||||||
return _SLUG_RE.sub("-", name.lower()).strip("-") or "tag"
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _clean_text(fragment: str) -> str:
|
class PornDishScraper(BaseBrowseScraper):
|
||||||
txt = _TAG_STRIP_RE.sub(" ", fragment)
|
|
||||||
txt = html_mod.unescape(txt)
|
|
||||||
return _WS_RE.sub(" ", txt).strip()
|
|
||||||
|
|
||||||
|
|
||||||
class PornDishScraper(BaseSearchScraper):
|
|
||||||
sitetag = "porndishcom"
|
sitetag = "porndishcom"
|
||||||
_search_url_template = "https://porndish.com/page/{page}/?s={query}"
|
|
||||||
_scene_url_re = re.compile(
|
|
||||||
r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _fetch_scene_metadata(
|
def _listing_url(self, page: int) -> str:
|
||||||
self, scene_url: str
|
return f"{_BASE}/wp-json/wp/v2/posts?per_page={_PER_PAGE}&page={page}&_embed=1"
|
||||||
) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag], str | None] | None:
|
|
||||||
"""Fetch scene page → (studio=None, performers=[], tags, description).
|
|
||||||
|
|
||||||
4-elementowy zwrot (base unpacka opcjonalny `description`). porndish nie
|
# crawl_page nadpisany (REST JSON, nie HTML) → abstrakcje nieużywane.
|
||||||
wyróżnia studia, a performer z query dochodzi w base — tu tylko tagi + opis.
|
def _extract_scene_urls(self, listing_html: str) -> list[str]:
|
||||||
"""
|
return []
|
||||||
|
|
||||||
|
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def crawl_page(self, page: int) -> list[RawScene] | None:
|
||||||
|
url = self._listing_url(page)
|
||||||
try:
|
try:
|
||||||
r = browser_get(scene_url, timeout=self._timeout)
|
res = browser_get(url, timeout=self._timeout)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug("porndish meta fetch failed for %s: %s", scene_url, e)
|
log.warning("porndish REST fetch failed (page %d): %s", page, e)
|
||||||
return None
|
return None
|
||||||
if r.status_code != 200 or not r.text:
|
# WP zwraca 400 (rest_post_invalid_page_number) za ostatnią stroną → exhausted.
|
||||||
|
if res.status_code != 200:
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
posts = json.loads(res.text)
|
||||||
|
except (json.JSONDecodeError, ValueError):
|
||||||
|
log.warning("porndish REST: bad JSON page %d", page)
|
||||||
return None
|
return None
|
||||||
html = r.text
|
if not isinstance(posts, list) or not posts:
|
||||||
|
return []
|
||||||
|
|
||||||
# Tagi: entry-tag anchors (slug z /video2/<slug>/ + display name).
|
out: list[RawScene] = []
|
||||||
tags: list[RawTag] = []
|
for p in posts:
|
||||||
seen: set[str] = set()
|
link = (p.get("link") or "").strip()
|
||||||
for m in _ENTRY_TAG_RE.finditer(html):
|
title = html.unescape((p.get("title") or {}).get("rendered", "")).strip()
|
||||||
name = html_mod.unescape(m.group("name")).strip()
|
if not link or not title:
|
||||||
slug = (m.group("slug") or "").strip().lower() or _slugify(name)
|
|
||||||
if not name or len(name) > 40 or slug in seen:
|
|
||||||
continue
|
continue
|
||||||
seen.add(slug)
|
release_date = _parse_date(p.get("date"))
|
||||||
tags.append(RawTag(external_id=f"porndishcom:tag:{slug}", name=name, slug=slug))
|
|
||||||
|
|
||||||
# Description: najdłuższy prozowy <p> w .entry-content (bez entry-tags / embed-JS).
|
emb = p.get("_embedded") or {}
|
||||||
description: str | None = None
|
fm = emb.get("wp:featuredmedia") or []
|
||||||
mc = _ENTRY_CONTENT_RE.search(html)
|
thumb = (fm[0].get("source_url") if fm and isinstance(fm[0], dict) else None) or None
|
||||||
body = mc.group("body") if mc else html
|
|
||||||
body = _SCRIPT_STYLE_RE.sub(" ", body)
|
|
||||||
best = ""
|
|
||||||
for pm in _P_RE.finditer(body):
|
|
||||||
inner = pm.group("inner")
|
|
||||||
if "entry-tag" in inner:
|
|
||||||
continue
|
|
||||||
txt = _clean_text(inner)
|
|
||||||
# Pomijamy resztki JS / boilerplate „Watch … porn video" / przyciski serwerów.
|
|
||||||
if not txt or "getElementById" in txt or "addEventListener" in txt:
|
|
||||||
continue
|
|
||||||
low = txt.lower()
|
|
||||||
if low.startswith("watch ") and low.endswith("porn video"):
|
|
||||||
continue
|
|
||||||
if len(txt) > len(best):
|
|
||||||
best = txt
|
|
||||||
# Strip wiodące etykiety przycisków embedu („Video Player 1 Video Player 2 …",
|
|
||||||
# czasem „Server N") które wpadają na początek prozy.
|
|
||||||
best = re.sub(r"^(?:Video Player \d+\s*|Server \d+\s*|Download\s*)+", "", best, flags=re.IGNORECASE).strip()
|
|
||||||
if len(best) >= 40:
|
|
||||||
description = best
|
|
||||||
|
|
||||||
if not tags and description is None:
|
studio: RawStudio | None = None
|
||||||
return None
|
tags: list[RawTag] = []
|
||||||
return (None, [], tags, description)
|
seen_tag: set[str] = set()
|
||||||
|
for group in emb.get("wp:term") or []:
|
||||||
|
if not group:
|
||||||
|
continue
|
||||||
|
tax = group[0].get("taxonomy")
|
||||||
|
if tax == "category" and studio is None:
|
||||||
|
sname = (group[0].get("name") or "").strip()
|
||||||
|
if sname:
|
||||||
|
studio = RawStudio(
|
||||||
|
external_id=f"{self.sitetag}:studio:{slugify(sname)}",
|
||||||
|
name=sname, slug=slugify(sname),
|
||||||
|
)
|
||||||
|
elif tax == "post_tag":
|
||||||
|
for g in group:
|
||||||
|
name = (g.get("name") or "").strip()
|
||||||
|
sl = (g.get("slug") or slugify(name)).strip()
|
||||||
|
if not name or sl in seen_tag:
|
||||||
|
continue
|
||||||
|
seen_tag.add(sl)
|
||||||
|
tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=name, slug=sl))
|
||||||
|
|
||||||
|
fingerprints: list[RawFingerprint] = []
|
||||||
|
if thumb:
|
||||||
|
ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
|
||||||
|
if ph:
|
||||||
|
fingerprints.append(RawFingerprint(kind="phash", value=ph))
|
||||||
|
|
||||||
|
out.append(
|
||||||
|
RawScene(
|
||||||
|
external_id=f"{self.sitetag}:{link}",
|
||||||
|
title=title,
|
||||||
|
release_date=release_date,
|
||||||
|
url=link,
|
||||||
|
studio=studio,
|
||||||
|
performers=[],
|
||||||
|
tags=tags,
|
||||||
|
fingerprints=fingerprints,
|
||||||
|
playback_sources=[
|
||||||
|
RawPlaybackSource(
|
||||||
|
origin=f"tube:{self.sitetag}",
|
||||||
|
page_url=link,
|
||||||
|
thumbnail_url=thumb,
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
log.info("porndish REST page %d: %d scenes", page, len(out))
|
||||||
|
return out
|
||||||
|
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
"""pornditt.com — direct HTML scrape.
|
|
||||||
|
|
||||||
KVS-style site (kt_player engine). Search URL: `/search/<slug>/?from=<page>` z slug-style
|
|
||||||
zapytaniem (spacje → `-`). Sceny renderują się na subdomenie `v.pornditt.com/videos/<id>/<slug>/`,
|
|
||||||
więc regex matchuje oba (z i bez `v.` prefix).
|
|
||||||
|
|
||||||
Sitetag `porndittcom` (legacy z porn-app DEFAULT_SITETAGS — suffix-stripped name).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
|
|
||||||
|
|
||||||
class PornDittScraper(BaseSearchScraper):
    """Search scraper for pornditt.com.

    The scene URL pattern accepts the host both with and without the ``v.``
    subdomain prefix.
    """

    sitetag = "porndittcom"
    _search_url_template = "https://pornditt.com/search/{query}/?from={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://(?:v\.)?pornditt\.com/videos/(?P<sid>\d+)/(?P<slug>[a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Turn a free-text query into the dash-separated slug used in the search path."""
        # The site expects a slug here: lowercase, with every run of other
        # characters collapsed to a single `-`. A URL-encoded query (`+`) does not work.
        lowered = query.lower()
        dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
        return dashed.strip("-")
|
|
||||||
|
|
@ -1,99 +0,0 @@
|
||||||
"""pornhat.com — search-mode scraper (performer-driven backfill).
|
|
||||||
|
|
||||||
KVS engine. Search URL: `/search/<query>/` z `+` jako space separator. Scene URLs
|
|
||||||
to `/video/<slug>/` (slug bez ID prefix, w przeciwieństwie do 3Movs/OK.xxx). Slug
|
|
||||||
zawiera tokens query gdy match jest relevant, więc filtruje się automatycznie.
|
|
||||||
|
|
||||||
Auto-screenshot thumbnaile (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
|
|
||||||
— do canonical match przez phash NIE nadają się (sprawdzone w probe 2026-05-12, 8%).
|
|
||||||
Ale wartość scrapera: discovering nowych scen performera których inne tube'y/canonical
|
|
||||||
nie mają. Mostly orphan ingest, ale dla popular performers może łapać studio scenes
|
|
||||||
których nie mamy w TPDB jeszcze.
|
|
||||||
|
|
||||||
Metadata enrich: scene page ma `class="info-video js-ajax-{dvd,model,tag}"` div'y
|
|
||||||
z `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. Parsujemy w
|
|
||||||
`_fetch_scene_metadata()` żeby insertować studio (dvd), dodatkowych performerów
|
|
||||||
(models), i tagi do każdej sceny.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
|
||||||
from app.extractors import browser_get
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# `class="info-video js-ajax-<kind>"` ... `data-setup='<json>'`. JSON jest
|
|
||||||
# single-quoted (HTML attribute), z double-quotes wewnątrz dla string values.
|
|
||||||
# The named group `kind` tells which variant (dvd / model / tag) was matched.
|
|
||||||
_AJAX_DATA_RE = re.compile(
|
|
||||||
r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PornHatScraper(BaseSearchScraper):
    """Search scraper for pornhat.com that enriches scenes from the detail page."""

    sitetag = "pornhatcom"
    # Pagination: /search/<query>/<page>/ (page 1 also works with an explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
    # Search HTML uses relative hrefs (`/video/<slug>/`), so the host part is
    # optional in the pattern. NOTE(review): the original comment says the base
    # class turns relative matches into absolute URLs - confirm in BaseSearchScraper.
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Return the query trimmed, lowercased, with spaces replaced by `-`."""
        return query.strip().lower().replace(" ", "-")

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch a scene page and parse its ``js-ajax-{dvd,model,tag}`` JSON blobs.

        Args:
            scene_url: URL of the scene detail page.

        Returns:
            ``(studio, performers, tags)`` built from the ``data-setup``
            attributes, or ``None`` when the page cannot be fetched or does
            not answer with HTTP 200.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                return None
        except Exception as e:
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None

        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []

        for m in _AJAX_DATA_RE.finditer(r.text):
            kind = m.group("kind").lower()
            try:
                data = json.loads(m.group("json"))
            except json.JSONDecodeError:
                continue
            # The attribute is scraped markup: valid JSON that is not an object
            # (list, string, number) must be skipped, not crash the enrichment.
            if not isinstance(data, dict):
                continue
            raw_name = data.get("title")
            raw_dir = data.get("dir")
            name = raw_name.strip() if isinstance(raw_name, str) else ""
            slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or None
            if not name:
                continue
            if kind == "dvd":
                # `dvd` is the studio/series wrapper; only the first occurrence
                # becomes the scene's studio.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                performers.append(RawPerformer(name=name))
            elif kind == "tag":
                tags.append(RawTag(
                    external_id=f"pornhatcom:tag:{slug or name.lower()}",
                    name=name,
                    slug=slug,
                ))

        return studio, performers, tags
|
|
||||||
122
app/connectors/direct_scrapers/porntrex_browse.py
Normal file
122
app/connectors/direct_scrapers/porntrex_browse.py
Normal file
|
|
@ -0,0 +1,122 @@
|
||||||
|
"""porntrex.com — latest-vids BROWSE scraper (KVS), obok istniejącego search scrapera.
|
||||||
|
|
||||||
|
PornTrexScraper (search, performer-driven) zostaje w ALL_DIRECT_SCRAPERS — daje
|
||||||
|
pokrycie back-catalogu performerów. Ten browse dokłada gwarancję świeżości wprost
|
||||||
|
z feedu `/latest-updates/<n>/` (próg watchdog 48h zamiast 168h, nie zależy od kolejki
|
||||||
|
performerów). Wzorzec jak xvideos (search + browse równolegle).
|
||||||
|
|
||||||
|
KVS listing tile:
|
||||||
|
<div ... data-item-id="<id>"><a href="https://www.porntrex.com/video/<id>/<slug>">
|
||||||
|
<img data-src="//ptx.cdntrex.com/contents/.../300x168/1.jpg" alt="<Tytuł>">
|
||||||
|
<div class="duration">MM:SS</div>
|
||||||
|
Playback: KVS, natywny extractor `porntrexcom` (token expires+md5, portable) — bez zmian.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://www.porntrex.com"
|
||||||
|
_A_RE = re.compile(
|
||||||
|
r'<a\s+href="(?P<url>https?://(?:www\.)?porntrex\.com/video/\d+/[^"]*)"', re.IGNORECASE
|
||||||
|
)
|
||||||
|
_ALT_RE = re.compile(r'alt="([^"]*)"')
|
||||||
|
_THUMB_RE = re.compile(r'data-src="(//[^"]+\.(?:jpg|jpeg|webp|png)[^"]*)"', re.IGNORECASE)
|
||||||
|
_DUR_RE = re.compile(r'class="duration">\s*([\d]{1,2}(?:\s*:\s*[\d]{2}){1,2})\s*<')
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_duration(text: str | None) -> int | None:
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
nums = [int(p.strip()) for p in text.split(":")]
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
if len(nums) == 2:
|
||||||
|
return nums[0] * 60 + nums[1]
|
||||||
|
if len(nums) == 3:
|
||||||
|
return nums[0] * 3600 + nums[1] * 60 + nums[2]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class PornTrexBrowseScraper(BaseBrowseScraper):
    """Browse scraper that builds scenes straight from porntrex listing tiles."""

    sitetag = "porntrexcom"

    def _listing_url(self, page: int) -> str:
        """Return the ``/latest-updates/<page>/`` listing URL."""
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return every scene URL matched in the listing, in document order."""
        urls: list[str] = []
        for hit in _A_RE.finditer(listing_html):
            urls.append(hit.group("url"))
        return urls

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Return ``None``: detail pages are not parsed by this scraper."""
        return None

    def crawl_page(self, page: int) -> list[RawScene] | None:
        """Fetch one listing page and turn each tile into a scene.

        Returns the scenes found on the page, or ``None`` when the fetch fails.
        """
        listing_url = self._listing_url(page)
        try:
            response = browser_get(listing_url, timeout=self._timeout)
            text = response.text if hasattr(response, "text") else response
        except Exception as e:
            log.warning("porntrex browse fetch failed (page %d): %s", page, e)
            return None

        scenes: list[RawScene] = []
        visited: set[str] = set()
        hits = list(_A_RE.finditer(text))
        last_index = len(hits) - 1
        for index, hit in enumerate(hits):
            scene_url = hit.group("url").replace("://www.", "://").rstrip("/")
            if scene_url in visited:
                continue
            visited.add(scene_url)

            # A tile's markup runs from its anchor up to the next anchor; the
            # last tile gets a fixed 700-character window after its anchor.
            if index < last_index:
                window_end = hits[index + 1].start()
            else:
                window_end = hit.end() + 700
            tile = text[hit.start(): window_end]

            alt_hit = _ALT_RE.search(tile)
            title = html.unescape(alt_hit.group(1)).strip() if alt_hit else ""
            if not title:
                # Fallback: derive a title from the URL slug.
                slug_hit = re.search(r"/video/\d+/([a-z0-9\-]+)", scene_url)
                if slug_hit:
                    title = slug_hit.group(1).replace("-", " ").strip().title()
                else:
                    title = ""
            if not title:
                continue

            thumb_hit = _THUMB_RE.search(tile)
            thumb = ("https:" + thumb_hit.group(1)) if thumb_hit else None
            duration_hit = _DUR_RE.search(tile)
            duration_sec = _parse_duration(duration_hit.group(1) if duration_hit else None)

            fingerprints: list[RawFingerprint] = []
            if thumb:
                phash = compute_thumbnail_phash(thumb, referer=_BASE + "/")
                if phash:
                    fingerprints.append(RawFingerprint(kind="phash", value=phash))

            playback = RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
            )
            scenes.append(
                RawScene(
                    external_id=f"{self.sitetag}:{scene_url}",
                    title=title,
                    duration_sec=duration_sec,
                    url=scene_url,
                    performers=[],
                    tags=[],
                    fingerprints=fingerprints,
                    playback_sources=[playback],
                )
            )
        log.info("porntrex browse page %d: %d scenes", page, len(scenes))
        return scenes
|
||||||
129
app/connectors/direct_scrapers/xnxx_browse.py
Normal file
129
app/connectors/direct_scrapers/xnxx_browse.py
Normal file
|
|
@ -0,0 +1,129 @@
|
||||||
|
"""xnxx.com — BROWSE scraper (JSON-LD), obok search scrapera.
|
||||||
|
|
||||||
|
Detail page ma JSON-LD VideoObject (name/duration/uploadDate/thumbnail) — i TYLKO to
|
||||||
|
bierzemy. Mimo wspólnego silnika z xvideos, xnxx detail NIE wystawia w SSR linków
|
||||||
|
`/models/` ani `/tags/` (0 wystąpień, ładowane JS-em) → performerów/tagi dorabia
|
||||||
|
canonical-merge + istniejący performer-search. XnxxScraper (search) zostaje; browse
|
||||||
|
dokłada sygnał świeżości. Tytuł z JSON-LD bywa HTML-encoded (`,`/`!`) →
|
||||||
|
html.unescape.
|
||||||
|
|
||||||
|
Listing: xnxx NIE ma czystego SSR `/new/` (404), ale `/best/<YYYY-MM>/<page>` jest
|
||||||
|
SSR (linki /video-<id>/ w surowym HTML). Bierzemy bieżący miesiąc — pokrywa świeży
|
||||||
|
content (sortowanie best-of-month, nie ściśle chronologiczne, ale dla sygnału
|
||||||
|
świeżości wystarcza; ścisłą chronologię i tak daje performer-search). Homepage
|
||||||
|
xnxx jest JS-renderowany (0 linków w surowym HTML), stąd /best/.
|
||||||
|
|
||||||
|
Playback bez zmian (extractor `xnxxcom`). Phash pominięty (xnxx crop-thumbnaile,
|
||||||
|
0% hit do canonical — jak xvideos).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://www.xnxx.com"
|
||||||
|
_SCENE_URL_RE = re.compile(r'href="(/video-[a-z0-9]+/[a-z0-9_\-]+)"', re.IGNORECASE)
|
||||||
|
_JSONLD_RE = re.compile(
|
||||||
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
|
||||||
|
)
|
||||||
|
_SETTITLE_RE = re.compile(r"html5player\.setVideoTitle\('([^']+)'\)")
|
||||||
|
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _dur_to_sec(value: str | None) -> int | None:
    """Convert an ISO-8601 ``PT#H#M#S`` duration into whole seconds.

    Returns ``None`` for a missing value, for text that does not start with
    the ``PT`` form, and when the parsed total is zero.
    """
    if not value:
        return None
    parsed = _ISO_DUR_RE.match(str(value).strip())
    if parsed is None:
        return None
    # Each component is optional in the pattern; an absent one counts as 0.
    hours, minutes, seconds = (int(part or 0) for part in parsed.groups())
    total = hours * 3600 + minutes * 60 + seconds
    return total if total else None
|
||||||
|
|
||||||
|
|
||||||
|
def _iso_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value))
|
||||||
|
return date.fromisoformat(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _video_object(html: str) -> dict | None:
    """Return the first JSON-LD node whose ``@type`` is ``VideoObject``.

    Every ``application/ld+json`` script is tried in document order; empty or
    undecodable scripts are skipped. A top-level list is searched as is, a
    top-level object is searched through its ``@graph`` (or as a single node
    when it has none). Returns ``None`` when nothing matches.
    """
    for script in _JSONLD_RE.finditer(html):
        payload = script.group(1).strip()
        if not payload:
            continue
        try:
            data = json.loads(payload)
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(data, list):
            candidates = data
        elif isinstance(data, dict):
            candidates = data.get("@graph", [data])
        else:
            candidates = []
        for node in candidates:
            if isinstance(node, dict) and node.get("@type") == "VideoObject":
                return node
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class XnxxBrowseScraper(BaseBrowseScraper):
    """Browse scraper for xnxx.com driven by the monthly ``/best/`` listing."""

    sitetag = "xnxxcom"

    def _listing_url(self, page: int) -> str:
        """Return the ``/best/<YYYY-MM>/<page>`` URL for the current month."""
        current_month = datetime.now().strftime("%Y-%m")
        return f"{_BASE}/best/{current_month}/{page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from the listing, deduplicated, first-seen order."""
        absolute = (f"{_BASE}{hit.group(1)}" for hit in _SCENE_URL_RE.finditer(listing_html))
        # dict keys keep insertion order, so this drops repeats without reordering.
        return list(dict.fromkeys(absolute))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a scene from a detail page's JSON-LD ``VideoObject``.

        The title falls back to the player's ``setVideoTitle`` call and then
        to ``og:title``. Returns ``None`` when no title can be found.
        """
        video = _video_object(detail_html) or {}

        title = (video.get("name") or "").strip()
        if not title:
            player_title = _SETTITLE_RE.search(detail_html)
            if player_title:
                title = player_title.group(1).strip()
            else:
                title = (meta_content(detail_html, property="og:title") or "").strip()
        title = html.unescape(title).strip()
        if not title:
            return None

        duration_sec = _dur_to_sec(video.get("duration"))
        published = video.get("uploadDate") or video.get("datePublished")
        release_date = _iso_date(published)

        thumbnail_url = video.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
        if isinstance(thumbnail_url, list):
            # A list of thumbnails: keep only the first one.
            thumbnail_url = thumbnail_url[0] if thumbnail_url else None

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=duration_sec,
            thumbnail_url=thumbnail_url,
        )
        # Performers and tags are intentionally left empty here.
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            performers=[],
            tags=[],
            playback_sources=[playback],
            raw={"source": "xnxx_browse"},
        )
|
||||||
118
app/connectors/direct_scrapers/youporn_browse.py
Normal file
118
app/connectors/direct_scrapers/youporn_browse.py
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
"""youporn.com — latest BROWSE scraper (JSON-LD + page-parse), obok search scrapera.
|
||||||
|
|
||||||
|
YouPornScraper (search) zostaje w ALL_DIRECT_SCRAPERS; ten browse dokłada świeżość
|
||||||
|
wprost z `/browse/time/?page=<n>` (newest-first, SSR). Detail page ma JSON-LD
|
||||||
|
VideoObject (name/duration/uploadDate/thumbnail) — i TYLKO to bierzemy.
|
||||||
|
|
||||||
|
UWAGA: performerów/tagów z detail-strony NIE wyciągamy. JSON-LD nie ma pola `actor`,
|
||||||
|
a linki `/pornstar/` i `/category/` na stronie są zaśmiecone sidebarem (popularne
|
||||||
|
pornstars/related) bez czystego scene-scoped kontenera — naiwny regex podpinał te
|
||||||
|
same 2 pornstars do KAŻDEJ sceny (mass-misattribution). Browse to tylko sygnał
|
||||||
|
świeżości (próg watchdog 48h); performerów/tagi dorabia canonical-merge + istniejący
|
||||||
|
search scraper (performer-driven). Listing SSR (/watch/<id>/); homepage JS-renderowany.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from app.connectors.base import RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper, meta_content
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BASE = "https://www.youporn.com"
|
||||||
|
_SCENE_URL_RE = re.compile(r'href="(/watch/\d+[^"]*)"', re.IGNORECASE)
|
||||||
|
_JSONLD_RE = re.compile(
|
||||||
|
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.IGNORECASE | re.DOTALL
|
||||||
|
)
|
||||||
|
_ISO_DUR_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _dur_to_sec(value: str | None) -> int | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
m = _ISO_DUR_RE.match(str(value).strip())
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
total = int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60 + int(m.group(3) or 0)
|
||||||
|
return total or None
|
||||||
|
|
||||||
|
|
||||||
|
def _iso_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(str(value).replace("Z", "+00:00")).date()
|
||||||
|
except ValueError:
|
||||||
|
m = re.match(r"(\d{4}-\d{2}-\d{2})", str(value))
|
||||||
|
return date.fromisoformat(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _video_object(html: str) -> dict | None:
    """Return the first JSON-LD ``VideoObject`` embedded in *html*.

    Every ``<script type="application/ld+json">`` payload is inspected. A payload
    may be a single object, a list of objects, or an object with an ``@graph``
    list; ``@type`` may be a string or a list of strings.

    Args:
        html: Raw HTML of a detail page.

    Returns:
        The matching JSON-LD object as a dict, or ``None`` when no script block
        parses to a ``VideoObject``.
    """
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, ValueError):
            # Malformed JSON-LD in one block must not hide a valid later block.
            continue
        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            # Only trust @graph when it really is a list; a null or otherwise
            # malformed value falls back to treating the object itself as the item.
            items = graph if isinstance(graph, list) else [data]
        else:
            items = []
        for obj in items:
            if not isinstance(obj, dict):
                continue
            types = obj.get("@type")
            if types == "VideoObject" or (isinstance(types, list) and "VideoObject" in types):
                return obj
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class YouPornBrowseScraper(BaseBrowseScraper):
    """Browse scraper for the newest-first youporn.com listing.

    Only title, duration, upload date and thumbnail are taken from the detail
    page; performers and tags are deliberately left empty.
    """

    sitetag = "youporncom"

    def _listing_url(self, page: int) -> str:
        """Return the URL of listing page number *page*."""
        return _BASE + f"/browse/time/?page={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute ``/watch/`` URLs from a listing page, de-duplicated in first-seen order."""
        absolute = (f"{_BASE}{hit.group(1)}" for hit in _SCENE_URL_RE.finditer(listing_html))
        # dict keys keep insertion order, which gives ordered de-duplication.
        return list(dict.fromkeys(absolute))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a detail page, or ``None`` when no title can be found.

        JSON-LD ``VideoObject`` fields are preferred; ``og:title`` / ``og:image``
        meta tags are the fallback for title and thumbnail.
        """
        ld = _video_object(detail_html) or {}

        title = (ld.get("name") or "").strip()
        if not title:
            title = (meta_content(detail_html, property="og:title") or "").strip()
        if not title:
            return None

        seconds = _dur_to_sec(ld.get("duration"))
        published = _iso_date(ld.get("uploadDate") or ld.get("datePublished"))

        thumb = ld.get("thumbnailUrl") or meta_content(detail_html, property="og:image")
        if isinstance(thumb, list):
            # Keep the first entry when the thumbnail value is a list.
            thumb = thumb[0] if thumb else None

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=seconds,
            thumbnail_url=thumb,
        )
        # performers/tags intentionally empty: the detail page has no clean
        # scene-scoped container for them (sidebar links pollute any naive match).
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=seconds,
            release_date=published,
            url=scene_url,
            performers=[],
            tags=[],
            playback_sources=[playback],
            raw={"source": "youporn_browse"},
        )
|
||||||
|
|
@ -1,119 +0,0 @@
|
||||||
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
|
|
||||||
|
|
||||||
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
|
|
||||||
`https://0dayxx.com/0day-porn-video/<slug>/` (lub czasem `/<category>/<slug>/`).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import urllib.parse
|
|
||||||
from collections.abc import Iterator
|
|
||||||
|
|
||||||
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
|
||||||
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
|
||||||
from app.extractors import browser_get
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
_SCENE_URL_RE = re.compile(
|
|
||||||
r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
|
|
||||||
)
|
|
||||||
_OG_TITLE_RE = re.compile(
|
|
||||||
r'<meta\s+property="og:title"\s+content="([^"]+)"', re.IGNORECASE
|
|
||||||
)
|
|
||||||
_OG_IMAGE_RE = re.compile(
|
|
||||||
r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
    """Fetch a 0dayxx detail page and return ``(real_title, thumbnail_url)``.

    0dayxx is a wrapper site (it embeds watchporn.to and others), so duration and
    tags are not available here. The page does expose ``og:image``, which gives a
    small (200x200) thumbnail that is still better than none.

    Without this fetch, 0dayxx scenes entered dedup with the slug as title and no
    thumbnail, i.e. with the two weakest signals at once, which caused either
    missed matches or false-positive merges (reported 2026-05-09).

    Returns:
        ``(title, thumbnail)``; either element is ``None`` when missing, and both
        are ``None`` when the fetch fails or the response is not HTTP 200.
    """
    try:
        resp = browser_get(scene_url, timeout=20)
    except Exception as e:
        # Best effort: the caller falls back to the slug when no detail is available.
        log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
        return None, None
    if resp.status_code != 200:
        return None, None

    title_hit = _OG_TITLE_RE.search(resp.text)
    # og:title sometimes carries a " | 0dayxx.com Daily..." suffix; keep the part before "|".
    title = title_hit.group(1).split("|")[0].strip() if title_hit else None

    image_hit = _OG_IMAGE_RE.search(resp.text)
    thumb = image_hit.group(1).strip() if image_hit else None
    return title, thumb
|
|
||||||
|
|
||||||
|
|
||||||
class ZeroDayXXScraper(BaseDirectTubeScraper):
    """Direct HTML search scraper for 0dayxx.com."""

    sitetag = "0dayxxcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes from one 0dayxx search results page.

        Args:
            query: Search text; it is also attached to every yielded scene as its
                single performer.
            page: 1-based results page number.
            limit: Stop after this many scenes have been yielded (``None`` = no cap).

        Yields:
            One ``RawScene`` per unique scene URL whose slug contains at least one
            query token of three or more characters. Nothing is yielded when the
            search request fails or does not return HTTP 200.
        """
        encoded = urllib.parse.quote_plus(query.strip())
        search_url = f"https://0dayxx.com/page/{page}/?s={encoded}"
        try:
            resp = browser_get(search_url, timeout=30)
        except Exception as e:
            log.warning("0dayxx search fetch failed: %s", e)
            return
        if resp.status_code != 200:
            return

        # Short tokens (< 3 chars) are too noisy to use for the slug relevance check.
        tokens = {word for word in query.lower().split() if len(word) >= 3}

        visited: set[str] = set()
        emitted = 0
        for hit in _SCENE_URL_RE.finditer(resp.text):
            slug = hit.group(2)
            scene_url = hit.group(1) + "/"
            if scene_url in visited:
                continue
            visited.add(scene_url)

            if tokens:
                haystack = slug.lower()
                if all(word not in haystack for word in tokens):
                    continue

            # Detail fetch supplies the real title and thumbnail; the slug is the fallback title.
            real_title, thumb = _fetch_detail(scene_url)
            if real_title:
                title = real_title
            else:
                title = slug.replace("-", " ").strip()

            source = RawPlaybackSource(
                origin="tube:0dayxxcom",
                page_url=scene_url,
                thumbnail_url=thumb,
            )
            yield RawScene(
                external_id=f"0dayxxcom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[source],
                performers=[RawPerformer(name=query.strip())],
                raw={
                    "source": "direct_scraper:0dayxx",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            emitted += 1
            if limit is not None and emitted >= limit:
                return
|
|
||||||
|
|
@ -29,17 +29,13 @@ from app.extractors.tubes import (
|
||||||
_ytdlp,
|
_ytdlp,
|
||||||
eporner,
|
eporner,
|
||||||
freshporno,
|
freshporno,
|
||||||
fourk69,
|
|
||||||
fullmovies,
|
fullmovies,
|
||||||
hdporngg,
|
hdporngg,
|
||||||
hqfap,
|
|
||||||
hqporner,
|
hqporner,
|
||||||
neporn,
|
neporn,
|
||||||
latestpornvideo,
|
latestpornvideo,
|
||||||
paradisehill,
|
paradisehill,
|
||||||
porn00,
|
porn00,
|
||||||
pornditt,
|
|
||||||
pornhat,
|
|
||||||
porntrex,
|
porntrex,
|
||||||
sxyprn,
|
sxyprn,
|
||||||
xhamster,
|
xhamster,
|
||||||
|
|
@ -85,10 +81,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# flashvars `video_url` → `get_file` 302 → CDN time-bound signed URL
|
# flashvars `video_url` → `get_file` 302 → CDN time-bound signed URL
|
||||||
# (`expires`+`md5`, NIE IP-bound) → mobile gra direct, zero VPS bandwidth.
|
# (`expires`+`md5`, NIE IP-bound) → mobile gra direct, zero VPS bandwidth.
|
||||||
"porntrexcom": porntrex.extract,
|
"porntrexcom": porntrex.extract,
|
||||||
# pornditt — KVS jak yespornvip (function/0 + license). VPS dociera → resolve
|
|
||||||
# server-side (decode + follow 302 → portable twa.tgprn.com CDN). Wcześniej WebView
|
|
||||||
# fallback łapał VAST preroll (trafostatic) zamiast contentu. Patrz pornditt.py/_kvs.py.
|
|
||||||
"porndittcom": pornditt.extract,
|
|
||||||
# fpoxxx — KVS, plain get_file + license. 2026-06-01 (task #20): get_file 302 →
|
# fpoxxx — KVS, plain get_file + license. 2026-06-01 (task #20): get_file 302 →
|
||||||
# `videos3.fpo.xxx/remote_control.php?acctoken=<base64>` — zdekodowany acctoken
|
# `videos3.fpo.xxx/remote_control.php?acctoken=<base64>` — zdekodowany acctoken
|
||||||
# zawiera WBITY IP serwera-resolvera → definitywnie IP-bound. WebView only.
|
# zawiera WBITY IP serwera-resolvera → definitywnie IP-bound. WebView only.
|
||||||
|
|
@ -118,10 +110,6 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# ~155k solo-scen upgrade z WebView-z-reklamami na natywne. Wcześniej WebView fallback
|
# ~155k solo-scen upgrade z WebView-z-reklamami na natywne. Wcześniej WebView fallback
|
||||||
# ładował ad-heavy stronę z phone IP (działało, ale gorszy UX + preroll VAST).
|
# ładował ad-heavy stronę z phone IP (działało, ale gorszy UX + preroll VAST).
|
||||||
"xhamstercom": xhamster.extract,
|
"xhamstercom": xhamster.extract,
|
||||||
# PornHat — dedicated extractor: tylko `<source>` z player area (skip sidebar
|
|
||||||
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
|
|
||||||
# follow_redirects=True wymagane (fix w stream_proxy.py).
|
|
||||||
"pornhatcom": pornhat.extract,
|
|
||||||
# Freshporno KVS (function/0 + license). 2026-06-04 DevTools + cross-IP re-test
|
# Freshporno KVS (function/0 + license). 2026-06-04 DevTools + cross-IP re-test
|
||||||
# NAPRAWIA błąd z #20: finalny cdn4.freshporno.org/remote_control.php jest PORTABLE
|
# NAPRAWIA błąd z #20: finalny cdn4.freshporno.org/remote_control.php jest PORTABLE
|
||||||
# (token time-bound nie IP-bound — VPS odtworzył token z residential → 206) ale
|
# (token time-bound nie IP-bound — VPS odtworzył token z residential → 206) ale
|
||||||
|
|
@ -152,9 +140,8 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# — używają identycznego embed-iframe pattern dla streamingu.
|
# — używają identycznego embed-iframe pattern dla streamingu.
|
||||||
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
|
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
|
||||||
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
|
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
|
||||||
# 0dayxx wraps watchporn.to embed. watchporn.to/get_file/ token IP-bound (302→410
|
# 0dayxx + pornditt + pornhat — USUNIĘTE CAŁKOWICIE 2026-06-22 (user request): orphan
|
||||||
# cross-IP). Switch na WebView fallback. ~5k scen.
|
# factories (0–0.2% canonical match), zastępujemy lepszymi źródłami. Dane skasowane.
|
||||||
"0dayxxcom": _vps_blocked_fallback.extract,
|
|
||||||
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
|
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
|
||||||
"perverzijacom": _embed_iframe.extract,
|
"perverzijacom": _embed_iframe.extract,
|
||||||
# Special: WebView-only (Yii2 session-bound player).
|
# Special: WebView-only (Yii2 session-bound player).
|
||||||
|
|
@ -174,16 +161,10 @@ _REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
# (#19866e9e wcześniej źle: założyłem „get_file 403 IP-bound" testem plain-curl.)
|
# (#19866e9e wcześniej źle: założyłem „get_file 403 IP-bound" testem plain-curl.)
|
||||||
"fullmoviesxxx": fullmovies.extract,
|
"fullmoviesxxx": fullmovies.extract,
|
||||||
"hdporngg": hdporngg.extract,
|
"hdporngg": hdporngg.extract,
|
||||||
# hqfap — JSON-LD contentUrl = direct mp4 (cdnde.com nowsze / okcdn.ru starsze).
|
# hqfap + 4k69 (PlayTube CMS) — USUNIĘTE CAŁKOWICIE 2026-06-25. Cała biblioteka CDN
|
||||||
# Cross-IP test 2026-06-10: oba CDN-y portable (`ip=`/`srcIp=` nie egzekwowane),
|
# znikła: każda scena serwuje stały `/upload/videos/video_down.mp4` "server down" stub
|
||||||
# tokeny time-bound → on-demand fetch daje świeży URL. Mobile direct, zero proxy.
|
# zamiast realnego pliku (wide-sample przez pełny zakres id: hqfap 0/80 real, 4k69
|
||||||
"hqfapcom": hqfap.extract,
|
# 0/40 real). Dane skasowane, scrapery/extractory usunięte.
|
||||||
# 4k69 — 2026-06-14 player zmigrowany na jwplayer + okcdn.ru (OK.ru CDN). Natywny
|
|
||||||
# fourk69.extract parsuje okcdn `file`+`label` ze strony (SSR za CF → proxy). okcdn
|
|
||||||
# srcIp NIE egzekwowane (cross-IP test) → mobile_direct_ok, telefon gra direct.
|
|
||||||
# Pełny reverse-engineer w fourk69.py (zgłoszenie 5de3fbc5). [Krótko był na
|
|
||||||
# _vps_blocked_fallback/WebView, ale to łapało VAST preroll zamiast contentu.]
|
|
||||||
"4k69com": fourk69.extract,
|
|
||||||
# neporn — KVS function/0 + license (jak freshporno). Server-side _kvs resolve →
|
# neporn — KVS function/0 + license (jak freshporno). Server-side _kvs resolve →
|
||||||
# data001.neporn.com/remote_control.php portable (cross-IP 206, 2026-06-10).
|
# data001.neporn.com/remote_control.php portable (cross-IP 206, 2026-06-10).
|
||||||
"neporncom": neporn.extract,
|
"neporncom": neporn.extract,
|
||||||
|
|
|
||||||
|
|
@ -215,6 +215,12 @@ _IP_BOUND_CDN_RE = re.compile(
|
||||||
r"premilkyway\.com" # latestpornvideo
|
r"premilkyway\.com" # latestpornvideo
|
||||||
r"|tnmr\.org" # mypornerleak (legacy CDN)
|
r"|tnmr\.org" # mypornerleak (legacy CDN)
|
||||||
r"|acek-cdn\.com" # mypornerleak (current CDN, shared KVS infra)
|
r"|acek-cdn\.com" # mypornerleak (current CDN, shared KVS infra)
|
||||||
|
# xtremestream.xyz (perverzija) — `player/xs1.php?data=` to NIE direct mp4 tylko
|
||||||
|
# IP-bound player endpoint (403 cross-IP z VPS). Stage 1 zwracał go jako type=mp4
|
||||||
|
# → natywny player ładował HTML-player w nieskończoność (reports 06-24 "perverzija
|
||||||
|
# nie działa"/"loading w nieskończoność"). Skip → hoster fallback: WebView ładuje
|
||||||
|
# index.php playera z residential IP telefonu, xs1.php gra w jego sesji.
|
||||||
|
r"|xtremestream\.[a-z]{2,8}"
|
||||||
# URL signature shared across these CDNs: `/hls2/<XX>/<scene_id>/.../master.m3u8?t=<token>&s=<ts>&e=<exp>&srv=<srv>&asn=`
|
# URL signature shared across these CDNs: `/hls2/<XX>/<scene_id>/.../master.m3u8?t=<token>&s=<ts>&e=<exp>&srv=<srv>&asn=`
|
||||||
# — `asn` query param = Autonomous System Number bind. Generic match jako safety net.
|
# — `asn` query param = Autonomous System Number bind. Generic match jako safety net.
|
||||||
r")\b",
|
r")\b",
|
||||||
|
|
|
||||||
|
|
@ -1,68 +0,0 @@
|
||||||
"""4k69.com — okcdn.ru (OK.ru CDN) direct stream extractor.
|
|
||||||
|
|
||||||
2026-06-14: 4k69 zmigrowało player z get_file (4kporno.xxx) na jwplayer + okcdn.ru
|
|
||||||
(OK.ru video CDN). Strona (SSR za Cloudflare → curl_cffi/proxy) ma w inline jwplayer
|
|
||||||
setupie pary `"file": "<okcdn url>", "label": "<jakość>"` na WSZYSTKIE jakości
|
|
||||||
(4K/2K/1080p/720p/480p/360p/240p). To samo w LD-JSON `contentUrl` (jeden, niższy).
|
|
||||||
|
|
||||||
okcdn URL ma `expires=` (time-bound), `srcIp=` (IP edge Cloudflare który frontował
|
|
||||||
fetch) i `sig=` per jakość. KLUCZOWE (reverse-engineer + cross-IP test 2026-06-14):
|
|
||||||
`srcIp` NIE jest egzekwowane — URL gra z dowolnego IP (206 video/mp4 z residential IP
|
|
||||||
≠ srcIp). Więc resolwujemy server-side i oddajemy `mobile_direct_ok` → telefon gra
|
|
||||||
DIRECT, zero VPS proxy, zero WebView/reklam (VAST preroll jest runtime-only, nie ma go
|
|
||||||
w statycznym HTML, więc parsując HTML omijamy go całkiem).
|
|
||||||
|
|
||||||
Pomijamy 4K/2K (jak wcześniej 2160/1440 — za duże na mobile).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.extractors._fetch import fetch_tube_html
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Pary file+label z jwplayer setupu: "file":"<okcdn>","label":"1080p". Label bierzemy
|
|
||||||
# wprost ze strony (pewniejsze niż mapowanie OK.ru type=N).
|
|
||||||
_OKCDN_FILE_RE = re.compile(
|
|
||||||
r'"file"\s*:\s*"(https?://[^"]*okcdn[^"]+)"\s*,\s*"label"\s*:\s*"([^"]+)"',
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
# Za duże na mobile (jak stary skip 2160/1440).
|
|
||||||
_SKIP_LABEL_RE = re.compile(r"^(4k|2k|2160|1440)", re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def _quality_num(label: str) -> int:
|
|
||||||
m = re.match(r"(\d+)", label or "")
|
|
||||||
return int(m.group(1)) if m else 0
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract okcdn stream sources from a 4k69 scene page.

    The page HTML is scanned for ``"file": "<okcdn url>", "label": "<quality>"``
    pairs. Duplicate URLs and labels matched by ``_SKIP_LABEL_RE`` are dropped.

    Args:
        page_url: Scene page URL.
        timeout: Timeout passed to ``fetch_tube_html``.

    Returns:
        Sources sorted by the numeric part of the quality label, highest first,
        or ``None`` when the page contains no usable okcdn source.
    """
    html = fetch_tube_html(page_url, timeout=timeout)

    seen: set[str] = set()
    out: list[StreamSource] = []
    for m in _OKCDN_FILE_RE.finditer(html):
        # The URL is lifted from HTML, so query separators may be entity-encoded;
        # decode them, otherwise the signed query string is corrupted.
        url = m.group(1).replace("&amp;", "&")
        label = m.group(2).strip()
        if url in seen:
            continue
        seen.add(url)
        if _SKIP_LABEL_RE.match(label):
            continue
        out.append(StreamSource(
            link=url,
            quality=label,
            type="mp4",
            referer="https://4k69.com/",
            # srcIp is not enforced (cross-IP test 2026-06-14), so the phone can play direct.
            raw={"mobile_direct_ok": True},
        ))

    if not out:
        log.info("4k69: no okcdn sources on %s", page_url)
        return None
    out.sort(key=lambda s: _quality_num(s.quality or ""), reverse=True)
    return out
|
|
||||||
|
|
@ -1,79 +0,0 @@
|
||||||
"""hqfap.com — direct stream extractor.
|
|
||||||
|
|
||||||
Scene page (SSR, za Cloudflare → curl_cffi w fetch_tube_html) ma JSON-LD
|
|
||||||
VideoObject z `contentUrl` = direct mp4. Dwie generacje hostingu w katalogu:
|
|
||||||
|
|
||||||
- nowsze sceny: `v4.cdnde.com/...?video=<b64>&time=<epoch>&ip=<addr>` — param
|
|
||||||
`ip` NIE jest egzekwowany (cross-IP test 2026-06-10: lokalny ISP i VPS Hetzner
|
|
||||||
oba 206), token time-bound → resolve on-demand daje świeży URL,
|
|
||||||
- starsze sceny: `vd*.okcdn.ru/?expires=...&srcIp=...&sig=...` (ok.ru) — również
|
|
||||||
portable cross-IP (206 z innego IP niż fetcher).
|
|
||||||
|
|
||||||
Mobile gra direct (mobile_direct auto-detect w playback.py), zero proxy/WebView.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
|
|
||||||
from app.extractors._fetch import fetch_tube_html
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_JSONLD_RE = re.compile(
|
|
||||||
r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
|
|
||||||
re.IGNORECASE | re.DOTALL,
|
|
||||||
)
|
|
||||||
# Fallback gdy JSON-LD nie parsuje się jako JSON (trailing comma itp.).
|
|
||||||
_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+)"')
|
|
||||||
_QUALITY_RE = re.compile(r"_(\d{3,4})p\.mp4", re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract the direct mp4 source of an hqfap scene from its JSON-LD ``contentUrl``.

    Each ``application/ld+json`` block is parsed and the first ``VideoObject``
    with a non-empty ``contentUrl`` wins. When no block yields one, a plain regex
    over the page is used as a fallback (for JSON-LD that does not parse).

    Args:
        page_url: Scene page URL.
        timeout: Timeout passed to ``fetch_tube_html``.

    Returns:
        A single-element source list, or ``None`` when no http(s) ``contentUrl``
        is found or it points at the known ``video_down.mp4`` placeholder.
    """
    html = fetch_tube_html(page_url, timeout=timeout)

    content_url: str | None = None
    for m in _JSONLD_RE.finditer(html):
        raw = m.group(1).strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(data, list):
            items = data
        elif isinstance(data, dict) and isinstance(data.get("@graph"), list):
            # Objects may be nested under @graph instead of sitting at top level.
            items = data["@graph"]
        else:
            items = [data]
        for obj in items:
            if not isinstance(obj, dict):
                continue
            types = obj.get("@type")
            if types == "VideoObject" or (isinstance(types, list) and "VideoObject" in types):
                content_url = (obj.get("contentUrl") or "").strip() or None
                break
        if content_url:
            break
    if not content_url:
        rm = _CONTENT_URL_RE.search(html)
        # The regex captures the raw JSON string, so undo the "\/" escaping that
        # JSON encoders may apply to slashes; otherwise the URL is unusable.
        content_url = rm.group(1).strip().replace("\\/", "/") if rm else None
    if not content_url or not content_url.startswith("http"):
        log.warning("hqfap: no contentUrl in JSON-LD for %s", page_url)
        return None

    # hqfap migrated: `/upload/videos/video_down.mp4` (and its *.workers.dev mirror)
    # serves a FIXED ~3MB placeholder for EVERY scene regardless of declared length
    # (5/5 scenes = 3.04MB at 14-47min, verified 2026-06-21; user reports of
    # "server down" c382d441/ef10b946). It is not real video, so treat it as no
    # source at all. Real older scenes (cdnde.com / okcdn.ru direct mp4) pass through.
    if "/upload/videos/video_down.mp4" in content_url:
        log.info("hqfap: stub video_down.mp4 (placeholder, no real video) on %s", page_url)
        return None

    qm = _QUALITY_RE.search(content_url)
    quality = f"{qm.group(1)}p" if qm else None
    return [
        StreamSource(
            link=content_url,
            quality=quality,
            type="mp4",
            referer="https://hqfap.com/",
        )
    ]
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
"""pornditt.com — KVS (kt_player) direct stream extractor. Patrz app/extractors/tubes/_kvs.py.
|
|
||||||
|
|
||||||
User bug 2026-05-31 (scene 40f118e1): "Pornditt łapie reklamę zamiast video". pornditt
|
|
||||||
był na _vps_blocked_fallback (WebView), gdzie scrape łapał VAST preroll (trafostatic) zamiast
|
|
||||||
contentu. Identyczny silnik jak yespornvip: flashvars `video_url`/`video_alt_url` =
|
|
||||||
`function/0/...get_file/...` + `license_code`; VPS dociera (HTTP 200). Resolve server-side:
|
|
||||||
decode + follow 302 → portable CDN (twa.tgprn.com, time-bound, NIE IP/cookie-bound —
|
|
||||||
zweryfikowane cross-IP 2026-06-01 fresh session → 206 video/mp4). Native, multi-quality,
|
|
||||||
zero WebView/reklam.
|
|
||||||
|
|
||||||
NB: runtime `window.flashvars.video_url` pokazuje już ZDEKODOWANY plain get_file, ale raw
|
|
||||||
HTML (server-fetch) ma formę `function/0/...` + license — dekodujemy sami (_kvs.real_url).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
from app.extractors.tubes import _kvs
|
|
||||||
|
|
||||||
_BASE = "https://v.pornditt.com"
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Resolve pornditt stream sources by delegating to the shared KVS resolver."""
    sources = _kvs.resolve_kvs(page_url, base_url=_BASE, timeout=timeout)
    return sources
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
"""pornhat.com — KVS engine. get_file 302 → HLS m3u8 manifest.
|
|
||||||
|
|
||||||
**2026-05-18 bandwidth optimization**: pornhat CDN tokens (`cdn.privatehost.com`) są
|
|
||||||
**time-bound, nie IP-bound** (`?sign=<HMAC>&exp_time=<unix>`). Zweryfikowane Chrome
|
|
||||||
DevTools MCP — VPS-resolved URL działa z każdego IP, bez Referer header. Zamiast
|
|
||||||
zwracać `pornhat.com/get_file/` URL (mobile dostaje go i robi 302 chain przez VPS
|
|
||||||
proxy), robimy server-side resolve i zwracamy końcowy manifest URL z signed token.
|
|
||||||
|
|
||||||
Mobile ExoPlayer otrzymuje:
|
|
||||||
`https://nvms12.cdn.privatehost.com/hls/contents/.../?sign=...&exp_time=...`
|
|
||||||
i pobiera manifest + segments direct z CDN. **Zero VPS bandwidth** (poza ~5KB
|
|
||||||
initial resolve fetch).
|
|
||||||
|
|
||||||
`mobile_direct_ok=True` w `raw` mówi playback.py że dla type=m3u8 ten URL jest OK
|
|
||||||
dla `direct_url=raw_url` (zazwyczaj m3u8 by szły przez proxy).
|
|
||||||
|
|
||||||
Token wygasa za ~30-120 min od resolve (depends na lra param). User pause+resume
|
|
||||||
po >2h może dostać 403 → mobile fallback na proxified URL re-resolve'a.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from app.extractors._models import StreamSource
|
|
||||||
from app.extractors.tubes._kvs_source import extract_kvs_sources
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_get_file_redirect(get_file_url: str, *, timeout: float = 15.0) -> str | None:
    """Follow the redirect chain of a pornhat ``get_file`` URL with a HEAD request.

    Returns:
        The final URL when it is on ``cdn.privatehost.com`` or contains ``.m3u8``;
        ``None`` when the final URL is anything else or the request raises.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer": "https://www.pornhat.com/",
    }
    try:
        with httpx.Client(
            timeout=timeout,
            follow_redirects=True,
            headers=request_headers,
        ) as client:
            landed = str(client.head(get_file_url).url)
    except Exception as e:
        log.warning("pornhat resolve %s failed: %s", get_file_url, e)
        return None

    # The CDN serves /hls/contents/... as an m3u8 manifest even without ".m3u8"
    # in the path, so a privatehost URL is accepted as-is alongside explicit .m3u8 URLs.
    if "cdn.privatehost.com" in landed or ".m3u8" in landed:
        return landed
    log.info("pornhat resolve: unexpected final URL %s", landed)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def extract(page_url: str, *, timeout: float = 60.0) -> list[StreamSource] | None:
    """Extract pornhat sources and swap each ``get_file`` URL for its resolved final URL.

    Returns:
        One entry per KVS source, in the original order: a new m3u8 source flagged
        ``mobile_direct_ok`` when the redirect resolves, otherwise the original
        source unchanged. ``None`` when the KVS extraction yields nothing.
    """
    kvs_sources = extract_kvs_sources(
        page_url, stream_type="m3u8", timeout=timeout, log_tag="pornhat"
    )
    if not kvs_sources:
        return None

    def _direct_or_original(src: StreamSource) -> StreamSource:
        final = _resolve_get_file_redirect(src.link)
        if not final:
            # Fallback: keep the original (the proxy will re-resolve it).
            return src
        return StreamSource(
            link=final,
            type="m3u8",
            quality=src.quality,
            referer=src.referer,
            raw={"mobile_direct_ok": True},
        )

    return [_direct_or_original(src) for src in kvs_sources]
|
|
||||||
|
|
@ -16,6 +16,13 @@ export type ChangelogEntry = {
|
||||||
};
|
};
|
||||||
|
|
||||||
export const CHANGELOG: ChangelogEntry[] = [
|
export const CHANGELOG: ChangelogEntry[] = [
|
||||||
|
{
|
||||||
|
id: '2026-06-25',
|
||||||
|
date: 'June 2026',
|
||||||
|
items: [
|
||||||
|
'sxyprn and a few other sites: if a video fails to start, the app now grabs a fresh link and retries automatically instead of just hanging.',
|
||||||
|
],
|
||||||
|
},
|
||||||
{
|
{
|
||||||
id: '2026-06-22b',
|
id: '2026-06-22b',
|
||||||
date: 'June 2026',
|
date: 'June 2026',
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,8 @@ export type RootStackParamList = {
|
||||||
// 'tube:<sitetag>' źródła — telemetria odtwarzania zasilająca ranking źródeł.
|
// 'tube:<sitetag>' źródła — telemetria odtwarzania zasilająca ranking źródeł.
|
||||||
// Opcjonalne; brak → telemetria pomijana (canonical/non-tube).
|
// Opcjonalne; brak → telemetria pomijana (canonical/non-tube).
|
||||||
origin?: string;
|
origin?: string;
|
||||||
|
// Post page URL dla IP-bound tubów (sxyprn/eporner/fpoxxx) — re-resolve on error.
|
||||||
|
resolvePageUrl?: string;
|
||||||
// 'movie' = MovieDetail wywołał Player z movieId zamiast sceneId. Backend
|
// 'movie' = MovieDetail wywołał Player z movieId zamiast sceneId. Backend
|
||||||
// ma /movies/{id}/progress oddzielnie od /scenes/{id}/progress (2026-05-28).
|
// ma /movies/{id}/progress oddzielnie od /scenes/{id}/progress (2026-05-28).
|
||||||
// Default 'scene' dla back-compat z istniejącymi nav callami.
|
// Default 'scene' dla back-compat z istniejącymi nav callami.
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ import { WebView, type WebViewMessageEvent } from 'react-native-webview';
|
||||||
import { useClient } from '../ClientContext';
|
import { useClient } from '../ClientContext';
|
||||||
import type { RootStackParamList } from '../navigation';
|
import type { RootStackParamList } from '../navigation';
|
||||||
import { theme } from '../theme';
|
import { theme } from '../theme';
|
||||||
|
import type { StreamLink } from '../types';
|
||||||
|
|
||||||
interface RouteParams {
|
interface RouteParams {
|
||||||
url: string;
|
url: string;
|
||||||
|
|
@ -30,6 +31,11 @@ interface RouteParams {
|
||||||
// 'tube:<sitetag>' źródła — do telemetrii odtwarzania (ranking źródeł). Opcjonalne;
|
// 'tube:<sitetag>' źródła — do telemetrii odtwarzania (ranking źródeł). Opcjonalne;
|
||||||
// brak → telemetria pomijana (np. canonical/paradisehill bez tube-origin).
|
// brak → telemetria pomijana (np. canonical/paradisehill bez tube-origin).
|
||||||
origin?: string;
|
origin?: string;
|
||||||
|
// Post/scene page URL dla tubów IP-bound resolwowanych phone-side (sxyprn/eporner/
|
||||||
|
// fpoxxx). Gdy native player padnie na initial-load (token bound do innego IP /
|
||||||
|
// wygasł), Player RE-RESOLVUJE świeżo z tej strony (nowy token, bieżący IP) zamiast
|
||||||
|
// retry martwego URL-a. Zero VPS bandwidth. Ustawiane przez SceneDetail.openAsVideo.
|
||||||
|
resolvePageUrl?: string;
|
||||||
// 'scene' (default — back-compat z istniejącymi nav callami) lub 'movie'.
|
// 'scene' (default — back-compat z istniejącymi nav callami) lub 'movie'.
|
||||||
// Player dispatcheruje upsertProgress vs upsertMovieProgress. Wcześniej
|
// Player dispatcheruje upsertProgress vs upsertMovieProgress. Wcześniej
|
||||||
// MovieDetail przekazywał movieId jako sceneId — backend /scenes/<movieId>/
|
// MovieDetail przekazywał movieId jako sceneId — backend /scenes/<movieId>/
|
||||||
|
|
@ -140,7 +146,7 @@ export function PlayerScreen() {
|
||||||
function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
||||||
const client = useClient();
|
const client = useClient();
|
||||||
const nav = useNavigation<NativeStackNavigationProp<RootStackParamList, 'Player'>>();
|
const nav = useNavigation<NativeStackNavigationProp<RootStackParamList, 'Player'>>();
|
||||||
const { url, sceneId, origin: playOrigin, entityKind, durationSec, refererHost, title, fallbackEmbedUrl, headers: paramHeaders, fallbackProxyUrl } = params;
|
const { url, sceneId, origin: playOrigin, resolvePageUrl, entityKind, durationSec, refererHost, title, fallbackEmbedUrl, headers: paramHeaders, fallbackProxyUrl } = params;
|
||||||
const { markBroken, canMark, busy: markBusy } = useMarkSourceBroken(params);
|
const { markBroken, canMark, busy: markBusy } = useMarkSourceBroken(params);
|
||||||
// 'movie' → /movies/{id}/progress, 'scene' (default) → /scenes/{id}/progress.
|
// 'movie' → /movies/{id}/progress, 'scene' (default) → /scenes/{id}/progress.
|
||||||
const upsertProgress = React.useCallback(
|
const upsertProgress = React.useCallback(
|
||||||
|
|
@ -199,6 +205,14 @@ function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
||||||
// Każdy step ma osobną ref żeby nie loopować.
|
// Każdy step ma osobną ref żeby nie loopować.
|
||||||
const didFallbackProxyRef = React.useRef(false);
|
const didFallbackProxyRef = React.useRef(false);
|
||||||
const didFallbackWebViewRef = React.useRef(false);
|
const didFallbackWebViewRef = React.useRef(false);
|
||||||
|
// Re-resolve dla IP-bound tubów (sxyprn/eporner/fpoxxx): token jest bound do IP
|
||||||
|
// które pobrało stronę; jeśli IP się zmieniło (CGNAT/przełączenie sieci) albo token
|
||||||
|
// wygasł, native player pada na initial-load. Zamiast retry martwego URL-a pobieramy
|
||||||
|
// stronę ŚWIEŻO (bieżący IP) i podmieniamy źródło. Flaga `reResolveDone` gate'uje
|
||||||
|
// łańcuch fallback (proxy/WebView) póki re-resolve nie skończy — i jest no-op dla
|
||||||
|
// tubów BEZ resolvePageUrl (czyli zero wpływu na resztę).
|
||||||
|
const didReResolveRef = React.useRef(false);
|
||||||
|
const [reResolveDone, setReResolveDone] = React.useState(false);
|
||||||
// Seek/decode recovery (bug f6c86847: doply/playmogo „invalid NAL length” przy
|
// Seek/decode recovery (bug f6c86847: doply/playmogo „invalid NAL length” przy
|
||||||
// przewijaniu). Stream jest poprawny — faststart MP4, CDN wspiera Range 206
|
// przewijaniu). Stream jest poprawny — faststart MP4, CDN wspiera Range 206
|
||||||
// (zweryfikowane 2026-06-01 cross-IP) — więc to wewnętrzny błąd seeka ExoPlayera,
|
// (zweryfikowane 2026-06-01 cross-IP) — więc to wewnętrzny błąd seeka ExoPlayera,
|
||||||
|
|
@ -212,6 +226,45 @@ function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
||||||
React.useEffect(() => {
|
React.useEffect(() => {
|
||||||
if (status === 'readyToPlay') loadedOnceRef.current = true;
|
if (status === 'readyToPlay') loadedOnceRef.current = true;
|
||||||
}, [status]);
|
}, [status]);
|
||||||
|
|
||||||
|
// Re-resolve IP-bound tubów (sxyprn/eporner/fpoxxx) na initial-load error: pobierz
|
||||||
|
// stronę ŚWIEŻO z urządzenia (token bound do bieżącego IP) i podmień źródło. Tylko
|
||||||
|
// 1× / mount. No-op gdy brak resolvePageUrl. Po zakończeniu (sukces lub nie)
|
||||||
|
// ustawia reResolveDone → odblokowuje łańcuch fallback gdy nie pomogło.
|
||||||
|
React.useEffect(() => {
|
||||||
|
if (status !== 'error' || loadedOnceRef.current) return;
|
||||||
|
if (didReResolveRef.current || !resolvePageUrl || !playOrigin) return;
|
||||||
|
if (isGoneError(playerError?.message)) return; // skasowany post → niech łańcuch oznaczy dead
|
||||||
|
didReResolveRef.current = true;
|
||||||
|
let cancelled = false;
|
||||||
|
(async () => {
|
||||||
|
try {
|
||||||
|
let links: StreamLink[] = [];
|
||||||
|
if (playOrigin === 'tube:sxyprncom') {
|
||||||
|
links = await (await import('../lib/sxyprnResolver')).resolveSxyprnPage(resolvePageUrl);
|
||||||
|
} else if (playOrigin === 'tube:epornercom') {
|
||||||
|
links = await (await import('../lib/epornerResolver')).resolveEpornerPage(resolvePageUrl);
|
||||||
|
} else if (playOrigin === 'tube:fpoxxx') {
|
||||||
|
links = await (await import('../lib/fpoxxxResolver')).resolveFpoxxxPage(resolvePageUrl);
|
||||||
|
}
|
||||||
|
const fresh = links?.[0];
|
||||||
|
const freshUrl = fresh?.direct_url || fresh?.stream_url;
|
||||||
|
if (!cancelled && freshUrl && freshUrl !== url) {
|
||||||
|
player.replace(fresh?.headers ? { uri: freshUrl, headers: fresh.headers } : freshUrl);
|
||||||
|
player.play();
|
||||||
|
return; // sukces → status zmieni się z 'error', łańcuch fallback nie ruszy
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore → łańcuch fallback przejmie
|
||||||
|
} finally {
|
||||||
|
if (!cancelled) setReResolveDone(true);
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
return () => {
|
||||||
|
cancelled = true;
|
||||||
|
};
|
||||||
|
}, [status, resolvePageUrl, playOrigin, playerError, player, url]);
|
||||||
|
|
||||||
React.useEffect(() => {
|
React.useEffect(() => {
|
||||||
if (status !== 'error') return;
|
if (status !== 'error') return;
|
||||||
// Step 0: post-load decode/seek error → recover in-place (przed proxy/WebView,
|
// Step 0: post-load decode/seek error → recover in-place (przed proxy/WebView,
|
||||||
|
|
@ -238,6 +291,9 @@ function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// Gate: dla IP-bound tubów (resolvePageUrl) poczekaj aż re-resolve się zakończy
|
||||||
|
// zanim ruszysz proxy/WebView. No-op gdy brak resolvePageUrl (reszta tubów).
|
||||||
|
if (resolvePageUrl && !reResolveDone) return;
|
||||||
// Step 1 → 2: direct fail (403/410/etc), spróbuj proxy URL.
|
// Step 1 → 2: direct fail (403/410/etc), spróbuj proxy URL.
|
||||||
if (fallbackProxyUrl && !didFallbackProxyRef.current && url !== fallbackProxyUrl) {
|
if (fallbackProxyUrl && !didFallbackProxyRef.current && url !== fallbackProxyUrl) {
|
||||||
didFallbackProxyRef.current = true;
|
didFallbackProxyRef.current = true;
|
||||||
|
|
@ -272,7 +328,7 @@ function NativeVideoPlayer({ params }: { params: RouteParams }) {
|
||||||
mode: 'webview',
|
mode: 'webview',
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}, [status, fallbackProxyUrl, fallbackEmbedUrl, url, nav, sceneId, durationSec, refererHost, title, player, source, playerError]);
|
}, [status, fallbackProxyUrl, fallbackEmbedUrl, url, nav, sceneId, durationSec, refererHost, title, player, source, playerError, resolvePageUrl, reResolveDone, playOrigin]);
|
||||||
|
|
||||||
// Telemetria odtwarzania (ranking źródeł). Tylko native-player path (WebView mode
|
// Telemetria odtwarzania (ranking źródeł). Tylko native-player path (WebView mode
|
||||||
// ma osobny komponent, nie umiemy tam wykryć sukcesu → pomijamy, fair). Jeden ping
|
// ma osobny komponent, nie umiemy tam wykryć sukcesu → pomijamy, fair). Jeden ping
|
||||||
|
|
|
||||||
|
|
@ -548,10 +548,14 @@ function PlaybackButton({
|
||||||
if (resolved) initialUrl = resolved;
|
if (resolved) initialUrl = resolved;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IP-bound tuby resolwowane phone-side: przekaż page_url, by Player mógł
|
||||||
|
// re-resolve'ować świeży token gdy native padnie na initial-load (zmiana IP / TTL).
|
||||||
|
const PHONE_RESOLVE_ORIGINS = ['tube:sxyprncom', 'tube:epornercom', 'tube:fpoxxx'];
|
||||||
nav.navigate('Player', {
|
nav.navigate('Player', {
|
||||||
url: initialUrl,
|
url: initialUrl,
|
||||||
sceneId,
|
sceneId,
|
||||||
origin: source.origin,
|
origin: source.origin,
|
||||||
|
resolvePageUrl: PHONE_RESOLVE_ORIGINS.includes(source.origin) ? source.page_url : undefined,
|
||||||
playbackId: source.id,
|
playbackId: source.id,
|
||||||
durationSec: sceneDurationSec,
|
durationSec: sceneDurationSec,
|
||||||
refererHost,
|
refererHost,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue