From 642f1ab8b888fa438508638d011b953429c21d29 Mon Sep 17 00:00:00 2001 From: "https://github.com/goon-foss/goon" Date: Fri, 22 May 2026 11:20:57 +0200 Subject: [PATCH] Mobile 0.1.9: OTA enable, WebView cookie-dismiss fix, porndoe connector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mobile / OTA: - Enable Expo Updates (app.json + AndroidManifest) → api.goon-foss.org - Bump 0.1.6 → 0.1.9 (build.gradle, app.json, appVersion.ts, main.py /version) - backend.ts: default public backend auto-connect (no manual login) WebView fallback fix (PlayerScreen INJECTED_JS): - Auto-dismiss cookie/consent gates (hqporner et al. blocked kt_player init) - Context-scoped: only clicks consent buttons inside cookie/gdpr containers - Retry window for .src polling raised 5→15 ticks (post-dismiss init) Resolver: - Series-position + modifier mismatch detector (Episode 2≠4, BTS/unedited) → composite_score hard-reject / cap; wired into scene_score + bulk_dedup - aggregator-mode candidate query: LIMIT 500 + title-match ordering Connectors: - porndoe.com browse scraper (JSON-LD VideoObject) — theporndude audit pilot landing: APK links → goon-v0.1.9.apk Co-Authored-By: Claude Opus 4.7 --- .gitignore | 7 + app/api/playback.py | 22 +- app/config.py | 13 +- app/connectors/direct_scrapers/__init__.py | 8 + app/connectors/direct_scrapers/freshporno.py | 14 + app/connectors/direct_scrapers/porndoe.py | 271 ++++++++++++++++++ app/extractors/__init__.py | 29 +- app/extractors/tubes/freshporno.py | 10 +- app/extractors/tubes/hqporner.py | 10 +- app/main.py | 2 +- app/resolve/scene_resolver.py | 82 +++++- app/resolve/scene_score.py | 6 + app/resolve/scoring.py | 105 +++++++ app/scheduler/bulk_dedup.py | 4 + app/scheduler/jobs.py | 34 +++ app/scheduler/performer_driven.py | 48 +++- app/scheduler/worker.py | 3 + landing/index.html | 74 +++-- mobile/App.tsx | 9 +- mobile/android/app/build.gradle | 4 +- .../android/app/src/main/AndroidManifest.xml | 12 +- 
mobile/app.json | 6 +- mobile/src/lib/appVersion.ts | 2 +- mobile/src/lib/backend.ts | 12 + mobile/src/screens/PlayerScreen.tsx | 51 +++- scripts/check_all_hosters.py | 50 ++++ scripts/check_series_detector.py | 29 ++ scripts/goon_debug_proxy.py | 92 ++++++ scripts/test_porndoe_scraper.py | 59 ++++ scripts/theporndude_coverage_check.py | 29 ++ scripts/theporndude_coverage_match.py | 118 ++++++++ scripts/theporndude_curl_triage.py | 176 ++++++++++++ scripts/theporndude_movies_pipeline.py | 234 +++++++++++++++ scripts/theporndude_resolve_domains.py | 87 ++++++ scripts/theporndude_scorecard.py | 104 +++++++ tests/test_scoring.py | 109 +++++++ 36 files changed, 1825 insertions(+), 100 deletions(-) create mode 100644 app/connectors/direct_scrapers/porndoe.py create mode 100644 mobile/src/lib/backend.ts create mode 100644 scripts/check_all_hosters.py create mode 100644 scripts/check_series_detector.py create mode 100644 scripts/goon_debug_proxy.py create mode 100644 scripts/test_porndoe_scraper.py create mode 100644 scripts/theporndude_coverage_check.py create mode 100644 scripts/theporndude_coverage_match.py create mode 100644 scripts/theporndude_curl_triage.py create mode 100644 scripts/theporndude_movies_pipeline.py create mode 100644 scripts/theporndude_resolve_domains.py create mode 100644 scripts/theporndude_scorecard.py diff --git a/.gitignore b/.gitignore index 3726b50..21b4bf0 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,13 @@ mcp-logs.txt # ADB / development debug artefakty (screenshots, ui dumps) .tmp_adb/ +# Marketing screenshots — kept local, hosted externally for posts/landing. +# NOT committed: explicit thumbnails risk GitHub TOS takedown. +screenshots/ + +# Launch / marketing material — local working notes, not part of the codebase. +launch/ + # Operational deploy scripts — moved to a private companion repo. Public repo # should NOT contain SSH commands, systemd units, or smoke-test playbooks # referencing concrete hosts. 
diff --git a/app/api/playback.py b/app/api/playback.py index 9bc8474..e391dac 100644 --- a/app/api/playback.py +++ b/app/api/playback.py @@ -184,21 +184,19 @@ def resolve_movie_playback( pb.id, ) stream = None + # Mixdrop mxcontent CDN wymaga curl_cffi JA3 → wymusza VPS proxy. + # Pre-public: skip mixdrop direct, fallback na embed_url (mobile WebView z + # phone IP). Bandwidth + anonimowość VPS > UX. Movie ma zwykle 10+ alt + # hosterów (voe/luluvid/doply/etc.), user może wybrać alternative. + if stream and "mxcontent.net" in stream.lower(): + log.info( + "movie playback %s: mixdrop mxcontent — skip (VPS-proxy required), WebView fallback", + pb.id, + ) + stream = None if stream: type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4" - # Hostery których CDN wymaga Chrome JA3 (mxcontent dla mixdrop): - # proxy MUSI użyć curl_cffi impersonate inaczej 403. `proxy_impersonate=True` - # idzie przez `raw` → `_proxify_link` ustawi token `i=1`. - cdn_needs_impersonate = "mxcontent.net" in stream.lower() raw_meta: dict = {"origin": pb.origin, "host": target} - if cdn_needs_impersonate: - raw_meta["proxy_impersonate"] = True - # Mixdrop: same-session cookies + chrome JA3 wymagane dla mp4. - # Backend extract zamknął sesję — proxy musi re-fetchować - # embed page w fresh curl_cffi session żeby re-extract mp4 - # z aktualnymi cookies. - raw_meta["refetch_url"] = target - raw_meta["refetch_hoster"] = "mixdrop" links.append( StreamLink( stream_url=stream, diff --git a/app/config.py b/app/config.py index fe1862d..f28eaf5 100644 --- a/app/config.py +++ b/app/config.py @@ -72,13 +72,22 @@ class Settings(BaseSettings): sched_movie_ingest_hours: int = Field( default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS" ) - # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes raz dziennie. + # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes. + # 6h cadence (zmiana z 24h 2026-05-20): user reportował brak Brazzers Exxtra po + # 15-05. 
Root cause był 2-fold: (1) freshporno publikuje sceny w ciągu dnia, 24h + # cadence łapie tylko te do 05:30 UTC; (2) meta_content/release_date bug osobno. + # 6h = 4 runs/dzień = każda freshporno scena zaingestowana w ciągu ~6h od publik. sched_browse_latest_hours: int = Field( - default=24, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS" + default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS" ) sched_browse_latest_max_pages: int = Field( default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES" ) + # Bulk-dedup performers safety net — auto-merge duplikatów które resolver-time + # scoring pominął. 12h cadence: leci 2x dziennie (po porannym browse-latest run). + sched_bulk_dedup_hours: int = Field( + default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS" + ) # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens # w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log). diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index b65f05a..16bb444 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -137,6 +137,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.** from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402 from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402 +from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402 from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402 from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401 @@ -152,6 +153,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title # fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować. 
Porn00Scraper, + # PornDoeScraper — dołączony 2026-05-21 (theporndude audit). Każda scena ma + # kompletny JSON-LD VideoObject: title + uploadDate + duration + named studio + # (producer/publisher) + named performers (actor[]) + thumbnail. Najbogatsze + # strukturalne metadane spośród browse scraperów — composite fuzzy match ma + # komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio + + # performer + date + duration nadrabiają. + PornDoeScraper, # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). # Follow-up: dorobić te tubey i sprawdzić phash distance: # - fullmovies.xxx (channel/network/pornstars/categories, brak duration) diff --git a/app/connectors/direct_scrapers/freshporno.py b/app/connectors/direct_scrapers/freshporno.py index dda0dc5..0390985 100644 --- a/app/connectors/direct_scrapers/freshporno.py +++ b/app/connectors/direct_scrapers/freshporno.py @@ -163,11 +163,25 @@ class FreshpornoScraper(BaseBrowseScraper): ) ] + # Release date — freshporno emituje `<meta itemprop="uploadDate" content="YYYY-MM-DD…">`. + # To data wrzucenia na freshporno, NIE oryginalna release_date studio — ale dla + # świeżych scen (uploaded niedługo po publikacji) różnica ≤ 3-7 dni, mieści się w + # `date_window_days=7` w resolverze. Bez tego pola scene NULL → match score 0 → + # duplicate scene zamiast freshporno PS dodane do TPDB canonical (bug-report + # 2026-05-20: brak Brazzers Exxtra po 15-05). 
+ release_date_parsed: date | None = None + if (m := re.search(r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', detail_html)): + try: + release_date_parsed = date.fromisoformat(m.group(1)) + except ValueError: + pass + return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, + release_date=release_date_parsed, url=scene_url, studio=studio, performers=performers, diff --git a/app/connectors/direct_scrapers/porndoe.py b/app/connectors/direct_scrapers/porndoe.py new file mode 100644 index 0000000..84315fc --- /dev/null +++ b/app/connectors/direct_scrapers/porndoe.py @@ -0,0 +1,271 @@ +"""porndoe.com — latest-vids browse scraper. + +Dołączony 2026-05-21 (theporndude audit). Jedyny verified high-value candidate +z 172 tube'ów na theporndude.com/top-porn-tube-sites + /full-porn-movies-sites. + +Czemu wart: każda scena ma kompletny **JSON-LD VideoObject** schema: + - name (title), description, uploadDate (ISO timestamp), duration (ISO 8601) + - producer + publisher → named studio z `/channel-profile/` URL + - actor[] → named performers z `/pornstars-profile/` URL + - thumbnailUrl (CDN p.cdnc.porndoe.com) + +To wystarczy do composite fuzzy match w resolverze (studio + performer Jaccard + +date proximity + title token-set + duration). Phash hit-rate niski (porndoe robi +własne crop-thumbnaile 390x219, nie hot-linkuje studio art) — ale rich metadata +nadrabia, jak pornxp/porn00. + +URL patterns: + - Listing: `/videos/most-recent?page=N` (page 1 = newest, ~31 scen/page) + - Scene: `/watch/` gdzie id = `pd` + 10 alfanum (stable) + - Studio: `/channel-profile/` + - Performer: `/pornstars-profile/` + - Tags/categories: `/categories/` + +Playback: stream URL NIE jest inline w SSR HTML — player JS init dopiero po user +"Play" click. Dajemy playback_source z page_url + origin `tube:porndoecom`; +extractor w `_REGISTRY` mapuje na `_vps_blocked_fallback.extract` → mobile WebView +INJECTED_JS scrapuje `