diff --git a/.gitignore b/.gitignore index 3726b50..21b4bf0 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,13 @@ mcp-logs.txt # ADB / development debug artefakty (screenshots, ui dumps) .tmp_adb/ +# Marketing screenshots — kept local, hosted externally for posts/landing. +# NOT committed: explicit thumbnails risk GitHub TOS takedown. +screenshots/ + +# Launch / marketing material — local working notes, not part of the codebase. +launch/ + # Operational deploy scripts — moved to a private companion repo. Public repo # should NOT contain SSH commands, systemd units, or smoke-test playbooks # referencing concrete hosts. diff --git a/app/api/playback.py b/app/api/playback.py index 9bc8474..e391dac 100644 --- a/app/api/playback.py +++ b/app/api/playback.py @@ -184,21 +184,19 @@ def resolve_movie_playback( pb.id, ) stream = None + # Mixdrop mxcontent CDN wymaga curl_cffi JA3 → wymusza VPS proxy. + # Pre-public: skip mixdrop direct, fallback na embed_url (mobile WebView z + # phone IP). Bandwidth + anonimowość VPS > UX. Movie ma zwykle 10+ alt + # hosterów (voe/luluvid/doply/etc.), user może wybrać alternative. + if stream and "mxcontent.net" in stream.lower(): + log.info( + "movie playback %s: mixdrop mxcontent — skip (VPS-proxy required), WebView fallback", + pb.id, + ) + stream = None if stream: type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4" - # Hostery których CDN wymaga Chrome JA3 (mxcontent dla mixdrop): - # proxy MUSI użyć curl_cffi impersonate inaczej 403. `proxy_impersonate=True` - # idzie przez `raw` → `_proxify_link` ustawi token `i=1`. - cdn_needs_impersonate = "mxcontent.net" in stream.lower() raw_meta: dict = {"origin": pb.origin, "host": target} - if cdn_needs_impersonate: - raw_meta["proxy_impersonate"] = True - # Mixdrop: same-session cookies + chrome JA3 wymagane dla mp4. - # Backend extract zamknął sesję — proxy musi re-fetchować - # embed page w fresh curl_cffi session żeby re-extract mp4 - # z aktualnymi cookies. 
- raw_meta["refetch_url"] = target - raw_meta["refetch_hoster"] = "mixdrop" links.append( StreamLink( stream_url=stream, diff --git a/app/config.py b/app/config.py index fe1862d..f28eaf5 100644 --- a/app/config.py +++ b/app/config.py @@ -72,13 +72,22 @@ class Settings(BaseSettings): sched_movie_ingest_hours: int = Field( default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS" ) - # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes raz dziennie. + # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes. + # 6h cadence (zmiana z 24h 2026-05-20): user reportował brak Brazzers Exxtra po + # 15-05. Root cause był 2-fold: (1) freshporno publikuje sceny w ciągu dnia, 24h + # cadence łapie tylko te do 05:30 UTC; (2) meta_content/release_date bug osobno. + # 6h = 4 runs/dzień = każda freshporno scena zaingestowana w ciągu ~6h od publik. sched_browse_latest_hours: int = Field( - default=24, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS" + default=6, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS" ) sched_browse_latest_max_pages: int = Field( default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES" ) + # Bulk-dedup performers safety net — auto-merge duplikatów które resolver-time + # scoring pominął. 12h cadence: leci 2x dziennie (po porannym browse-latest run). + sched_bulk_dedup_hours: int = Field( + default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS" + ) # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens # w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log). diff --git a/app/connectors/direct_scrapers/__init__.py b/app/connectors/direct_scrapers/__init__.py index b65f05a..16bb444 100644 --- a/app/connectors/direct_scrapers/__init__.py +++ b/app/connectors/direct_scrapers/__init__.py @@ -137,6 +137,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [ # (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. 
**Aktywny.** from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402 from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402 +from app.connectors.direct_scrapers.porndoe import PornDoeScraper # noqa: E402 from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402 from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401 @@ -152,6 +153,13 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ # 720p). Tytuł zachowuje studio prefix ("Studio Title - Scene Name") → title # fuzzy match (rapidfuzz token_set_ratio) może załapać canonical. Monitorować. Porn00Scraper, + # PornDoeScraper — dołączony 2026-05-21 (theporndude audit). Każda scena ma + # kompletny JSON-LD VideoObject: title + uploadDate + duration + named studio + # (producer/publisher) + named performers (actor[]) + thumbnail. Najbogatsze + # strukturalne metadane spośród browse scraperów — composite fuzzy match ma + # komplet sygnałów. Phash hit-rate niski (własne crop-thumbnaile), studio + + # performer + date + duration nadrabiają. + PornDoeScraper, # ShyfapScraper — wyłączony 2026-05-12 (pilot fail, 0% match — orphan factory). # Follow-up: dorobić te tubey i sprawdzić phash distance: # - fullmovies.xxx (channel/network/pornstars/categories, brak duration) diff --git a/app/connectors/direct_scrapers/freshporno.py b/app/connectors/direct_scrapers/freshporno.py index dda0dc5..0390985 100644 --- a/app/connectors/direct_scrapers/freshporno.py +++ b/app/connectors/direct_scrapers/freshporno.py @@ -163,11 +163,25 @@ class FreshpornoScraper(BaseBrowseScraper): ) ] + # Release date — freshporno emituje `<meta itemprop="uploadDate" content="YYYY-MM-DD...">`. + # To data wrzucenia na freshporno, NIE oryginalna release_date studio — ale dla + # świeżych scen (uploaded niedługo po publikacji) różnica ≤ 3-7 dni, mieści się w + # `date_window_days=7` w resolverze. 
Bez tego pola scene NULL → match score 0 → + # duplicate scene zamiast freshporno PS dodane do TPDB canonical (bug-report + # 2026-05-20: brak Brazzers Exxtra po 15-05). + release_date_parsed: date | None = None + if (m := re.search(r'itemprop="uploadDate"[^>]+content="(\d{4}-\d{2}-\d{2})', detail_html)): + try: + release_date_parsed = date.fromisoformat(m.group(1)) + except ValueError: + pass + return RawScene( external_id=f"{self.sitetag}:{scene_url}", title=title, description=description, duration_sec=duration_sec, + release_date=release_date_parsed, url=scene_url, studio=studio, performers=performers, diff --git a/app/connectors/direct_scrapers/porndoe.py b/app/connectors/direct_scrapers/porndoe.py new file mode 100644 index 0000000..84315fc --- /dev/null +++ b/app/connectors/direct_scrapers/porndoe.py @@ -0,0 +1,271 @@ +"""porndoe.com — latest-vids browse scraper. + +Dołączony 2026-05-21 (theporndude audit). Jedyny verified high-value candidate +z 172 tube'ów na theporndude.com/top-porn-tube-sites + /full-porn-movies-sites. + +Czemu wart: każda scena ma kompletny **JSON-LD VideoObject** schema: + - name (title), description, uploadDate (ISO timestamp), duration (ISO 8601) + - producer + publisher → named studio z `/channel-profile/<slug>` URL + - actor[] → named performers z `/pornstars-profile/<slug>` URL + - thumbnailUrl (CDN p.cdnc.porndoe.com) + +To wystarczy do composite fuzzy match w resolverze (studio + performer Jaccard + +date proximity + title token-set + duration). Phash hit-rate niski (porndoe robi +własne crop-thumbnaile 390x219, nie hot-linkuje studio art) — ale rich metadata +nadrabia, jak pornxp/porn00. + +URL patterns: + - Listing: `/videos/most-recent?page=N` (page 1 = newest, ~31 scen/page) + - Scene: `/watch/<id>` gdzie id = `pd` + 10 alfanum (stable) + - Studio: `/channel-profile/<slug>` + - Performer: `/pornstars-profile/<slug>` + - Tags/categories: `/categories/<slug>` + +Playback: stream URL NIE jest inline w SSR HTML — player JS init dopiero po user +"Play" click. 
Dajemy playback_source z page_url + origin `tube:porndoecom`; +extractor w `_REGISTRY` mapuje na `_vps_blocked_fallback.extract` → mobile WebView +INJECTED_JS scrapuje `