Initial commit

Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
This commit is contained in:
goon-foss 2026-05-20 10:10:22 +02:00
commit ad0284585b
329 changed files with 51795 additions and 0 deletions

29
.env.example Normal file
View file

@ -0,0 +1,29 @@
# Postgres credentials, database name and port.
# NOTE(review): presumably consumed by docker compose for the `db` service — confirm.
POSTGRES_USER=goon
POSTGRES_PASSWORD=goon
POSTGRES_DB=goon
POSTGRES_PORT=5432
# NOTE(review): presumably the host port the API is published on — confirm against the compose file.
API_PORT=8000
# SQLAlchemy connection URL (psycopg driver). Keep user/password/port/db in sync
# with the POSTGRES_* values above if you change them.
DATABASE_URL=postgresql+psycopg://goon:goon@localhost:5432/goon
# TPDB (theporndb.net) — required for canonical scene metadata + performer canonicalization.
# Get token from your TPDB account settings.
TPDB_API_TOKEN=
TPDB_BASE_URL=https://api.theporndb.net
# StashDB — second canonical source. Required for full performer/scene cross-source dedup.
STASHDB_API_KEY=
STASHDB_GRAPHQL_URL=https://stashdb.org/graphql
# Log verbosity for the backend (e.g. INFO, DEBUG).
LOG_LEVEL=INFO
# Comma-separated list of API keys. Empty = auth disabled (only safe for localhost).
# Generate with: python -c "import secrets; print(secrets.token_urlsafe(32))"
API_KEYS=
# Sentry observability — empty = init no-op (no telemetry sent).
# Set your own DSN if you self-host Sentry or use cloud free tier.
SENTRY_DSN=
# Environment label and trace sampling rate; only relevant when SENTRY_DSN is set.
SENTRY_ENVIRONMENT=dev
SENTRY_TRACES_SAMPLE_RATE=0.1

32
.github/workflows/backend-tests.yml vendored Normal file
View file

@ -0,0 +1,32 @@
# Lints and tests the Python backend on pushes and pull requests targeting main.
name: Backend tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# Least privilege: this workflow only needs to read the repository. Without an
# explicit block the token inherits the repo/org default, which may be
# read-write.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is never parsed as a float.
          python-version: '3.12'
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[dev]

      - name: Lint (ruff)
        run: ruff check app/ tests/

      - name: Run pytest
        run: pytest --tb=short

85
.github/workflows/build-apk.yml vendored Normal file
View file

@ -0,0 +1,85 @@
# Builds a standalone debug APK (JS bundle embedded) on every `v*` tag or on
# manual dispatch, uploads it as a workflow artifact and, for tag builds,
# attaches it to the GitHub Release.
name: Build Android APK

on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    # The release step creates/updates a Release and uploads assets, which
    # requires a write-scoped token. Repositories whose default GITHUB_TOKEN
    # permission is read-only would otherwise fail at that step.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'
          cache-dependency-path: mobile/package-lock.json

      - name: Setup Java
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '17'

      - name: Setup Gradle cache
        uses: gradle/actions/setup-gradle@v4

      - name: Install npm dependencies
        working-directory: mobile
        run: npm ci

      - name: Pre-bundle JS for debug embedding
        # Default RN debug builds don't embed JS bundle (expects Metro server).
        # We explicitly run Expo's `export:embed` so the resulting APK works
        # standalone on a phone without Metro running. This is also where
        # `EXPO_PUBLIC_*` env vars get inlined into the bundle.
        working-directory: mobile
        env:
          EXPO_PUBLIC_SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          EXPO_PUBLIC_SENTRY_ENVIRONMENT: ${{ vars.SENTRY_ENVIRONMENT || 'production' }}
        run: |
          mkdir -p android/app/src/main/assets android/app/src/main/res
          node node_modules/@expo/cli/build/bin/cli export:embed \
            --platform android \
            --dev false \
            --bundle-output android/app/src/main/assets/index.android.bundle \
            --assets-dest android/app/src/main/res

      - name: Build debug APK
        working-directory: mobile/android
        run: ./gradlew assembleDebug --no-daemon
        env:
          NODE_OPTIONS: --max_old_space_size=4096

      - name: Rename APK with version
        id: rename
        working-directory: mobile/android/app/build/outputs/apk/debug
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so a crafted ref name cannot inject shell code.
          REF_NAME: ${{ github.ref_name }}
        run: |
          # Sanitize ref → safe filename component
          VERSION="${REF_NAME//[^a-zA-Z0-9._-]/_}"
          mv app-debug.apk "goon-${VERSION}-debug.apk"
          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
          echo "apk=mobile/android/app/build/outputs/apk/debug/goon-${VERSION}-debug.apk" >> "$GITHUB_OUTPUT"

      - name: Upload APK artifact
        uses: actions/upload-artifact@v4
        with:
          # Uses the sanitized version: artifact names may not contain `/`,
          # which a branch ref (manual dispatch from e.g. `feature/x`) would.
          name: goon-apk-${{ steps.rename.outputs.version }}
          path: ${{ steps.rename.outputs.apk }}
          retention-days: 30

      - name: Attach APK to GitHub Release
        if: startsWith(github.ref, 'refs/tags/')
        uses: softprops/action-gh-release@v2
        with:
          files: ${{ steps.rename.outputs.apk }}
          fail_on_unmatched_files: true
          generate_release_notes: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

77
.gitignore vendored Normal file
View file

@ -0,0 +1,77 @@
.env
.env.local
mobile/.env
mobile/.env.local
*.pyc
__pycache__/
.pytest_cache/
.ruff_cache/
.mypy_cache/
.coverage
htmlcov/
*.egg-info/
build/
dist/
.venv/
venv/
.idea/
.vscode/
*.sqlite
*.db
# Personal operational notes (deploy state, in-progress notes per session)
DEPLOY_BACKLOG.md
# Mobile (Expo / React Native)
mobile/node_modules/
mobile/.expo/
mobile/dist/
mobile/web-build/
mobile/android/.gradle/
mobile/android/app/build/
mobile/android/build/
mobile/ios/build/
mobile/ios/Pods/
mobile/*.jks
mobile/*.keystore
# Mobile build artifacts (regenerated by expo `export:embed` during
# `gradlew assembleDebug`). Do NOT commit — it breaks rebuilds (gradle merges the
# stale bundle instead of generating a fresh one, see session 2026-05-07).
mobile/android/app/src/main/assets/index.android.bundle
mobile/android/app/src/main/res/drawable-*/
mobile/android/app/src/main/res/raw/
# yt-dlp / scrapers cache
.yt-dlp-cache/
# Reverse-engineered third-party APKs (decompiled AIO Streamer — kept locally for
# debugging the legacy porn-app auth flow, but MUST NOT enter public git history;
# distributing decompiled proprietary code violates copyright/EULA).
re/
# DB dumps (operational backups, may contain user data)
*.dump
*.sql.gz
# Built APKs (release/debug binaries — distributed via GitHub Releases instead)
*.apk
# Claude Code session data (transcripts/agents — local only)
.claude/
# Operational input / debug logs, per session
*.log
# Per-user runtime artifacts — NOT for the public repo
.iclaude
wa-logs.txt
mcp-logs.txt
# ADB / development debug artifacts (screenshots, ui dumps)
.tmp_adb/
# Operational deploy scripts — moved to a private companion repo. Public repo
# should NOT contain SSH commands, systemd units, or smoke-test playbooks
# referencing concrete hosts.
deploy/

120
CONTRIBUTING.md Normal file
View file

@ -0,0 +1,120 @@
# Contributing to Goon
## Development setup
Goon backend is Python 3.12+, FastAPI + SQLAlchemy + APScheduler + Postgres.
Mobile client is React Native + Expo.
### Backend
```bash
# Create virtualenv
python -m venv .venv
. .venv/bin/activate # or .venv\Scripts\activate on Windows
# Install with dev extras
pip install -e .[dev]
# Bring up postgres (or use docker-compose; see README)
# Adjust DATABASE_URL in .env if needed
cp .env.example .env
# Run migrations
alembic upgrade head
# Run API
uvicorn app.main:app --reload --port 8000
# Run worker (separate terminal)
python -m app.scheduler.worker # full scheduler
python -m app.scheduler.worker --once --source=tpdb --limit=50 # one-shot ingest
```
### Mobile
```bash
cd mobile
npm install
npm start # opens Expo dev server
```
## Tests
```bash
pytest # full suite (~70 tests, <5s)
pytest tests/test_resolve_*.py -v
ruff check app/ tests/ # same paths the CI lint step checks
mypy app/ # optional; not enforced by CI
```
PRs must pass `pytest` + `ruff check`. Run them locally before pushing.
## Code style
- **Formatting**: ruff (config in `pyproject.toml`). Line length 100.
- **Type hints**: required on public functions. `from __future__ import annotations`
in every module.
- **Docstrings**: write the **why**, not the **what**. Reference real bugs/incidents
when explaining non-obvious code paths.
- **Comments**: only when the code can't speak for itself. Prefer renaming a
variable over adding a comment that explains it.
- **No dead code, no commented-out code, no TODO without an issue link.**
- **Polish or English in comments**: existing code is mostly Polish in
comments and English in code (function/class/var names). New code can be
either, but be consistent within a file.
## Adding a new tube extractor / scraper
If you want Goon to support an additional adult tube site:
1. **Stream extractor** (`app/extractors/tubes/`): given a scene page URL,
return a list of `StreamSource` (m3u8/mp4 URLs with quality labels).
- Mainstream tubes: try `_ytdlp.extract` (yt-dlp covers ~30 tubes out of
the box — just register the sitetag in `app/extractors/__init__.py`).
- WordPress-like tubes with embed iframe: register `_embed_iframe.extract`.
- Custom player / signed URLs / token rotation: write your own per-tube
module (see `hqporner.py`, `eporner.py`, `sxyprn.py` as references).
2. **Discovery scraper** (`app/connectors/direct_scrapers/`): subclass
`BaseSearchScraper`, set `sitetag`, `_search_url_template`, `_scene_url_re`.
Most aggregator tubes can fit in 10-20 lines (see `xmoviesforyou.py`).
3. **Register** the scraper class in `ALL_DIRECT_SCRAPERS` in
`app/connectors/direct_scrapers/__init__.py`.
4. **Test** with one performer name that you know has scenes on that tube:
```bash
python -m app.scheduler.worker --once --strategy=performer-driven \
--performers="Some Performer" --sitetags=<your-sitetag>
```
## Database migrations
Use Alembic:
```bash
alembic revision -m "describe change" # new migration
alembic upgrade head # apply
alembic downgrade -1 # roll back one
```
Every migration must have a working `downgrade()`. We don't ship squashed
migrations — full history is the source of truth.
## What we won't merge
- **Adult-content moderation features** (auto-tagging by detected acts,
content filtering by performer attributes, etc.) — out of scope.
- **Hardcoded credentials, API keys, or device IDs** in source — must be
env-driven.
- **Bypassing tube paywalls / DRM / auth** — Goon only scrapes publicly
accessible search pages.
- **Telemetry or analytics that report user activity to third parties**.
Sentry is opt-in (`SENTRY_DSN` empty by default).
- **Public deployment recipes** (e.g. nginx config for an open instance).
Goon is self-hosted only — see [DISCLAIMER.md](./DISCLAIMER.md).
## License
By contributing, you agree your contributions are licensed under the MIT
License (see [LICENSE](./LICENSE)).

62
DISCLAIMER.md Normal file
View file

@ -0,0 +1,62 @@
# Disclaimer
## Adult Content (18+)
Goon is a self-hosted aggregator for adult-content scene metadata. The software
itself contains no media — it indexes metadata from third-party sources
(ThePornDB, StashDB, public adult tube sites) and links to those sources for
playback.
By using, hosting, or distributing this software you affirm that:
- You are at least 18 years of age (or the age of legal majority in your
jurisdiction, whichever is greater).
- Adult content is legal to view, store metadata about, and access in your
jurisdiction.
- You are solely responsible for compliance with all applicable laws,
including (but not limited to) record-keeping requirements (e.g. 18 U.S.C.
§ 2257 in the United States) and content classification rules.
## Self-Hosting Only
This software is intended for **self-hosting on infrastructure you control**.
Operating a public-facing instance accessible to unauthenticated users is
**not the intended use case** and may expose you to legal liability for
content delivery, age verification, and data protection.
If you operate a publicly accessible instance you are entirely responsible for
implementing the age verification, geo-restrictions, content moderation, ToS,
and privacy controls that your jurisdiction requires.
## Third-Party Sources
Goon scrapes publicly accessible search/listing pages from adult tube sites
to build its index. By configuring those scrapers and pointing them at a
target tube you accept that:
- Tube sites' Terms of Service may prohibit automated access. Respect their
rate limits and `robots.txt`. Goon does not bypass paywalls, authentication,
or DRM.
- Tube sites may at any time change their HTML, block your IP, or disable
features Goon depends on. Discovery and stream resolution are best-effort.
- The metadata Goon stores (titles, performer names, duration, thumbnails)
is sourced from those tubes and may contain inaccuracies, NSFW filenames,
or content the tube has since removed. Reporting takedown requests is your
responsibility — Goon ships no takedown workflow.
## No Warranty
This software is provided "AS IS" without warranty of any kind. See
[LICENSE](./LICENSE) for full terms. The authors and contributors are not
liable for any damages, losses, or legal consequences arising from use of
this software.
## Reporting Issues
For security issues affecting the software itself (auth bypass, RCE, secret
leak): open a private security advisory on the GitHub repository.
For takedown requests, content concerns, or jurisdiction-specific compliance
questions: contact the operator of the specific instance — Goon contributors
are not in a position to take action on third-party content surfaced by
self-hosted deployments.

22
Dockerfile Normal file
View file

@ -0,0 +1,22 @@
# Backend image: Python 3.12 with the project's dependencies, the `app` package
# and the Alembic migration files under /srv. No CMD/ENTRYPOINT is defined, so
# the start command has to be supplied by whatever runs the container
# (NOTE(review): presumably docker compose, per service — confirm).
FROM python:3.12-slim
# No .pyc files, unbuffered stdout/stderr, no pip download cache in the layer;
# PYTHONPATH=/srv makes the `app` package copied below importable.
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PYTHONPATH=/srv
WORKDIR /srv
# Compiler toolchain for dependencies that build native extensions; the apt
# lists are removed in the same layer to keep the image small.
RUN apt-get update \
&& apt-get install -y --no-install-recommends build-essential \
&& rm -rf /var/lib/apt/lists/*
# Only pyproject.toml is copied before installing, so the dependency layer is
# rebuilt only when pyproject.toml changes, not on every source edit.
COPY pyproject.toml ./
# NOTE(review): this installs the `dev` extras into the image — confirm that is
# intended for non-development deployments.
RUN pip install --upgrade pip \
&& pip install -e .[dev]
COPY app ./app
COPY alembic ./alembic
COPY alembic.ini ./
# Documents the API port; EXPOSE alone does not publish it.
EXPOSE 8000

21
LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 Goon contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

261
README.md Normal file
View file

@ -0,0 +1,261 @@
# Goon
Self-hosted aggregator for adult-content scene metadata. Indexes scenes from
ThePornDB, StashDB, and 30+ public adult tube sites; deduplicates across
sources; serves an API + mobile (React Native) client for browsing and
linking out to playback.
> **18+ ONLY · Self-hosted only · See [DISCLAIMER.md](./DISCLAIMER.md) before
> hosting an instance.**
---
## What it does
- **Multi-source ingest**: pulls canonical scene/performer/studio metadata from
TPDB and StashDB on a delta cron, merges duplicates by performer + title +
date heuristics (perceptual hash + Levenshtein title distance).
- **Tube discovery**: per-performer search across 30+ public adult tube sites
(mainstream + aggregators). Each tube is scraped directly via HTTP — no
proprietary API dependencies.
- **Stream resolution on demand**: when a user clicks Watch, the API extracts
a fresh m3u8/mp4 URL from the tube's page (or falls back to embed link for
WebView playback). Mainstream tubes use yt-dlp; aggregator tubes use a
generic P.A.C.K.E.R. unpacker for JWPlayer-based hosters
(StreamWish/doodporn/mixdrop/...).
- **Mobile client** (Expo / React Native): scene grid, performer pages, watch
history, favorites, hold-to-preview animated thumbnails.
- **Performer-driven backfill**: a continuous worker walks performers ordered
by `last_searched_at NULLS FIRST` and back-fills tube scenes for the
longest-stale performer first.
## What it doesn't do
- Host or store any media. Scene metadata + thumbnail URLs only.
- Bypass paywalls, authentication, geo-blocks, or DRM.
- Provide age verification, ToS gating, or moderation for public deployments.
See [DISCLAIMER.md](./DISCLAIMER.md).
- Phone home. Sentry telemetry is opt-in (env var, empty by default).
---
## Quick start
### 1. Run the backend (Docker)
```bash
git clone <repo-url> goon
cd goon
cp .env.example .env
# Edit .env:
# - TPDB_API_TOKEN (theporndb.net account → API tokens)
# - STASHDB_API_KEY (stashdb.org account → API keys)
# - API_KEYS (generate one: python -c "import secrets; print(secrets.token_urlsafe(32))")
docker compose up -d
```
Three services come up: `db` (Postgres 16), `api` (FastAPI on `:8000`,
auto-applies migrations on startup), `worker` (APScheduler running TPDB/StashDB
delta + performer-driven backfill).
Verify: `curl localhost:8000/health` → `{"status":"ok"}`.
### 2. Install the mobile app (Android)
Download the latest debug APK from
[GitHub Releases](../../releases/latest) → `goon-vX.Y.Z-debug.apk`, install on
your Android device (allow "Install from unknown sources" for the browser /
file manager you used to download).
On first launch the app shows the age-gate disclaimer (must be accepted), then
a login screen. Enter:
- **Backend URL**: `http://<your-backend-host>:8000` (e.g. your LAN IP, or
`http://localhost:8000` if running on the device — uncommon)
- **API key**: one of the values you put in `API_KEYS` in `.env`
That's it.
### Local Python (no Docker)
```bash
python -m venv .venv && . .venv/bin/activate # or .\.venv\Scripts\activate on Windows
pip install -e .[dev]
cp .env.example .env # edit creds
alembic upgrade head
uvicorn app.main:app --port 8000
```
### Worker (manual one-shot ingest)
```bash
# Foreground APScheduler with all jobs
python -m app.scheduler.worker
# One-shot:
python -m app.scheduler.worker --once --source=tpdb --limit=200
python -m app.scheduler.worker --once --strategy=performer-driven --top-n=20
python -m app.scheduler.worker --once --strategy=performer-driven \
--performers="Lola Noir,Mia Malkova"
```
### Building the APK locally
```bash
cd mobile
npm install
cd android
./gradlew assembleDebug
# output: mobile/android/app/build/outputs/apk/debug/app-debug.apk
```
Or just push a `v*` tag — GitHub Actions builds and attaches the APK to the
Release ([.github/workflows/build-apk.yml](./.github/workflows/build-apk.yml)).
### Sentry telemetry (optional)
Default behavior: **no telemetry**. Sentry only initializes when a DSN is
present at runtime/build time.
To enable Sentry for **your** instance (errors only, no PII, no replay):
- **Backend**: set `SENTRY_DSN=https://...` in `.env` (gitignored).
Optionally `SENTRY_ENVIRONMENT=production` and `SENTRY_TRACES_SAMPLE_RATE=0.1`.
- **Mobile (local builds)**: create `mobile/.env` (gitignored) with
`EXPO_PUBLIC_SENTRY_DSN=https://...`. Expo SDK 49+ auto-inlines `EXPO_PUBLIC_*`
vars into the JS bundle at build time.
- **Mobile (CI builds)**: add a GitHub repository secret named `SENTRY_DSN`.
The APK workflow passes it as `EXPO_PUBLIC_SENTRY_DSN` to the Expo
`export:embed` step, which inlines it into the JS bundle. Without
the secret, the APK ships with telemetry disabled (forks of this repo don't
inherit your DSN).
Sentry init is gated by `if (SENTRY_DSN) { Sentry.init(...) }` — empty DSN
means the SDK is loaded as dead code but never sends a single request.
---
## Configuration
All runtime config is environment variables (see [.env.example](./.env.example)
for the full list). Highlights:
| Var | Default | Required? | Notes |
|---|---|---|---|
| `DATABASE_URL` | `postgresql+psycopg://goon:goon@localhost:5432/goon` | Yes | Postgres 14+ |
| `TPDB_API_TOKEN` | _empty_ | For TPDB ingest | Get from theporndb.net account |
| `STASHDB_API_KEY` | _empty_ | For StashDB ingest | Get from stashdb.org account |
| `API_KEYS` | _empty_ | Recommended | CSV of allowed API keys; empty = no auth (localhost-only) |
| `SENTRY_DSN` | _empty_ | No | Empty = no telemetry. Use your own DSN if you want crash reports. |
| `LOG_LEVEL` | `INFO` | No | DEBUG for verbose tube scraping logs |
Scheduler tuning (set to `0` to disable a job):
| Var | Default | Description |
|---|---|---|
| `GOON_SCHED_TPDB_HOURS` | `6` | TPDB delta interval |
| `GOON_SCHED_STASHDB_HOURS` | `6` | StashDB delta interval |
| `GOON_SCHED_PERFORMER_DRIVEN_HOURS` | `12` | Top-N performer ingest |
| `GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS` | `15` | Continuous backfill tick |
---
## Architecture (high level)
```
┌──────────┐ delta cron ┌────────────┐
│ TPDB │────────────────▶│ │
└──────────┘ │ │
┌──────────┐ │ ingest │
│ StashDB │────────────────▶│ pipeline │──┐
└──────────┘ │ │ │ cross-source
┌──────────┐ performer- │ │ ▼ dedup +
│ ~25 tube │ driven ┌───▶│ │ ┌─────────┐
│ sites │ search │ └────────────┘ │ Postgres│
└──────────┘────────────┘ └─────────┘
┌─────────────────────────────────────────────┤
▼ ▼
┌──────────────┐ ┌──────────────┐
│ FastAPI │ │ Worker │
│ /scenes │◀────── on Watch click ───│ scheduler │
│ /performers │ resolve stream URL │ (APScheduler)│
│ /playback │ (yt-dlp / hoster └──────────────┘
└──────────────┘ packer)
┌──────────────┐
│ Expo mobile │
│ (Android) │
└──────────────┘
```
Key modules:
- [`app/connectors/`](./app/connectors/) — TPDB, StashDB, dooplay (movies),
paradisehill (movies), [`direct_scrapers/`](./app/connectors/direct_scrapers/)
(25 tube discovery scrapers).
- [`app/extractors/`](./app/extractors/) — stream URL resolution per tube.
yt-dlp wrapper + custom + generic embed-iframe + P.A.C.K.E.R. unpacker.
- [`app/resolve/`](./app/resolve/) — cross-source scene merging (phash, title
similarity, performer overlap, release date window).
- [`app/scheduler/`](./app/scheduler/) — APScheduler jobs +
[`performer_driven.py`](./app/scheduler/performer_driven.py) (the core
ingest strategy: completeness > recency).
- [`mobile/`](./mobile/) — Expo / React Native client.
## Tube coverage
Discovery + stream resolution registered for ~33 sources:
**Mainstream tubes:** pornhub, redtube, xhamster, xvideos, xnxx, youporn,
eporner, hqporner, sxyprn, porntrex, pornhat.
**Aggregators / mirrors:** xmoviesforyou, watchporn, siska, porn4days,
porndish, xxxfreewatch, latestleaks, latestpornvideo, mypornerleak,
porndittcom, hdporn92, sxyland, 0dayxx, perverzija, fpoxxx, porn00, pornxp,
hdporngg, fullmovies, freshporno, shyfap.
**Movie sites:** paradisehill (primary) + dooplay mirrors (mangoporn,
streamporn, pandamovies).
If you want to add another tube, see [CONTRIBUTING.md](./CONTRIBUTING.md).
---
## Support the project
Goon is free, open-source, and ad-free. It stays that way because donations
cover the VPS, the TPDB/StashDB tokens, and the time. **Crypto only** —
mainstream processors refuse adult projects, even FOSS tooling.
In-app: **Scenes → ♥** opens a screen with QR codes for Monero, Bitcoin, and
USDT (TRC-20).
Addresses are hardcoded in
[`mobile/src/lib/donate.ts`](./mobile/src/lib/donate.ts) so a compromised
server cannot swap them mid-donation. Verify the value on-screen against the
copy in this repo before sending.
---
## Roadmap
Near-term:
- Browse-by-performer + sort-by-studio
- Multi-tag filter (AND / OR)
- Continue-watching rail (position sync across devices)
- Stash local-server bridge — sync favorites/watchlist with a self-hosted Stash
- iOS sideload via TestFlight invite
Mid-term:
- Web companion (read-only browser frontend over the same API)
- BTCPay Server invoicing for one-time / recurring donations
- Performer-alert notifications (server push when a favorited performer drops a new scene)
---
## License
MIT — see [LICENSE](./LICENSE).

45
alembic.ini Normal file
View file

@ -0,0 +1,45 @@
# Core Alembic settings.
[alembic]
# Directory that holds env.py, script.py.mako and the revision scripts.
script_location = alembic
# Put the directory containing this file on sys.path so env.py can import `app`.
prepend_sys_path = .
version_path_separator = os
# Revision filenames: date, time, revision id, then slug. Percent signs are
# doubled because this file is read with ConfigParser interpolation.
file_template = %%(year)d%%(month).2d%%(day).2d_%%(hour).2d%%(minute).2d_%%(rev)s_%%(slug)s
# Formats every newly generated revision script with ruff.
# ruff is distributed as a standalone binary and does not register a Python
# `console_scripts` entry point, so the `console_scripts` runner cannot locate
# it. The `exec` runner (Alembic 1.12+) invokes the executable directly; a bare
# name is resolved through PATH.
[post_write_hooks]
hooks = ruff
ruff.type = exec
ruff.executable = ruff
ruff.options = format REVISION_SCRIPT_FILENAME
# Logging configuration; env.py loads it through logging.config.fileConfig.
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
# Root logger: WARN and above, written to the console handler.
[logger_root]
level = WARN
handlers = console
qualname =
# SQLAlchemy engine logger: WARN and above, no handler of its own.
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
# Alembic logger: INFO and above, no handler of its own.
[logger_alembic]
level = INFO
handlers =
qualname = alembic
# Single handler: everything goes to stderr.
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

52
alembic/env.py Normal file
View file

@ -0,0 +1,52 @@
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
from app.config import get_settings
from app.models import Base
# Alembic Config object for the running command; gives access to alembic.ini.
config = context.config

# Configure Python logging from alembic.ini when a config file is in use.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# The database URL comes from application settings rather than alembic.ini.
settings = get_settings()
# set_main_option() stores the value through ConfigParser, which treats "%" as
# the start of an interpolation. A URL-encoded password (e.g. "p%40ss") would
# raise "invalid interpolation syntax", so literal percent signs are escaped;
# get_main_option()/get_section() return the unescaped URL.
config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))

# Metadata that autogenerate compares against the live database.
target_metadata = Base.metadata
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The migration context is configured from the database URL alone, so no
    Engine or connection is created here. Bound parameters are rendered as
    literals (`literal_binds=True`) and column type changes are compared
    (`compare_type=True`).
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in online mode against a live database connection.

    An Engine is built from the `sqlalchemy.`-prefixed options of the active
    ini section, using `NullPool`. The migrations then run inside a
    transaction on a single connection, with column type comparison enabled.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(ini_section, prefix="sqlalchemy.", poolclass=pool.NullPool)

    with engine.connect() as conn:
        context.configure(connection=conn, target_metadata=target_metadata, compare_type=True)
        with context.begin_transaction():
            context.run_migrations()
# Module-level entry point: pick the runner that matches the mode Alembic
# reports for the current invocation.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()

View file

@ -0,0 +1,3 @@
-- Enables three Postgres extensions. IF NOT EXISTS makes each statement a
-- no-op when the extension is already installed, so the script can be re-run.
-- NOTE(review): presumably executed at database initialisation — confirm where
-- this file is mounted/run.
-- pg_trgm: trigram-based text similarity and index support.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- unaccent: text function that strips accents/diacritics.
CREATE EXTENSION IF NOT EXISTS unaccent;
-- pgcrypto: cryptographic functions.
CREATE EXTENSION IF NOT EXISTS pgcrypto;

25
alembic/script.py.mako Normal file
View file

@ -0,0 +1,25 @@
## Template Alembic renders for every new revision script. Lines starting with
## a double hash are Mako comments and do not appear in the generated file.
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""

from collections.abc import Sequence

import sqlalchemy as sa
from alembic import op
${imports if imports else ""}

revision: str = ${repr(up_revision)}
## A merge revision has several parents, in which case down_revision is
## rendered as a tuple of revision ids; the annotation has to allow that.
down_revision: str | Sequence[str] | None = ${repr(down_revision)}
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
depends_on: str | Sequence[str] | None = ${repr(depends_on)}


def upgrade() -> None:
    ${upgrades if upgrades else "pass"}


def downgrade() -> None:
    ${downgrades if downgrades else "pass"}

View file

@ -0,0 +1,313 @@
"""initial schema
Revision ID: 0001_initial
Revises:
Create Date: 2026-05-02
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0001_initial"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Postgres enum types shared by the tables in this migration.
# `create_type=False` because each enum is created once, explicitly, in
# upgrade(); using the same instance in several columns with
# `create_type=True` would try to create the type multiple times.
SOURCE_KIND = postgresql.ENUM(
"tpdb", "stashdb", "scraper", "porn_app", "manual",
name="source_kind", create_type=False,
)
ENTITY_KIND = postgresql.ENUM(
"scene", "performer", "studio", "tag",
name="entity_kind", create_type=False,
)
PERFORMER_GENDER = postgresql.ENUM(
"female", "male", "transgender_female", "transgender_male",
"non_binary", "intersex", "unknown",
name="performer_gender", create_type=False,
)
FINGERPRINT_KIND = postgresql.ENUM(
"phash", "oshash", "md5", name="fingerprint_kind", create_type=False,
)
MERGE_KIND = postgresql.ENUM(
"scene", "performer", "studio", name="merge_kind", create_type=False,
)
MERGE_STATUS = postgresql.ENUM(
"pending", "auto_merged", "merged", "rejected",
name="merge_status", create_type=False,
)
INGEST_STATUS = postgresql.ENUM(
"running", "success", "partial", "failed",
name="ingest_status", create_type=False,
)
def upgrade() -> None:
    """Create the initial schema.

    Steps, in order:
      1. Ensure the `pg_trgm`, `unaccent` and `pgcrypto` extensions exist.
      2. Create the module-level enum types explicitly (they are declared with
         `create_type=False`, so table DDL will not create them).
      3. Create the tables. Referenced tables are created before the tables
         that hold foreign keys to them, so statement order matters.

    Trigram GIN indexes are issued as raw SQL via `op.execute`.
    """
    # --- Extensions -------------------------------------------------------
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent;")
    op.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto;")
    # --- Enum types (checkfirst=True: skip a type that already exists) ----
    SOURCE_KIND.create(op.get_bind(), checkfirst=True)
    ENTITY_KIND.create(op.get_bind(), checkfirst=True)
    PERFORMER_GENDER.create(op.get_bind(), checkfirst=True)
    FINGERPRINT_KIND.create(op.get_bind(), checkfirst=True)
    MERGE_KIND.create(op.get_bind(), checkfirst=True)
    MERGE_STATUS.create(op.get_bind(), checkfirst=True)
    INGEST_STATUS.create(op.get_bind(), checkfirst=True)
    # --- sources: registry of upstream data sources ------------------------
    op.create_table(
        "sources",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("kind", SOURCE_KIND, nullable=False),
        sa.Column("name", sa.String(128), nullable=False, unique=True),
        sa.Column("base_url", sa.String(512)),
        sa.Column("auth_secret_ref", sa.String(128)),
        sa.Column("weight", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    # --- studios (self-referencing parent), aliases, external refs ---------
    op.create_table(
        "studios",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("name", sa.String(256), nullable=False),
        sa.Column("name_normalized", sa.String(256), nullable=False),
        sa.Column("slug", sa.String(256), nullable=False, unique=True),
        sa.Column("parent_studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="SET NULL")),
        sa.Column("network", sa.String(256)),
        sa.Column("homepage_url", sa.String(512)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_studios_name_normalized", "studios", ["name_normalized"])
    # Trigram GIN index on the normalized name, in addition to the plain index above.
    op.execute(
        "CREATE INDEX ix_studios_name_normalized_trgm ON studios "
        "USING GIN (name_normalized gin_trgm_ops);"
    )
    op.create_table(
        "studio_aliases",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="CASCADE"), nullable=False),
        sa.Column("alias", sa.String(256), nullable=False),
        sa.Column("alias_normalized", sa.String(256), nullable=False),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
        sa.UniqueConstraint("studio_id", "alias_normalized", name="uq_studio_aliases_studio_id_alias_normalized"),
    )
    op.create_index("ix_studio_aliases_studio_id", "studio_aliases", ["studio_id"])
    op.create_index("ix_studio_aliases_alias_normalized", "studio_aliases", ["alias_normalized"])
    op.execute(
        "CREATE INDEX ix_studio_aliases_alias_normalized_trgm ON studio_aliases "
        "USING GIN (alias_normalized gin_trgm_ops);"
    )
    # External refs use a composite primary key (source_id, external_id).
    op.create_table(
        "studio_external_refs",
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        sa.Column("studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="CASCADE"), nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("first_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_studio_external_refs_studio_id", "studio_external_refs", ["studio_id"])
    # --- performers, aliases, external refs (same layout as studios) -------
    op.create_table(
        "performers",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("canonical_name", sa.String(256), nullable=False),
        sa.Column("name_normalized", sa.String(256), nullable=False),
        sa.Column("slug", sa.String(256), nullable=False, unique=True),
        sa.Column("gender", PERFORMER_GENDER),
        sa.Column("birth_date", sa.Date),
        sa.Column("country", sa.String(64)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_performers_name_normalized", "performers", ["name_normalized"])
    op.execute(
        "CREATE INDEX ix_performers_name_normalized_trgm ON performers "
        "USING GIN (name_normalized gin_trgm_ops);"
    )
    op.create_table(
        "performer_aliases",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("performer_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("performers.id", ondelete="CASCADE"), nullable=False),
        sa.Column("alias", sa.String(256), nullable=False),
        sa.Column("alias_normalized", sa.String(256), nullable=False),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
        sa.UniqueConstraint("performer_id", "alias_normalized", name="uq_performer_aliases_performer_id_alias_normalized"),
    )
    op.create_index("ix_performer_aliases_performer_id", "performer_aliases", ["performer_id"])
    op.create_index("ix_performer_aliases_alias_normalized", "performer_aliases", ["alias_normalized"])
    op.execute(
        "CREATE INDEX ix_performer_aliases_alias_normalized_trgm ON performer_aliases "
        "USING GIN (alias_normalized gin_trgm_ops);"
    )
    op.create_table(
        "performer_external_refs",
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        sa.Column("performer_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("performers.id", ondelete="CASCADE"), nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("first_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_performer_external_refs_performer_id", "performer_external_refs", ["performer_id"])
    # --- tags (self-referencing parent) ------------------------------------
    op.create_table(
        "tags",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("name", sa.String(128), nullable=False),
        sa.Column("slug", sa.String(128), nullable=False, unique=True),
        sa.Column("parent_tag_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("tags.id", ondelete="SET NULL")),
        sa.Column("description", sa.String(1024)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    # --- scenes and their satellite tables ---------------------------------
    op.create_table(
        "scenes",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("title", sa.String(512), nullable=False),
        sa.Column("title_normalized", sa.String(512), nullable=False),
        sa.Column("slug", sa.String(512)),
        sa.Column("release_date", sa.Date),
        sa.Column("studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="SET NULL")),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("description", sa.Text),
        sa.Column("code", sa.String(128)),
        sa.Column("director", sa.String(256)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_scenes_title_normalized", "scenes", ["title_normalized"])
    op.execute(
        "CREATE INDEX ix_scenes_title_normalized_trgm ON scenes "
        "USING GIN (title_normalized gin_trgm_ops);"
    )
    op.create_index("ix_scenes_release_date", "scenes", ["release_date"])
    op.create_index("ix_scenes_slug", "scenes", ["slug"])
    op.create_index("ix_scenes_studio_id", "scenes", ["studio_id"])
    op.create_index("ix_scenes_code", "scenes", ["code"])
    op.create_index("ix_scenes_studio_release_date", "scenes", ["studio_id", "release_date"])
    op.create_table(
        "scene_external_refs",
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="CASCADE"), nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("url", sa.String(1024)),
        sa.Column("first_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_scene_external_refs_scene_id", "scene_external_refs", ["scene_id"])
    op.create_table(
        "scene_fingerprints",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="CASCADE"), nullable=False),
        sa.Column("kind", FINGERPRINT_KIND, nullable=False),
        sa.Column("value", sa.String(128), nullable=False),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
        sa.UniqueConstraint("scene_id", "kind", "value", name="uq_scene_fingerprints_scene_id_kind_value"),
    )
    op.create_index("ix_scene_fingerprints_scene_id", "scene_fingerprints", ["scene_id"])
    op.create_index("ix_scene_fingerprints_value", "scene_fingerprints", ["value"])
    # Association tables: composite primary keys over the two foreign keys.
    op.create_table(
        "scene_performers",
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("performer_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("performers.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("role", sa.String(64)),
        sa.Column("position", sa.Integer),
        sa.Column("as_alias", sa.String(256)),
    )
    op.create_table(
        "scene_tags",
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("tag_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("tags.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
    )
    # --- external_records: raw JSONB payload per (source, entity kind, external id)
    op.create_table(
        "external_records",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), nullable=False),
        sa.Column("entity_kind", ENTITY_KIND, nullable=False),
        sa.Column("external_id", sa.String(256), nullable=False),
        sa.Column("raw", postgresql.JSONB, nullable=False),
        sa.Column("raw_hash", sa.LargeBinary(32), nullable=False),
        sa.Column("fetched_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.UniqueConstraint("source_id", "entity_kind", "external_id", name="uq_external_records_source_id_entity_kind_external_id"),
    )
    op.create_index("ix_external_records_source_id", "external_records", ["source_id"])
    # --- merge_candidates: left_id/right_id are plain UUIDs with no foreign key
    op.create_table(
        "merge_candidates",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("kind", MERGE_KIND, nullable=False),
        sa.Column("left_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("right_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("score", sa.Float, nullable=False),
        sa.Column("reasons", postgresql.JSONB, nullable=False, server_default="{}"),
        sa.Column("status", MERGE_STATUS, nullable=False, server_default="pending"),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("resolved_at", sa.DateTime(timezone=True)),
        sa.Column("resolved_by", sa.String(128)),
    )
    op.create_index("ix_merge_candidates_left_id", "merge_candidates", ["left_id"])
    op.create_index("ix_merge_candidates_right_id", "merge_candidates", ["right_id"])
    op.create_index("ix_merge_candidates_status", "merge_candidates", ["status"])
    # --- ingest_runs: per-source run log with counters and a JSONB errors column
    op.create_table(
        "ingest_runs",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), nullable=False),
        sa.Column("started_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("finished_at", sa.DateTime(timezone=True)),
        sa.Column("status", INGEST_STATUS, nullable=False, server_default="running"),
        sa.Column("records_seen", sa.Integer, nullable=False, server_default="0"),
        sa.Column("records_new", sa.Integer, nullable=False, server_default="0"),
        sa.Column("records_updated", sa.Integer, nullable=False, server_default="0"),
        sa.Column("errors", postgresql.JSONB),
    )
    op.create_index("ix_ingest_runs_source_id", "ingest_runs", ["source_id"])
def downgrade() -> None:
    """Drop every table and enum type created by `upgrade()`.

    Tables go first, dependents before the tables they reference; the enum
    types follow once no column uses them. The extensions ensured by
    `upgrade()` are not dropped here.
    """
    tables_in_drop_order = (
        "ingest_runs",
        "merge_candidates",
        "external_records",
        "scene_tags",
        "scene_performers",
        "scene_fingerprints",
        "scene_external_refs",
        "scenes",
        "tags",
        "performer_external_refs",
        "performer_aliases",
        "performers",
        "studio_external_refs",
        "studio_aliases",
        "studios",
        "sources",
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)

    enums_in_drop_order = (
        INGEST_STATUS,
        MERGE_STATUS,
        MERGE_KIND,
        FINGERPRINT_KIND,
        PERFORMER_GENDER,
        ENTITY_KIND,
        SOURCE_KIND,
    )
    for enum_type in enums_in_drop_order:
        enum_type.drop(op.get_bind(), checkfirst=True)

View file

@ -0,0 +1,67 @@
"""playback_sources table for tube/aggregator video links
Revision ID: 0002_playback_sources
Revises: 0001_initial
Create Date: 2026-05-02
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0002_playback_sources"
down_revision: str | None = "0001_initial"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create `playback_sources` (one row per origin/page URL, linked to a scene) and its indexes."""

    def _timestamp(column_name: str) -> sa.Column:
        # Non-null, timezone-aware timestamp defaulting to NOW() on the server.
        return sa.Column(
            column_name,
            sa.DateTime(timezone=True),
            server_default=sa.text("NOW()"),
            nullable=False,
        )

    primary_key = sa.Column(
        "id",
        postgresql.UUID(as_uuid=True),
        primary_key=True,
        server_default=sa.text("gen_random_uuid()"),
    )
    scene_fk = sa.Column(
        "scene_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("scenes.id", ondelete="CASCADE"),
        nullable=False,
    )
    op.create_table(
        "playback_sources",
        primary_key,
        scene_fk,
        sa.Column("origin", sa.String(64), nullable=False),
        sa.Column("page_url", sa.String(2048), nullable=False),
        sa.Column("embed_url", sa.String(2048)),
        sa.Column("stream_url", sa.String(2048)),
        sa.Column("quality", sa.String(16)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("thumbnail_url", sa.String(2048)),
        _timestamp("last_seen_at"),
        _timestamp("created_at"),
        _timestamp("updated_at"),
        sa.UniqueConstraint("origin", "page_url", name="uq_playback_sources_origin_page_url"),
    )
    for index_name, column_name in (
        ("ix_playback_sources_scene_id", "scene_id"),
        ("ix_playback_sources_origin", "origin"),
    ):
        op.create_index(index_name, "playback_sources", [column_name])
def downgrade() -> None:
    """Drop the `playback_sources` table."""
    table_name = "playback_sources"
    op.drop_table(table_name)

View file

@ -0,0 +1,41 @@
"""playback_sources.dead_at + dead_reason — flagging dead tube links
Revision ID: 0003_playback_dead
Revises: 0002_playback_sources
Create Date: 2026-05-03
When the resolve endpoint gets a 404 "Video is offline" / "deleted" from porn-app,
we mark that playback_source as dead. The API filters it out of `_build_scene_out`,
and the mobile client does not show it. The has_playback=true filter also requires `dead_at IS NULL`.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0003_playback_dead"
down_revision: str | None = "0002_playback_sources"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add nullable `dead_at` / `dead_reason` columns to `playback_sources` and index `dead_at`."""
    new_columns = (
        sa.Column("dead_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("dead_reason", sa.String(length=512), nullable=True),
    )
    for column in new_columns:
        op.add_column("playback_sources", column)
    op.create_index("ix_playback_sources_dead_at", "playback_sources", ["dead_at"])
def downgrade() -> None:
    """Drop the `dead_at` index, then the `dead_reason` and `dead_at` columns."""
    op.drop_index("ix_playback_sources_dead_at", table_name="playback_sources")
    for column_name in ("dead_reason", "dead_at"):
        op.drop_column("playback_sources", column_name)

View file

@ -0,0 +1,34 @@
"""playback_sources.animated_thumbnail_url — animowane miniaturki dla hold-to-preview
Revision ID: 0004_animated_thumbnail
Revises: 0003_playback_dead
Create Date: 2026-05-04
Mobile (`ScenesScreen`, `MergeQueueScreen`) has hold-to-preview: holding a thumb on the
thumbnail shows an animated webp/gif instead of the static image. The field is optional —
not every tube source provides it; when it is null, mobile falls back to `thumbnail_url`.
Without this column, the endpoints that returned it (admin merge-candidates, scene detail) had
to be artificially restricted (see DEPLOY_BACKLOG.md). After this migration the full
projection in `app/api/admin.py` can be restored.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0004_animated_thumbnail"
down_revision: str | None = "0003_playback_dead"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the nullable `animated_thumbnail_url` column to `playback_sources`."""
    animated_thumbnail = sa.Column(
        "animated_thumbnail_url",
        sa.String(length=2048),
        nullable=True,
    )
    op.add_column("playback_sources", animated_thumbnail)
def downgrade() -> None:
    """Drop `playback_sources.animated_thumbnail_url`."""
    table_name, column_name = "playback_sources", "animated_thumbnail_url"
    op.drop_column(table_name, column_name)

View file

@ -0,0 +1,38 @@
"""favorite_performers — ulubione performerki (single-user, in-app)
Revision ID: 0005_favorite_performers
Revises: 0004_animated_thumbnail
Create Date: 2026-05-04
Single-user system (no users), so the table is simply the set of performer_id values the
user marked as favorites, plus `last_seen_at` so mobile can count how many new
scenes appeared since the last viewing (badge in the toolbar/Favorites screen).
Multi-user support can be added later (user_id column + composite PK) without a breaking change.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0005_favorite_performers"
down_revision: str | None = "0004_animated_thumbnail"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create `favorite_performers`, keyed by `performer_id`.

    Columns:
      * `performer_id` - primary key and FK to `performers.id` (ON DELETE CASCADE).
      * `created_at` / `last_seen_at` - non-null timestamps defaulting to now().
    """
    # Import the dialect submodule explicitly: `import sqlalchemy as sa` alone
    # does not guarantee that `sa.dialects.postgresql` has been loaded, so the
    # attribute chain only worked when something else had imported it first.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_performers",
        sa.Column(
            "performer_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("performers.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
def downgrade() -> None:
    """Drop the `favorite_performers` table."""
    table_name = "favorite_performers"
    op.drop_table(table_name)

View file

@ -0,0 +1,41 @@
"""Blacklists — performers/studios/tags do globalnego ukrywania.
Revision ID: 0006_blacklists
Revises: 0005_favorite_performers
Create Date: 2026-05-04
Single-user; analogous to favorite_performers but negative — scenes that HAVE a
blacklisted performer / studio / tag are excluded from all lists (scenes,
search, performer/tag scenes).
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0006_blacklists"
down_revision: str | None = "0005_favorite_performers"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create the three blacklist tables.

    Each table has the same shape: a UUID primary key that is also a foreign
    key to the parent table's `id` (ON DELETE CASCADE), plus a non-null
    `created_at` timestamp defaulting to now().
    """
    # Import the dialect submodule explicitly: `import sqlalchemy as sa` alone
    # does not guarantee that `sa.dialects.postgresql` has been loaded, so the
    # attribute chain only worked when something else had imported it first.
    from sqlalchemy.dialects import postgresql

    for tbl, parent_tbl, parent_col in [
        ("blacklisted_performers", "performers", "performer_id"),
        ("blacklisted_studios", "studios", "studio_id"),
        ("blacklisted_tags", "tags", "tag_id"),
    ]:
        op.create_table(
            tbl,
            sa.Column(
                parent_col,
                postgresql.UUID(as_uuid=True),
                sa.ForeignKey(f"{parent_tbl}.id", ondelete="CASCADE"),
                primary_key=True,
            ),
            sa.Column(
                "created_at",
                sa.DateTime(timezone=True),
                server_default=sa.func.now(),
                nullable=False,
            ),
        )
def downgrade() -> None:
    """Drop the three blacklist tables, in reverse order of creation."""
    for table_name in (
        "blacklisted_tags",
        "blacklisted_studios",
        "blacklisted_performers",
    ):
        op.drop_table(table_name)

View file

@ -0,0 +1,42 @@
"""scene_play_progress — pozycja odtwarzania per scena (continue watching).
Revision ID: 0007_play_progress
Revises: 0006_blacklists
Create Date: 2026-05-04
Single-user; the table stores recently watched scenes + (when the player reports it) the position
in seconds. The continue-watching rail on home fetches the top-N most recent.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0007_play_progress"
down_revision: str | None = "0006_blacklists"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create `scene_play_progress` (one row per scene) and index `last_played_at`.

    Columns:
      * `scene_id` - primary key and FK to `scenes.id` (ON DELETE CASCADE).
      * `position_sec` - non-null integer, server default 0.
      * `duration_sec` - nullable integer.
      * `finished` - non-null boolean, server default false.
      * `last_played_at` - non-null timestamp defaulting to now().
    """
    # Import the dialect submodule explicitly: `import sqlalchemy as sa` alone
    # does not guarantee that `sa.dialects.postgresql` has been loaded, so the
    # attribute chain only worked when something else had imported it first.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "scene_play_progress",
        sa.Column(
            "scene_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("scenes.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("position_sec", sa.Integer(), nullable=False, server_default="0"),
        sa.Column("duration_sec", sa.Integer(), nullable=True),
        sa.Column("finished", sa.Boolean(), nullable=False, server_default=sa.false()),
        sa.Column(
            "last_played_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
    op.create_index(
        "ix_scene_play_progress_last_played_at",
        "scene_play_progress",
        ["last_played_at"],
    )
def downgrade() -> None:
    """Drop the `last_played_at` index, then the `scene_play_progress` table."""
    table_name = "scene_play_progress"
    op.drop_index("ix_scene_play_progress_last_played_at", table_name=table_name)
    op.drop_table(table_name)

View file

@ -0,0 +1,42 @@
"""Performer.last_searched_at + search_run_count — backfill queue dla per-performer search.
Revision ID: 0008_performer_search_meta
Revises: 0007_play_progress
Create Date: 2026-05-06
The continuous worker iterates performers ORDER BY last_searched_at NULLS FIRST,
search_run_count ASC. Performers that have never been searched go first.
After a full sweep the queue cycles back to the oldest ones.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0008_performer_search_meta"
down_revision: str | None = "0007_play_progress"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add `last_searched_at` / `search_run_count` to `performers` plus the queue-ordering index."""
    search_meta_columns = (
        sa.Column("last_searched_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("search_run_count", sa.Integer(), nullable=False, server_default="0"),
    )
    for column in search_meta_columns:
        op.add_column("performers", column)

    # Index for the queue: NULLS FIRST + search_run_count ASC. A PostgreSQL
    # btree defaults to NULLS FIRST for DESC and NULLS LAST for ASC, so the
    # null ordering is spelled out explicitly.
    create_priority_index = (
        "CREATE INDEX ix_performers_search_priority "
        "ON performers (last_searched_at ASC NULLS FIRST, search_run_count ASC)"
    )
    op.execute(create_priority_index)
def downgrade() -> None:
    """Drop the search-priority index, then the two search metadata columns."""
    op.drop_index("ix_performers_search_priority", table_name="performers")
    for column_name in ("search_run_count", "last_searched_at"):
        op.drop_column("performers", column_name)

View file

@ -0,0 +1,146 @@
"""movies kanon + bliźniacze tabele do scen
Revision ID: 0009_movies
Revises: 0008_performer_search_meta
Create Date: 2026-05-06
Schema for full-length adult films (paradisehill + mirrors). Movies differ from
scenes: 60-180 min runtime, multi-chapter structure, more metadata (director,
year, country, rating). Performers/studios/tags are reused (the same people/studios
appear in scenes and in movies).
New entity_kind: 'movie'. New merge_kind: 'movie'. Movie fingerprints rarely
exist (movies have no industry-standard pHash), so the fingerprint table is
skipped — dedup will use a composite key (title+year+studio+cast Jaccard).
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0009_movies"
down_revision: str | None = "0008_performer_search_meta"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the movie schema.

    Extends the `entity_kind` and `merge_kind` enums with 'movie', then creates
    `movies`, `movie_external_refs`, `movie_performers`, `movie_tags`,
    `movie_chapters` and `movie_playback_sources` with their indexes. `movies`
    is created first because every other table here references it.
    """
    # Extend the enums with 'movie'.
    # NOTE(review): PostgreSQL releases before 12 reject ALTER TYPE ... ADD VALUE
    # inside a transaction block; if such versions must be supported these two
    # statements need an autocommit block. Confirm the minimum supported version.
    op.execute("ALTER TYPE entity_kind ADD VALUE IF NOT EXISTS 'movie'")
    op.execute("ALTER TYPE merge_kind ADD VALUE IF NOT EXISTS 'movie'")
    op.create_table(
        "movies",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("title", sa.String(512), nullable=False),
        sa.Column("title_normalized", sa.String(512), nullable=False),
        sa.Column("slug", sa.String(512)),
        sa.Column("release_year", sa.Integer),
        sa.Column("release_date", sa.Date),
        sa.Column("studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="SET NULL")),
        sa.Column("director", sa.String(256)),
        sa.Column("country", sa.String(64)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("description", sa.Text),
        sa.Column("poster_url", sa.String(2048)),
        sa.Column("backdrop_url", sa.String(2048)),
        # Rating as a float (paradisehill has like_count + a 0-10 rating; we store
        # the averaged rating from the primary source, when available).
        sa.Column("rating", sa.Float),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_movies_title_normalized", "movies", ["title_normalized"])
    # Trigram GIN index on the normalized title, issued as raw SQL.
    op.execute(
        "CREATE INDEX ix_movies_title_normalized_trgm ON movies "
        "USING GIN (title_normalized gin_trgm_ops);"
    )
    op.create_index("ix_movies_release_year", "movies", ["release_year"])
    op.create_index("ix_movies_release_date", "movies", ["release_date"])
    op.create_index("ix_movies_slug", "movies", ["slug"])
    op.create_index("ix_movies_studio_id", "movies", ["studio_id"])
    op.create_index("ix_movies_studio_year", "movies", ["studio_id", "release_year"])
    # External refs use a composite primary key (source_id, external_id).
    op.create_table(
        "movie_external_refs",
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("url", sa.String(1024)),
        sa.Column("first_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_movie_external_refs_movie_id", "movie_external_refs", ["movie_id"])
    # Association tables: composite primary keys over the two foreign keys.
    op.create_table(
        "movie_performers",
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("performer_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("performers.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("role", sa.String(64)),
        sa.Column("position", sa.Integer),
        sa.Column("as_alias", sa.String(256)),
    )
    op.create_table(
        "movie_tags",
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("tag_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("tags.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
    )
    # Chapters — an optional table for movies split into scenes/segments
    # (paradisehill sometimes has timestamp markers, e.g. "Scene 1: 00:00-15:32").
    # Each chapter MAY link to an existing Scene (if that scene is also known
    # on its own from TPDB/StashDB), or it lives only as an anchor within the movie.
    op.create_table(
        "movie_chapters",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("chapter_index", sa.Integer, nullable=False),
        sa.Column("title", sa.String(512)),
        sa.Column("start_sec", sa.Integer),
        sa.Column("end_sec", sa.Integer),
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="SET NULL")),
        sa.UniqueConstraint("movie_id", "chapter_index", name="uq_movie_chapters_movie_id_chapter_index"),
    )
    op.create_index("ix_movie_chapters_movie_id", "movie_chapters", ["movie_id"])
    # Playback sources for movies — analogous to playback_sources, kept as a separate
    # table because we do not want to mix scene_id/movie_id in one FK column. Reuses
    # the origin convention ('paradisehill', 'psyplay:streamporn', 'wp_movies:speedporn', etc.).
    op.create_table(
        "movie_playback_sources",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("origin", sa.String(64), nullable=False),
        sa.Column("page_url", sa.String(2048), nullable=False),
        sa.Column("embed_url", sa.String(2048)),
        sa.Column("stream_url", sa.String(2048)),
        sa.Column("quality", sa.String(16)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("thumbnail_url", sa.String(2048)),
        sa.Column("animated_thumbnail_url", sa.String(2048)),
        sa.Column("last_seen_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("dead_at", sa.DateTime(timezone=True)),
        sa.Column("dead_reason", sa.String(512)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.UniqueConstraint("origin", "page_url", name="uq_movie_playback_sources_origin_page_url"),
    )
    op.create_index("ix_movie_playback_sources_movie_id", "movie_playback_sources", ["movie_id"])
    op.create_index("ix_movie_playback_sources_origin", "movie_playback_sources", ["origin"])
    op.create_index("ix_movie_playback_sources_dead_at", "movie_playback_sources", ["dead_at"])
def downgrade() -> None:
    """Drop the movie tables, dependents first and `movies` last.

    The 'movie' values added to `entity_kind` / `merge_kind` are left in place.
    """
    movie_tables_in_drop_order = (
        "movie_playback_sources",
        "movie_chapters",
        "movie_tags",
        "movie_performers",
        "movie_external_refs",
        "movies",
    )
    for table_name in movie_tables_in_drop_order:
        op.drop_table(table_name)
    # Postgres offers no simple way to remove a value from an enum, so 'movie'
    # stays in entity_kind / merge_kind. The overhead in the enum catalog is
    # small (on the order of bytes per type) and this is safer than attempting
    # a DROP VALUE.

View file

@ -0,0 +1,39 @@
"""favorite_scenes table
Revision ID: 0010_favorite_scenes
Revises: 0009_movies
Create Date: 2026-05-06
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0010_favorite_scenes"
down_revision: str | None = "0009_movies"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create `favorite_scenes` (keyed by `scene_id`) and index `created_at`."""
    scene_pk = sa.Column(
        "scene_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("scenes.id", ondelete="CASCADE"),
        primary_key=True,
    )
    favorited_at = sa.Column(
        "created_at",
        sa.DateTime(timezone=True),
        server_default=sa.text("NOW()"),
        nullable=False,
    )
    op.create_table("favorite_scenes", scene_pk, favorited_at)
    op.create_index("ix_favorite_scenes_created_at", "favorite_scenes", ["created_at"])
def downgrade() -> None:
    """Drop the `favorite_scenes` table."""
    table_name = "favorite_scenes"
    op.drop_table(table_name)

View file

@ -0,0 +1,60 @@
"""playback_sources.origin: rename `pornapp:*` → `tube:*`
Revision ID: 0011_origin_pornapp_to_tube
Revises: 0010_favorite_scenes
Create Date: 2026-05-07
After removing the dependency on the porn-app.com API, the `pornapp:` prefix in `playback_sources.origin`
is a misleading historical name — discovery + stream resolve now go directly through
the direct scrapers and `app.extractors`. We change the prefix to the neutral `tube:` so that the name
reflects the architecture (the sitetag stays unchanged — `tube:hqpornercom` etc.).
Idempotent: the WHERE clause prevents a double rename. Also operates on
`movie_playback_sources` (the analogous column from M5 movies).
The backend `app/api/playback.py` understands both prefixes (`pornapp:` legacy + `tube:`)
during the transition period — after this migration only the `tube:` check needs to remain.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0011_origin_pornapp_to_tube"
down_revision: str | None = "0010_favorite_scenes"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Rewrite the legacy ``pornapp:`` origin prefix to ``tube:`` in both playback tables.

    ``SUBSTRING(origin FROM 9)`` keeps everything after the 8-character
    ``pornapp:`` prefix. The ``LIKE`` filter limits the update to rows that still
    carry the old prefix, so re-running the statement changes nothing.
    """
    statements = (
        "UPDATE playback_sources "
        "SET origin = 'tube:' || SUBSTRING(origin FROM 9) "
        "WHERE origin LIKE 'pornapp:%'",
        "UPDATE movie_playback_sources "
        "SET origin = 'tube:' || SUBSTRING(origin FROM 9) "
        "WHERE origin LIKE 'pornapp:%'",
    )
    for stmt in statements:
        op.execute(sa.text(stmt))
def downgrade() -> None:
    """Rewrite the ``tube:`` origin prefix back to ``pornapp:`` in both playback tables.

    ``SUBSTRING(origin FROM 6)`` keeps everything after the 5-character ``tube:``
    prefix. Every row whose origin starts with ``tube:`` is rewritten.
    """
    statements = (
        "UPDATE playback_sources "
        "SET origin = 'pornapp:' || SUBSTRING(origin FROM 6) "
        "WHERE origin LIKE 'tube:%'",
        "UPDATE movie_playback_sources "
        "SET origin = 'pornapp:' || SUBSTRING(origin FROM 6) "
        "WHERE origin LIKE 'tube:%'",
    )
    for stmt in statements:
        op.execute(sa.text(stmt))

View file

@ -0,0 +1,48 @@
"""favorite_studios — ulubione studia (single-user, in-app)
Revision ID: 0012_favorite_studios
Revises: 0011_origin_pornapp_to_tube
Create Date: 2026-05-08
Mirror `favorite_performers` ze studio_id zamiast performer_id. Single-user, więc
tabelka to po prostu zbiór studio_id które user oznaczył jako ulubione, plus
`last_seen_at` mobile liczy ile nowych scen pojawiło się w danym studio od
ostatniego oglądania (badge w Favorites).
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0012_favorite_studios"
down_revision: str | None = "0011_origin_pornapp_to_tube"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create ``favorite_studios``: one row per favorited studio.

    Columns: ``studio_id`` (UUID primary key, foreign key to ``studios.id`` with
    ``ON DELETE CASCADE``), plus ``created_at`` and ``last_seen_at``, both
    non-null timestamps defaulting to the server's current time.
    """
    # Import the dialect module explicitly: the attribute chain
    # ``sa.dialects.postgresql`` only resolves when some other code has already
    # imported that submodule, which this migration should not depend on.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_studios",
        sa.Column(
            "studio_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("studios.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
def downgrade() -> None:
    """Revert the migration by dropping the ``favorite_studios`` table."""
    table_name = "favorite_studios"
    op.drop_table(table_name)

View file

@ -0,0 +1,52 @@
"""bug_reports — in-app bug reporting (mobile FAB → POST /bug-reports)
Revision ID: 0013_bug_reports
Revises: 0012_favorite_studios
Create Date: 2026-05-09
User wpisuje opis + appka kapturuje screen (react-native-view-shot omija
FLAG_SECURE) wysyła POST. Backend trzyma w tabeli, admin_html ma listę.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0013_bug_reports"
down_revision: str | None = "0012_favorite_studios"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create the ``bug_reports`` table and an index on ``created_at``.

    ``id`` has no server default, so the inserting code must supply it.
    ``scene_id`` is an optional foreign key to ``scenes.id`` that is set to NULL
    when the scene is deleted. ``resolved`` defaults to false on the server.
    """
    # Import the dialect module explicitly: the attribute chain
    # ``sa.dialects.postgresql`` only resolves when some other code has already
    # imported that submodule, which this migration should not depend on.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "bug_reports",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column("screen_name", sa.String(64), nullable=True),
        sa.Column("app_version", sa.String(32), nullable=True),
        sa.Column(
            "scene_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("scenes.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("message", sa.Text, nullable=False),
        sa.Column("screenshot_b64", sa.Text, nullable=True),
        sa.Column("resolved", sa.Boolean, nullable=False, server_default=sa.false()),
    )
    op.create_index("ix_bug_reports_created_at", "bug_reports", ["created_at"])
def downgrade() -> None:
    """Revert the migration: drop the ``created_at`` index, then ``bug_reports``."""
    table_name = "bug_reports"
    op.drop_index("ix_bug_reports_created_at", table_name=table_name)
    op.drop_table(table_name)

View file

@ -0,0 +1,46 @@
"""favorite_movies — single-user favorites + last_seen_at dla NEW badge.
Revision ID: 0014_favorite_movies
Revises: 0013_bug_reports
Create Date: 2026-05-09
Mirror `favorite_studios` z movie_id zamiast studio_id. NEW badge w mobile
liczone client-side: movie.created_at > favorite.last_seen_at.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0014_favorite_movies"
down_revision: str | None = "0013_bug_reports"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create ``favorite_movies``: one row per favorited movie.

    Columns: ``movie_id`` (UUID primary key, foreign key to ``movies.id`` with
    ``ON DELETE CASCADE``), plus ``created_at`` and ``last_seen_at``, both
    non-null timestamps defaulting to the server's current time.
    """
    # Import the dialect module explicitly: the attribute chain
    # ``sa.dialects.postgresql`` only resolves when some other code has already
    # imported that submodule, which this migration should not depend on.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_movies",
        sa.Column(
            "movie_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("movies.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
def downgrade() -> None:
    """Revert the migration by dropping the ``favorite_movies`` table."""
    table_name = "favorite_movies"
    op.drop_table(table_name)

View file

@ -0,0 +1,38 @@
"""bug_reports — dodaj movie_id (FK movies, nullable)
Revision ID: 0015_bug_reports_movie_id
Revises: 0014_favorite_movies
Create Date: 2026-05-10
Mobile Player przekazuje movie_id w nav params jako `sceneId` (legacy hack na
progress tracking, który dla movies zwraca 404 i mobile to ignoruje). Bug-report
flow inserted to przy POST jako scene_id, FK violation crash 500.
Fix: rozszerz tabelę o movie_id, backend smart-routes po lookup (jeśli scene_id
nie istnieje w scenes ALE istnieje w movies, zapisz jako movie_id).
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0015_bug_reports_movie_id"
down_revision: str | None = "0014_favorite_movies"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add a nullable ``movie_id`` column to ``bug_reports``.

    The column is a UUID foreign key to ``movies.id`` that is set to NULL when
    the referenced movie is deleted.
    """
    # Import the dialect module explicitly: the attribute chain
    # ``sa.dialects.postgresql`` only resolves when some other code has already
    # imported that submodule, which this migration should not depend on.
    from sqlalchemy.dialects import postgresql

    op.add_column(
        "bug_reports",
        sa.Column(
            "movie_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("movies.id", ondelete="SET NULL"),
            nullable=True,
        ),
    )
def downgrade() -> None:
    """Revert the migration by dropping ``bug_reports.movie_id``."""
    table_name, column_name = "bug_reports", "movie_id"
    op.drop_column(table_name, column_name)

View file

@ -0,0 +1,44 @@
"""realdebrid_cache — direct stream URL cache dla RD /unrestrict/link wyników
Revision ID: 0016_realdebrid_cache
Revises: 0015_bug_reports_movie_id
Create Date: 2026-05-12
RD direct linki technically valid ~7 dni, ale cache'ujemy 24h (configurable
przez RD_CACHE_TTL_HOURS) żeby oszczędzać API quota przy replay tej samej
sceny.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0016_realdebrid_cache"
down_revision: str | None = "0015_bug_reports_movie_id"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create ``realdebrid_cache`` (keyed by ``hoster_url``) and index ``expires_at``."""
    table_name = "realdebrid_cache"
    columns = [
        sa.Column("hoster_url", sa.Text(), primary_key=True),
        sa.Column("direct_url", sa.Text(), nullable=False),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
    ]
    op.create_table(table_name, *columns)
    op.create_index("ix_realdebrid_cache_expires_at", table_name, ["expires_at"])
def downgrade() -> None:
    """Revert the migration: drop the ``expires_at`` index, then the table."""
    table_name = "realdebrid_cache"
    op.drop_index("ix_realdebrid_cache_expires_at", table_name=table_name)
    op.drop_table(table_name)

View file

@ -0,0 +1,37 @@
"""drop realdebrid_cache table — RD nie wykorzystywany (Hetzner IP blocked)
Revision ID: 0017_drop_realdebrid_cache
Revises: 0016_realdebrid_cache
Create Date: 2026-05-12
Real-Debrid integration cofnięta Hetzner VPS IP blokowany globalnie przez
RD anti-abuse, a 95% relevantnych hosterów (streamtape/playmogo/dood/mixdrop/
filemoon/iceyfile) DOWN lub UNSUPPORTED w RD list. Tylko voe.sx + file
hosters UP, nie pokrywa naszego streaming use case.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: this migration's id and the parent it revises.
revision: str = "0017_drop_realdebrid_cache"
down_revision: str | None = "0016_realdebrid_cache"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Remove the ``realdebrid_cache`` table together with its ``expires_at`` index."""
    table_name = "realdebrid_cache"
    op.drop_index("ix_realdebrid_cache_expires_at", table_name=table_name)
    op.drop_table(table_name)
def downgrade() -> None:
    """Recreate ``realdebrid_cache`` and its ``expires_at`` index."""
    table_name = "realdebrid_cache"
    columns = [
        sa.Column("hoster_url", sa.Text(), primary_key=True),
        sa.Column("direct_url", sa.Text(), nullable=False),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
    ]
    op.create_table(table_name, *columns)
    op.create_index("ix_realdebrid_cache_expires_at", table_name, ["expires_at"])

0
app/__init__.py Normal file
View file

0
app/api/__init__.py Normal file
View file

332
app/api/admin.py Normal file
View file

@ -0,0 +1,332 @@
"""Admin API: lista pending merge candidates + side-by-side detail + resolve."""
from __future__ import annotations
import uuid
from typing import Annotated, Literal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, ConfigDict
from sqlalchemy import func, select
from sqlalchemy.orm import Session
from app.api.scenes import _build_scene_out
from app.api.schemas import SceneOut
from app.auth import require_api_key
from app.db import get_session
from app.models.external_record import ExternalRecord
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, SceneExternalRef
from app.models.source import Source, SourceKind
from app.resolve.scene_merge import MergeError, resolve_candidate
# Router for the admin API: all routes live under /admin and are guarded by the
# `require_api_key` dependency.
router = APIRouter(
    prefix="/admin",
    tags=["admin"],
    dependencies=[Depends(require_api_key)],
)
def _raw_to_thumb(raw: dict, kind: SourceKind) -> str | None:
    """Extract a thumbnail URL from an ``external_records.raw`` payload.

    For TPDB the top-level ``image`` and ``poster`` keys are tried first, then the
    ``large``, ``medium`` and ``full`` entries of the ``background`` mapping.
    For StashDB the ``screenshot``, ``image`` and ``preview`` entries of the
    ``paths`` mapping are tried. Only string values starting with ``http`` are
    accepted; every candidate key is tried in order, so one unusable entry does
    not hide a usable later one.

    Args:
        raw: Raw payload stored for the external record.
        kind: Source the payload came from.

    Returns:
        The first acceptable URL, or ``None`` when the payload has none (or the
        source kind is not handled here).
    """

    def _first_http(mapping: dict, keys: tuple[str, ...]) -> str | None:
        # Return the first value under `keys` that looks like an http(s) URL.
        for key in keys:
            value = mapping.get(key)
            if isinstance(value, str) and value.startswith("http"):
                return value
        return None

    if kind == SourceKind.tpdb:
        url = _first_http(raw, ("image", "poster"))
        if url is not None:
            return url
        background = raw.get("background")
        if isinstance(background, dict):
            return _first_http(background, ("large", "medium", "full"))
    elif kind == SourceKind.stashdb:
        # StashDB scene responses expose images via a separate query, which is not
        # stored in raw at the moment. TODO: mirror it into `paths.screenshot`
        # during ingest.
        paths = raw.get("paths")
        if isinstance(paths, dict):
            return _first_http(paths, ("screenshot", "image", "preview"))
    return None
# ---- schemas --------------------------------------------------------------
class MergeCandidateSummary(BaseModel):
    """One row of the merge-candidate list."""

    # Allow building the model from objects' attributes as well as from dicts.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    kind: str
    left_id: uuid.UUID
    right_id: uuid.UUID
    score: float
    status: str
    # Preview fields; they stay None when list_candidates found nothing for a side.
    left_title: str | None = None
    right_title: str | None = None
    left_thumbnail_url: str | None = None
    left_animated_thumbnail_url: str | None = None
    right_thumbnail_url: str | None = None
    right_animated_thumbnail_url: str | None = None
class MergeCandidateListOut(BaseModel):
    """Paginated response of ``GET /admin/merge-candidates``."""

    items: list[MergeCandidateSummary]
    # Number of candidates matching the filters, across all pages.
    total: int
    page: int
    per_page: int
class MergeCandidateDetail(BaseModel):
    """Side-by-side detail of a single merge candidate."""

    id: uuid.UUID
    kind: str
    score: float
    status: str
    reasons: dict
    # Full scene payloads; None when get_candidate did not build that side.
    left: SceneOut | None
    right: SceneOut | None
class ResolveBody(BaseModel):
    """Request body of the resolve endpoint."""

    action: Literal["merge", "reject"]
    # Which side survives a merge; defaults to the left one.
    keep: Literal["left", "right"] = "left"
    resolved_by: str | None = None
class ResolveResult(BaseModel):
    """Response of the resolve endpoint."""

    id: uuid.UUID
    status: str
    # Populated only for action="merge" (see resolve()); None otherwise.
    keep_id: uuid.UUID | None = None
    drop_id: uuid.UUID | None = None
# ---- endpoints ------------------------------------------------------------
@router.get("/merge-candidates", response_model=MergeCandidateListOut)
def list_candidates(
    session: Annotated[Session, Depends(get_session)],
    status: Annotated[str, Query(pattern="^(pending|auto_merged|merged|rejected|all)$")] = "pending",
    kind: Annotated[str, Query(pattern="^(scene|performer|studio|all)$")] = "scene",
    page: Annotated[int, Query(ge=1)] = 1,
    per_page: Annotated[int, Query(ge=1, le=200)] = 50,
) -> MergeCandidateListOut:
    """List merge candidates, best score first, with titles and thumbnails for scenes.

    Args:
        session: Database session.
        status: Candidate status to filter on, or ``"all"``.
        kind: Candidate kind to filter on, or ``"all"``.
        page: 1-based page number.
        per_page: Page size (1-200).

    Returns:
        The requested page plus the total number of matching candidates.
    """
    base = select(MergeCandidate)
    if status != "all":
        base = base.where(MergeCandidate.status == MergeStatus(status))
    if kind != "all":
        base = base.where(MergeCandidate.kind == MergeKind(kind))

    total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()
    rows = (
        session.execute(
            # `id` is the final tie-breaker: score and created_at are not unique,
            # and OFFSET/LIMIT over a non-unique ordering can repeat or skip rows
            # between pages.
            base.order_by(
                MergeCandidate.score.desc(),
                MergeCandidate.created_at.desc(),
                MergeCandidate.id,
            )
            .offset((page - 1) * per_page)
            .limit(per_page)
        )
        .scalars()
        .all()
    )

    # Pre-fetch scene titles (for kind=scene rows) for a convenient preview.
    titles: dict[uuid.UUID, str] = {}
    scene_ids = {r.left_id for r in rows if r.kind == MergeKind.scene} | {
        r.right_id for r in rows if r.kind == MergeKind.scene
    }
    if scene_ids:
        for sid, title in session.execute(
            select(Scene.id, Scene.title).where(Scene.id.in_(scene_ids))
        ):
            titles[sid] = title

    # Pre-fetch one static and one animated thumbnail per scene (the mobile queue
    # uses the static one in the list and the animated one on hold-to-preview).
    # The first non-empty URL encountered wins; playback_sources row order is not
    # guaranteed, which is good enough for triage.
    thumbs: dict[uuid.UUID, str] = {}
    animated_thumbs: dict[uuid.UUID, str] = {}
    if scene_ids:
        for sid, static_url, animated_url in session.execute(
            select(
                PlaybackSource.scene_id,
                PlaybackSource.thumbnail_url,
                PlaybackSource.animated_thumbnail_url,
            ).where(PlaybackSource.scene_id.in_(scene_ids))
        ):
            if static_url and sid not in thumbs:
                thumbs[sid] = static_url
            if animated_url and sid not in animated_thumbs:
                animated_thumbs[sid] = animated_url

    # Fallback for TPDB/StashDB-only scenes (no playback source): take the poster
    # URL from external_records.raw. Without this most canonical TPDB<->StashDB
    # pairs in the queue would have no thumbnail.
    missing = [sid for sid in scene_ids if sid not in thumbs]
    if missing:
        ext_rows = session.execute(
            select(SceneExternalRef.scene_id, ExternalRecord.raw, Source.kind)
            .join(
                ExternalRecord,
                (ExternalRecord.source_id == SceneExternalRef.source_id)
                & (ExternalRecord.external_id == SceneExternalRef.external_id),
            )
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id.in_(missing))
            .where(ExternalRecord.entity_kind == "scene")
        ).all()
        # `source_kind` is named so it does not shadow the `kind` query parameter.
        for sid, raw, source_kind in ext_rows:
            if sid in thumbs or not isinstance(raw, dict):
                continue
            url = _raw_to_thumb(raw, source_kind)
            if url:
                thumbs[sid] = url

    items = [
        MergeCandidateSummary(
            id=r.id,
            kind=r.kind.value,
            left_id=r.left_id,
            right_id=r.right_id,
            score=r.score,
            status=r.status.value,
            left_title=titles.get(r.left_id),
            right_title=titles.get(r.right_id),
            left_thumbnail_url=thumbs.get(r.left_id),
            right_thumbnail_url=thumbs.get(r.right_id),
            left_animated_thumbnail_url=animated_thumbs.get(r.left_id),
            right_animated_thumbnail_url=animated_thumbs.get(r.right_id),
        )
        for r in rows
    ]
    return MergeCandidateListOut(items=items, total=total, page=page, per_page=per_page)
@router.get("/merge-candidates/{candidate_id}", response_model=MergeCandidateDetail)
def get_candidate(
    candidate_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> MergeCandidateDetail:
    """Return one merge candidate with both scenes expanded.

    Raises:
        HTTPException: 404 when no candidate has the given id.
    """
    candidate = session.get(MergeCandidate, candidate_id)
    if candidate is None:
        raise HTTPException(status_code=404, detail="merge candidate not found")

    sides: dict[str, SceneOut | None] = {"left": None, "right": None}
    if candidate.kind == MergeKind.scene:
        scene_l = session.get(Scene, candidate.left_id)
        scene_r = session.get(Scene, candidate.right_id)
        if scene_l is not None:
            sides["left"] = _build_scene_out(session, scene_l)
        # The right side is left empty when it is missing or is the same scene as
        # the left id.
        if not (scene_r is None or scene_r.id == candidate.left_id):
            sides["right"] = _build_scene_out(session, scene_r)

    return MergeCandidateDetail(
        id=candidate.id,
        kind=candidate.kind.value,
        score=candidate.score,
        status=candidate.status.value,
        reasons=candidate.reasons or {},
        left=sides["left"],
        right=sides["right"],
    )
@router.post("/merge-candidates/{candidate_id}/resolve", response_model=ResolveResult)
def resolve(
    candidate_id: uuid.UUID,
    body: ResolveBody,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveResult:
    """Merge or reject a candidate and report which side was kept and dropped.

    Raises:
        HTTPException: 400 carrying the ``MergeError`` message when resolving fails.
    """
    keeping_left = body.keep == "left"
    try:
        cand = resolve_candidate(
            session,
            candidate_id=candidate_id,
            action=body.action,
            keep_left=keeping_left,
            resolved_by=body.resolved_by,
        )
    except MergeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # keep/drop ids are only meaningful for an actual merge.
    if body.action != "merge":
        kept = dropped = None
    elif keeping_left:
        kept, dropped = cand.left_id, cand.right_id
    else:
        kept, dropped = cand.right_id, cand.left_id
    return ResolveResult(id=cand.id, status=cand.status.value, keep_id=kept, drop_id=dropped)
# ---- Bandwidth monitor -----------------------------------------------------
class BandwidthCdnRow(BaseModel):
    """Byte count for a single CDN within one reporting window."""

    cdn: str
    bytes: int
    # Human-readable rendering of `bytes` (bandwidth_stats fills it via _fmt_bytes).
    pretty: str
class BandwidthStats(BaseModel):
    """Per-CDN bytes-out from the VPS proxy (rolling buckets). Restarting the API resets them.

    Hetzner data is present only when HETZNER_API_TOKEN and HETZNER_SERVER_ID are set in the env."""

    last_1h: list[BandwidthCdnRow]
    last_24h: list[BandwidthCdnRow]
    last_7d: list[BandwidthCdnRow]
    total_bytes_1h: int
    total_bytes_24h: int
    total_bytes_7d: int
    hetzner: dict | None = None
def _fmt_bytes(b: int) -> str:
if b < 1024:
return f"{b} B"
val = float(b)
for u in ("KB", "MB", "GB", "TB"):
val /= 1024
if val < 1024:
return f"{val:.2f} {u}"
return f"{val:.2f} PB"
@router.get("/bandwidth", response_model=BandwidthStats)
def bandwidth_stats() -> BandwidthStats:
    """Per-CDN VPS proxy bytes-out plus cached Hetzner traffic stats.

    Critical for the public release: shows where VPS bandwidth is leaking, so
    Mixdrop and other bandwidth-heavy CDNs can be spotted before a Hetzner
    overage charge.

    Returns:
        Per-CDN rows and totals for the last 1 h, 24 h and 7 d. ``hetzner`` holds
        the cached traffic document when the Hetzner settings are present and the
        cache file contains a JSON object; otherwise it is ``None``.
    """
    import json
    import logging
    from pathlib import Path

    from app.api.stream_proxy import get_bandwidth_stats
    from app.config import get_settings

    def _rows(stats: dict[str, int]) -> list[BandwidthCdnRow]:
        return [
            BandwidthCdnRow(cdn=cdn, bytes=b, pretty=_fmt_bytes(b))
            for cdn, b in stats.items()
        ]

    s_1h = get_bandwidth_stats(1)
    s_24h = get_bandwidth_stats(24)
    s_7d = get_bandwidth_stats(168)

    # Hetzner stats are loaded from a cache file (written by the
    # check_hetzner_traffic.py cron). This is best-effort: a missing or broken
    # cache must never fail the endpoint.
    hetzner_data = None
    settings = get_settings()
    if settings.hetzner_api_token and settings.hetzner_server_id:
        cache_path = Path("/tmp/hetzner_traffic.json")
        loaded = None
        try:
            loaded = json.loads(cache_path.read_text())
        except FileNotFoundError:
            # The cache has not been written yet; nothing to report.
            pass
        except Exception:
            logging.getLogger(__name__).warning(
                "could not read Hetzner traffic cache %s", cache_path, exc_info=True
            )
        # The response model only accepts a JSON object; ignore anything else
        # instead of failing response validation.
        if isinstance(loaded, dict):
            hetzner_data = loaded

    return BandwidthStats(
        last_1h=_rows(s_1h),
        last_24h=_rows(s_24h),
        last_7d=_rows(s_7d),
        total_bytes_1h=sum(s_1h.values()),
        total_bytes_24h=sum(s_24h.values()),
        total_bytes_7d=sum(s_7d.values()),
        hetzner=hetzner_data,
    )

206
app/api/admin_html.py Normal file
View file

@ -0,0 +1,206 @@
"""htmx + Jinja2 admin UI dla MergeCandidate triage.
Endpointy:
GET /ui/ lista pending (filter status)
GET /ui/candidate/{id} side-by-side scen
POST /ui/candidate/{id}/resolve htmx form submit (action=merge_keep_left|merge_keep_right|reject)
zwraca fragment HTML z potwierdzeniem
"""
from __future__ import annotations
import uuid
from pathlib import Path
from typing import Annotated
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from sqlalchemy import func, select
from sqlalchemy.orm import Session
from app.api.scenes import _build_scene_out
from app.auth import require_api_key
from app.db import get_session
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
from app.models.scene import Scene
from app.resolve.scene_merge import MergeError, resolve_candidate
# Both directories are resolved relative to this file: two levels up, then
# `templates` / `static`.
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
_STATIC_DIR = Path(__file__).resolve().parent.parent / "static"
# Jinja2 environment used by every view in this module.
templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))
def _score_class(score: float) -> str:
if score >= 0.92:
return "high"
if score >= 0.75:
return "mid"
return "low"
# Expose the score bucketing helper to templates as `score_class(...)`.
templates.env.globals["score_class"] = _score_class
# Router for the HTML admin UI: all routes live under /ui and are guarded by the
# `require_api_key` dependency.
router = APIRouter(
    prefix="/ui",
    tags=["ui"],
    dependencies=[Depends(require_api_key)],
)
@router.get("/", response_class=HTMLResponse)
def list_view(
    request: Request,
    session: Annotated[Session, Depends(get_session)],
    status: Annotated[str, Query(pattern="^(pending|auto_merged|merged|rejected|all)$")] = "pending",
    page: Annotated[int, Query(ge=1)] = 1,
) -> HTMLResponse:
    """Render the scene merge-candidate list, 50 per page, best score first.

    Args:
        request: Incoming request, passed through to the template.
        session: Database session.
        status: Candidate status to filter on, or ``"all"``.
        page: 1-based page number.

    Returns:
        The rendered ``candidates_list.html`` template.
    """
    per_page = 50
    base = select(MergeCandidate).where(MergeCandidate.kind == MergeKind.scene)
    if status != "all":
        base = base.where(MergeCandidate.status == MergeStatus(status))

    total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()
    rows = (
        session.execute(
            # `id` is the final tie-breaker: score and created_at are not unique,
            # and OFFSET/LIMIT over a non-unique ordering can repeat or skip rows
            # between pages.
            base.order_by(
                MergeCandidate.score.desc(),
                MergeCandidate.created_at.desc(),
                MergeCandidate.id,
            )
            .offset((page - 1) * per_page)
            .limit(per_page)
        )
        .scalars()
        .all()
    )

    # Look up the titles of every scene referenced on this page in one query.
    titles: dict[uuid.UUID, str] = {}
    scene_ids = {r.left_id for r in rows} | {r.right_id for r in rows}
    if scene_ids:
        for sid, title in session.execute(
            select(Scene.id, Scene.title).where(Scene.id.in_(scene_ids))
        ):
            titles[sid] = title

    items = [
        {
            "id": r.id,
            "kind": r.kind.value,
            "left_id": r.left_id,
            "right_id": r.right_id,
            "score": r.score,
            "status": r.status.value,
            "left_title": titles.get(r.left_id),
            "right_title": titles.get(r.right_id),
        }
        for r in rows
    ]
    label_map = {
        "pending": "Pending",
        "auto_merged": "Auto-merged",
        "merged": "Merged",
        "rejected": "Rejected",
        "all": "All",
    }
    return templates.TemplateResponse(
        request,
        "candidates_list.html",
        {
            "items": items,
            "total": total,
            "page": page,
            "per_page": per_page,
            "status": status,
            "status_label": label_map[status],
        },
    )
@router.get("/candidate/{candidate_id}", response_class=HTMLResponse)
def detail_view(
    candidate_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the side-by-side detail page for one merge candidate.

    Raises:
        HTTPException: 404 when no candidate has the given id.
    """
    candidate = session.get(MergeCandidate, candidate_id)
    if candidate is None:
        raise HTTPException(status_code=404, detail="merge candidate not found")

    left_payload = None
    right_payload = None
    if candidate.kind == MergeKind.scene:
        scene_l = session.get(Scene, candidate.left_id)
        scene_r = session.get(Scene, candidate.right_id)
        if scene_l is not None:
            left_payload = _build_scene_out(session, scene_l)
        # The right side is left empty when it is missing or is the same scene as
        # the left id.
        if not (scene_r is None or scene_r.id == candidate.left_id):
            right_payload = _build_scene_out(session, scene_r)

    context = {
        "cand": {
            "id": candidate.id,
            "kind": candidate.kind.value,
            "score": candidate.score,
            "status": candidate.status.value,
            "reasons": candidate.reasons or {},
            "left": left_payload,
            "right": right_payload,
            "left_id": candidate.left_id,
            "right_id": candidate.right_id,
        },
    }
    return templates.TemplateResponse(request, "candidate_detail.html", context)
@router.post("/candidate/{candidate_id}/resolve", response_class=HTMLResponse)
def resolve_form(
    candidate_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
    action: Annotated[str, Form()],
) -> HTMLResponse:
    """Handle the htmx resolve form and return an HTML fragment with the outcome.

    Args:
        candidate_id: Candidate to resolve.
        session: Database session.
        action: One of ``merge_keep_left``, ``merge_keep_right`` or ``reject``.

    Returns:
        A confirmation fragment, or an error fragment with status 400 when
        resolving raises ``MergeError``.

    Raises:
        HTTPException: 400 when ``action`` is not one of the accepted values.
    """
    import html

    if action not in {"merge_keep_left", "merge_keep_right", "reject"}:
        raise HTTPException(status_code=400, detail=f"invalid action: {action}")

    api_action = "reject" if action == "reject" else "merge"
    keep_left = action != "merge_keep_right"
    try:
        resolve_candidate(
            session,
            candidate_id=candidate_id,
            action=api_action,
            keep_left=keep_left,
            resolved_by="ui",
        )
    except MergeError as exc:
        # Escape the message: it is spliced into raw HTML and must not be able to
        # inject markup.
        return HTMLResponse(
            f'<div class="card" id="actions" style="border-color: var(--bad);">'
            f"<strong>error:</strong> {html.escape(str(exc))}</div>",
            status_code=400,
        )

    label = {
        "merge_keep_left": "Merged into LEFT",
        "merge_keep_right": "Merged into RIGHT",
        "reject": "Rejected (kept both)",
    }[action]
    return HTMLResponse(
        f'<div class="card" id="actions" style="border-color: var(--good);">'
        f"<strong>{label}.</strong> "
        f'<a href="/ui/">← back to list</a></div>'
    )
def mount_static(app) -> None:  # pragma: no cover - dev convenience
    """Register the APK MIME type and mount the static directory at ``/static``.

    Without the MIME registration the Android browser does not treat the file as
    an installable APK (it is served as text/plain and merely downloaded instead
    of triggering the install prompt). The registration is idempotent at process
    level, so calling this on every startup is safe. The mount is skipped when the
    static directory does not exist.
    """
    import mimetypes

    mimetypes.add_type("application/vnd.android.package-archive", ".apk")
    if not _STATIC_DIR.exists():
        return
    static_files = StaticFiles(directory=str(_STATIC_DIR))
    app.mount("/static", static_files, name="static")

116
app/api/blacklist.py Normal file
View file

@ -0,0 +1,116 @@
"""Blacklists — globalnie ukryte performerki/studia/tagi.
Sceny które MAJĄ blacklisted entity wypadają z każdego /scenes (pełna lista, search,
performer scenes, tag scenes). Auto-apply w `app/api/scenes.py`.
Endpointy:
GET /blacklist wszystkie 3 listy w jednym response
POST /blacklist/{kind}/{entity_id} dodaj (idempotent)
DELETE /blacklist/{kind}/{entity_id} usuń
`kind` ∈ {performer, studio, tag}.
"""
from __future__ import annotations
import uuid
from typing import Annotated, Literal
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.db import get_session
from app.models.blacklist import (
BlacklistedPerformer,
BlacklistedStudio,
BlacklistedTag,
)
from app.models.performer import Performer
from app.models.studio import Studio
from app.models.tag import Tag
# Router for blacklist management: all routes live under /blacklist and are
# guarded by the `require_api_key` dependency.
router = APIRouter(
    prefix="/blacklist", tags=["blacklist"], dependencies=[Depends(require_api_key)]
)
# The entity kinds accepted in the `{kind}` path segment.
Kind = Literal["performer", "studio", "tag"]
class BlacklistEntry(BaseModel):
    """A single blacklisted entity as returned by ``GET /blacklist``."""

    id: uuid.UUID
    name: str  # canonical_name (performer) / name (studio/tag)
    slug: str | None = None
class BlacklistOut(BaseModel):
    """All three blacklists in one response."""

    performers: list[BlacklistEntry]
    studios: list[BlacklistEntry]
    tags: list[BlacklistEntry]
@router.get("", response_model=BlacklistOut)
def list_blacklist(
    session: Annotated[Session, Depends(get_session)],
) -> BlacklistOut:
    """Return the blacklisted performers, studios and tags, each sorted by name."""

    def _entries(rows) -> list[BlacklistEntry]:
        # Each row is an (id, name, slug) triple.
        return [
            BlacklistEntry(id=ident, name=label, slug=slug)
            for ident, label, slug in rows
        ]

    performer_rows = session.execute(
        select(BlacklistedPerformer.performer_id, Performer.canonical_name, Performer.slug)
        .join(Performer, Performer.id == BlacklistedPerformer.performer_id)
        .order_by(Performer.canonical_name)
    ).all()
    studio_rows = session.execute(
        select(BlacklistedStudio.studio_id, Studio.name, Studio.slug)
        .join(Studio, Studio.id == BlacklistedStudio.studio_id)
        .order_by(Studio.name)
    ).all()
    tag_rows = session.execute(
        select(BlacklistedTag.tag_id, Tag.name, Tag.slug)
        .join(Tag, Tag.id == BlacklistedTag.tag_id)
        .order_by(Tag.name)
    ).all()

    return BlacklistOut(
        performers=_entries(performer_rows),
        studios=_entries(studio_rows),
        tags=_entries(tag_rows),
    )
def _kind_to_entity(kind: Kind):
    """Resolve a kind name to ``(blacklist model, parent model, fk attribute name)``.

    Raises:
        HTTPException: 400 when ``kind`` is not performer, studio or tag.
    """
    dispatch = {
        "performer": (BlacklistedPerformer, Performer, "performer_id"),
        "studio": (BlacklistedStudio, Studio, "studio_id"),
        "tag": (BlacklistedTag, Tag, "tag_id"),
    }
    entry = dispatch.get(kind)
    if entry is None:
        raise HTTPException(status_code=400, detail="kind must be performer|studio|tag")
    return entry
@router.post("/{kind}/{entity_id}", status_code=status.HTTP_200_OK)
def add_blacklist(
    kind: Kind,
    entity_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict:
    """Blacklist an entity; repeating the call for the same entity is a no-op.

    Returns:
        ``{"kind", "id", "created"}`` where ``created`` is False when the entity
        was already blacklisted.

    Raises:
        HTTPException: 404 when the entity itself does not exist.
    """
    bl_cls, parent_cls, fk_name = _kind_to_entity(kind)
    if session.get(parent_cls, entity_id) is None:
        raise HTTPException(status_code=404, detail=f"{kind} not found")

    already_listed = session.get(bl_cls, entity_id) is not None
    if not already_listed:
        session.add(bl_cls(**{fk_name: entity_id}))
        session.commit()
    return {"kind": kind, "id": str(entity_id), "created": not already_listed}
@router.delete("/{kind}/{entity_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_blacklist(
    kind: Kind,
    entity_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove an entity from the blacklist; a missing entry is silently ignored."""
    bl_cls = _kind_to_entity(kind)[0]
    existing = session.get(bl_cls, entity_id)
    if existing is not None:
        session.delete(existing)
        session.commit()

155
app/api/bug_reports.py Normal file
View file

@ -0,0 +1,155 @@
"""Bug reports — mobile FAB → POST /bug-reports → admin lista przez admin_html.
POST nie wymaga obecnego scene_id (user może raportować z FavoritesScreen,
SearchScreen itp.). Screenshot opcjonalny niektóre ekrany nie warto kapturować.
Limit body 1.5MB (FastAPI default jest hojny, ale dla rozsądku ograniczamy).
Screenshot to PNG/JPEG z react-native-view-shot, base64 typowe rozmiary:
- mały ekran scene-list: ~200-400KB
- duży scene-detail z thumbnail: ~600KB-1MB
"""
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field
from sqlalchemy import desc, func, select
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.db import get_session
from app.models.bug_report import BugReport
from app.models.movie import Movie
from app.models.scene import Scene
# Router for bug reports; every route is guarded by the `require_api_key` dependency.
router = APIRouter(tags=["bug-reports"], dependencies=[Depends(require_api_key)])
_MAX_SCREENSHOT_BYTES = 1_500_000  # raw base64 chars; ~1.1MB binary after decoding
class BugReportCreate(BaseModel):
    """Request body of ``POST /bug-reports``."""

    message: str = Field(min_length=1, max_length=5000)
    screen_name: str | None = Field(default=None, max_length=64)
    app_version: str | None = Field(default=None, max_length=32)
    # May actually hold a movie id; create_bug_report checks both tables.
    scene_id: uuid.UUID | None = None
    # Base64 text, capped at _MAX_SCREENSHOT_BYTES characters.
    screenshot_b64: str | None = Field(default=None, max_length=_MAX_SCREENSHOT_BYTES)
class BugReportOut(BaseModel):
    """A bug report as listed by ``GET /bug-reports`` (without the screenshot itself)."""

    id: uuid.UUID
    created_at: datetime
    screen_name: str | None
    app_version: str | None
    scene_id: uuid.UUID | None
    movie_id: uuid.UUID | None
    message: str
    # True when a screenshot is stored; it is fetched via the screenshot endpoint.
    has_screenshot: bool
    resolved: bool
class BugReportListOut(BaseModel):
    """Response of ``GET /bug-reports``."""

    items: list[BugReportOut]
    # Number of reports matching the filter, independent of limit/offset.
    total: int
@router.post("/bug-reports", status_code=status.HTTP_201_CREATED)
def create_bug_report(
    payload: BugReportCreate,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str]:
    """Store a bug report and return its id.

    Args:
        payload: Report contents.
        session: Database session.

    Returns:
        ``{"id": <new report id>}``.

    Raises:
        HTTPException: 422 when the message is blank once surrounding whitespace
            is removed.
    """
    # The schema's min_length check runs before stripping, so a whitespace-only
    # message would otherwise be stored as an empty string.
    message = payload.message.strip()
    if not message:
        raise HTTPException(status_code=422, detail="message must not be blank")

    # Smart-route the entity id: the mobile Player uses the `sceneId` param for
    # both scenes and movies (legacy progress-tracking hack). Without this the
    # INSERT hit an FK violation and crashed with a 500 (reported 2026-05-10).
    # Check both tables.
    scene_id: uuid.UUID | None = None
    movie_id: uuid.UUID | None = None
    if payload.scene_id is not None:
        if session.get(Scene, payload.scene_id) is not None:
            scene_id = payload.scene_id
        elif session.get(Movie, payload.scene_id) is not None:
            movie_id = payload.scene_id
        # else: the id no longer exists anywhere (deleted); leave both as None.

    br = BugReport(
        id=uuid.uuid4(),
        message=message,
        screen_name=payload.screen_name,
        app_version=payload.app_version,
        scene_id=scene_id,
        movie_id=movie_id,
        screenshot_b64=payload.screenshot_b64,
    )
    session.add(br)
    session.commit()
    return {"id": str(br.id)}
@router.get("/bug-reports", response_model=BugReportListOut)
def list_bug_reports(
    session: Annotated[Session, Depends(get_session)],
    limit: int = 50,
    offset: int = 0,
    include_resolved: bool = False,
) -> BugReportListOut:
    """List bug reports, newest first.

    Args:
        session: Database session.
        limit: Maximum number of reports to return; must not be negative.
        offset: Number of reports to skip; must not be negative.
        include_resolved: When False (the default) resolved reports are omitted.

    Returns:
        The requested slice plus the total number of matching reports.

    Raises:
        HTTPException: 422 when ``limit`` or ``offset`` is negative.
    """
    # Reject negative paging values here: passed through to the database they
    # fail the query and surface as a 500.
    if limit < 0 or offset < 0:
        raise HTTPException(status_code=422, detail="limit and offset must be >= 0")

    q = select(BugReport).order_by(desc(BugReport.created_at))
    cnt_q = select(func.count(BugReport.id))
    if not include_resolved:
        q = q.where(BugReport.resolved.is_(False))
        cnt_q = cnt_q.where(BugReport.resolved.is_(False))

    rows = session.scalars(q.limit(limit).offset(offset)).all()
    total = session.scalar(cnt_q) or 0
    items = [
        BugReportOut(
            id=r.id,
            created_at=r.created_at,
            screen_name=r.screen_name,
            app_version=r.app_version,
            scene_id=r.scene_id,
            movie_id=r.movie_id,
            message=r.message,
            has_screenshot=bool(r.screenshot_b64),
            resolved=r.resolved,
        )
        for r in rows
    ]
    return BugReportListOut(items=items, total=total)
@router.get("/bug-reports/{bug_id}/screenshot")
def get_bug_report_screenshot(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str | None]:
    """Return the base64-encoded screenshot (if any); the admin UI renders it.

    Raises:
        HTTPException: 404 when no report has the given id.
    """
    report = session.get(BugReport, bug_id)
    if report is not None:
        return {"screenshot_b64": report.screenshot_b64}
    raise HTTPException(status_code=404, detail="not found")
@router.post("/bug-reports/{bug_id}/resolve")
def resolve_bug_report(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str]:
    """Mark a bug report as resolved.

    Raises:
        HTTPException: 404 when no report has the given id.
    """
    report = session.get(BugReport, bug_id)
    if report is not None:
        report.resolved = True
        session.commit()
        return {"status": "resolved"}
    raise HTTPException(status_code=404, detail="not found")
@router.delete("/bug-reports/{bug_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_bug_report(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Delete a bug report.

    Raises:
        HTTPException: 404 when no report has the given id.
    """
    report = session.get(BugReport, bug_id)
    if report is not None:
        session.delete(report)
        session.commit()
        return
    raise HTTPException(status_code=404, detail="not found")

104
app/api/expo_updates.py Normal file
View file

@ -0,0 +1,104 @@
"""Expo Updates serving endpoints (OTA JS bundle distribution).
Mobile sprawdza `/expo-updates/manifest` przy każdym launch (lub on-foreground).
Serwer zwraca aktualny manifest dla danego `expo-runtime-version`. Mobile pobiera
launchAsset (bundle) + assets, zapisuje, restartuje aplikację z nowym bundle.
Każdy update wgrany przez `scripts/publish_update.py` ląduje w
`app/static/expo-updates/<runtime>/<update_id>/`. Plik
`app/static/expo-updates/<runtime>/current.json` wskazuje aktywny update_id.
Endpointy PUBLICZNE (no auth) Expo Updates SDK nie wstrzykuje X-API-Key.
Bezpieczeństwo opiera się na TLS pinningu (mobile ufa tylko naszej self-signed
cert SPKI z network_security_config) ktoś bez tego pinu nie podstawi MITM
manifestu. Jeśli kiedyś trzeba twardo: dorobić expo-updates code signing key.
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from fastapi import APIRouter, Header, HTTPException, Query
from fastapi.responses import FileResponse, JSONResponse, Response
# Module-level logger.
log = logging.getLogger(__name__)
# Router holding the Expo Updates endpoints; no auth dependency is attached here.
router = APIRouter(tags=["expo-updates"])
# Root directory of published updates: <package dir>/static/expo-updates,
# resolved relative to this file's location.
_STATIC_DIR = Path(__file__).resolve().parent.parent / "static" / "expo-updates"
@router.get("/expo-updates/manifest")
def get_manifest(
    expo_runtime_version: str | None = Header(default=None, alias="expo-runtime-version"),
    expo_platform: str | None = Header(default=None, alias="expo-platform"),
) -> Response:
    """Return the current update manifest for the requested runtime version.

    The runtime version comes from the ``expo-runtime-version`` header and
    defaults to ``"1.0"``. ``expo-platform`` is accepted but not used to pick
    the manifest.

    Lookup: ``<static>/<runtime>/current.json`` names the active ``update_id``;
    the manifest is read from ``<static>/<runtime>/<update_id>/manifest.json``
    and returned as a single JSON document (no multipart).

    Returns:
        The manifest as JSON, or an empty 204 response when there is no usable
        update for this runtime (missing/unreadable/malformed files).

    Raises:
        HTTPException: 400 when the runtime version would resolve outside the
            updates directory. The header is client-controlled, so it must not
            be allowed to steer file reads elsewhere on disk.
    """
    runtime = expo_runtime_version or "1.0"

    static_root = _STATIC_DIR.resolve()
    try:
        runtime_dir = (static_root / runtime).resolve()
    except ValueError as e:  # e.g. embedded NUL byte
        raise HTTPException(status_code=400, detail="invalid runtime version") from e
    # The runtime directory must be strictly inside the updates root; this
    # rejects "..", absolute paths and "." alike.
    if static_root not in runtime_dir.parents:
        raise HTTPException(status_code=400, detail="invalid runtime version")

    current_file = runtime_dir / "current.json"
    if not current_file.exists():
        return Response(status_code=204)
    try:
        current = json.loads(current_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("expo-updates: bad current.json for runtime=%s: %s", runtime, e)
        return Response(status_code=204)

    # current.json must be an object with a non-empty string update_id;
    # anything else is treated as "no update" instead of crashing.
    update_id = current.get("update_id") if isinstance(current, dict) else None
    if not isinstance(update_id, str) or not update_id:
        return Response(status_code=204)

    manifest_file = runtime_dir / update_id / "manifest.json"
    if not manifest_file.exists():
        log.warning("expo-updates: current points to missing update %s", update_id)
        return Response(status_code=204)
    try:
        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        log.error("expo-updates: bad manifest.json for %s: %s", update_id, e)
        return Response(status_code=204)

    return JSONResponse(
        manifest,
        headers={
            "expo-protocol-version": "1",
            "expo-sfv-version": "0",
            "cache-control": "private, max-age=0",
            "content-type": "application/json; charset=utf-8",
        },
    )
@router.get("/expo-updates/asset")
def get_asset(
    asset: str = Query(..., description="Relative path do pliku w runtime dir"),
    runtimeVersion: str = Query("1.0"),
    platform: str = Query("android"),
) -> Response:
    """Serve a single asset (JS bundle, image, font) from an update directory.

    ``asset`` is a path relative to ``static/expo-updates/<runtimeVersion>/``,
    typically ``<update_id>/_expo/static/js/android/<hash>.js`` or
    ``<update_id>/assets/<hash>``. ``platform`` is accepted but not used.

    Both ``runtimeVersion`` and ``asset`` are client-controlled, so each is
    resolved and checked for containment by path components:
    the runtime directory must lie strictly inside the updates root, and the
    target must lie inside the runtime directory. A plain string-prefix
    comparison is not sufficient (``1.0`` is a prefix of ``1.0-other``), and an
    absolute ``runtimeVersion`` would otherwise replace the root entirely.

    Raises:
        HTTPException: 400 when either value escapes its directory or cannot
            be resolved; 404 when the target is missing or not a regular file.
    """
    static_root = _STATIC_DIR.resolve()
    try:
        runtime_dir = (static_root / runtimeVersion).resolve()
        target = (runtime_dir / asset).resolve()
    except ValueError as e:  # e.g. embedded NUL byte in a query value
        raise HTTPException(status_code=400, detail="invalid asset path") from e

    if static_root not in runtime_dir.parents:
        raise HTTPException(status_code=400, detail="invalid runtime version")
    if not target.is_relative_to(runtime_dir):
        raise HTTPException(status_code=400, detail="invalid asset path")
    if not target.exists() or not target.is_file():
        raise HTTPException(status_code=404, detail="asset not found")

    # Bundles get an explicit JavaScript content type; for everything else
    # media_type stays None and FileResponse picks the type itself.
    media_type = None
    if target.suffix in (".js", ".bundle"):
        media_type = "application/javascript"
    return FileResponse(target, media_type=media_type)

457
app/api/favorites.py Normal file
View file

@ -0,0 +1,457 @@
"""Favorites — ulubione performerki + studia + liczenie nowych scen.
Single-user (brak users), więc API zwraca/operuje na global zbiorze. Multi-user
można dodać dorzuceniem `user_id` query/header bez breaking change.
Endpointy (performers `/favorites/...` zostawione żeby nie łamać starego mobile):
GET /favorites lista ulubionych performerek
POST /favorites/{performer_id} dodaj (idempotent)
DELETE /favorites/{performer_id} usuń
POST /favorites/{performer_id}/seen mark-as-seen (zeruje badge)
Endpointy (studios):
GET /favorites/studios lista ulubionych studiów
POST /favorites/studios/{studio_id} dodaj
DELETE /favorites/studios/{studio_id} usuń
POST /favorites/studios/{studio_id}/seen mark-as-seen
"Nowa scena" = scena której Scene.created_at > favorite.last_seen_at:
- dla performerki: ScenePerformer.performer_id = X
- dla studio: Scene.studio_id = X
"""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy import func, select
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.db import get_session
from app.models.favorite_movie import FavoriteMovie
from app.models.favorite_performer import FavoritePerformer
from app.models.favorite_studio import FavoriteStudio
from app.models.movie import Movie
from app.models.performer import Performer
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, ScenePerformer
from app.models.studio import Studio
# Router for all /favorites endpoints; require_api_key is attached as a
# router-level dependency, so it applies to every route registered below.
router = APIRouter(
    prefix="/favorites", tags=["favorites"], dependencies=[Depends(require_api_key)]
)
class FavoriteOut(BaseModel):
    """One favorite performer as returned by GET /favorites."""

    performer_id: uuid.UUID
    canonical_name: str
    slug: str | None
    scene_count: int  # scenes of this performer counted by list_favorites
    new_count: int  # scenes since last_seen_at
    last_seen_at: datetime
    created_at: datetime  # creation time of the favorite row, not of the performer
class FavoriteListOut(BaseModel):
    """Response body of GET /favorites."""

    items: list[FavoriteOut]
    total: int  # number of items
    new_total: int  # sum of new_count over all items — for the toolbar badge
@router.get("", response_model=FavoriteListOut)
def list_favorites(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteListOut:
    """List favorite performers with total and "new" scene counts.

    Favorites are ordered by the performer's canonical name. Both counts only
    include scenes that have at least one playback source with
    ``dead_at IS NULL``; the original rationale is that metadata-only stubs
    inserted by the TPDB/StashDB sync would otherwise inflate the badge with
    scenes the user cannot actually play.

    ``new_count`` is the number of such scenes whose ``Scene.created_at`` is
    later than that favorite's own ``last_seen_at``. It is computed in a
    single grouped query joined against the favorites table (each favorite
    has a different ``last_seen_at``), instead of fetching one row per scene
    and counting in Python.

    Returns:
        Items, their count, and ``new_total`` (sum of all ``new_count``).
    """
    from sqlalchemy import and_, exists

    rows = session.execute(
        select(FavoritePerformer, Performer)
        .join(Performer, Performer.id == FavoritePerformer.performer_id)
        .order_by(Performer.canonical_name)
    ).all()
    if not rows:
        return FavoriteListOut(items=[], total=0, new_total=0)

    perf_ids = [perf.id for _, perf in rows]

    # Correlated EXISTS: the scene has at least one live playback source.
    live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == ScenePerformer.scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    )

    scene_counts: dict = dict(
        session.execute(
            select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
            .where(ScenePerformer.performer_id.in_(perf_ids))
            .where(live_playback)
            .group_by(ScenePerformer.performer_id)
        ).all()
    )

    # Joining on FavoritePerformer both restricts the result to favorite
    # performers and supplies the per-performer last_seen_at threshold.
    # Scenes with a NULL created_at never satisfy the comparison, so they are
    # skipped, as before.
    new_counts: dict = dict(
        session.execute(
            select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
            .select_from(ScenePerformer)
            .join(Scene, Scene.id == ScenePerformer.scene_id)
            .join(
                FavoritePerformer,
                FavoritePerformer.performer_id == ScenePerformer.performer_id,
            )
            .where(Scene.created_at > FavoritePerformer.last_seen_at)
            .where(live_playback)
            .group_by(ScenePerformer.performer_id)
        ).all()
    )

    items: list[FavoriteOut] = []
    new_total = 0
    for fav, perf in rows:
        nc = new_counts.get(perf.id, 0)
        new_total += nc
        items.append(
            FavoriteOut(
                performer_id=perf.id,
                canonical_name=perf.canonical_name,
                slug=perf.slug,
                scene_count=scene_counts.get(perf.id, 0),
                new_count=nc,
                last_seen_at=fav.last_seen_at,
                created_at=fav.created_at,
            )
        )
    return FavoriteListOut(items=items, total=len(items), new_total=new_total)
class FavoriteAddOut(BaseModel):
    """Response body of POST /favorites/{performer_id}."""

    performer_id: uuid.UUID
    created: bool  # False when the performer was already a favorite
@router.post(
    "/{performer_id}",
    response_model=FavoriteAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteAddOut:
    """Add a performer to favorites; adding an existing favorite is a no-op.

    Returns:
        ``created=True`` when a new favorite row was inserted, ``False`` when
        the performer was already a favorite.

    Raises:
        HTTPException: 404 when the performer does not exist.
    """
    if session.get(Performer, performer_id) is None:
        raise HTTPException(status_code=404, detail="performer not found")

    already_favorite = session.get(FavoritePerformer, performer_id) is not None
    if not already_favorite:
        session.add(FavoritePerformer(performer_id=performer_id))
        session.commit()
    return FavoriteAddOut(performer_id=performer_id, created=not already_favorite)
@router.delete("/{performer_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a performer from favorites.

    Idempotent: when the performer is not a favorite there is nothing to
    delete and the call still succeeds.
    """
    favorite = session.get(FavoritePerformer, performer_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
class SeenOut(BaseModel):
    """Response body of POST /favorites/{performer_id}/seen."""

    performer_id: uuid.UUID
    last_seen_at: datetime  # value stored by the mark-as-seen call
@router.post("/{performer_id}/seen", response_model=SeenOut)
def mark_seen(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenOut:
    """Set the favorite's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the performer is not in favorites.
    """
    favorite = session.get(FavoritePerformer, performer_id)
    if favorite is not None:
        favorite.last_seen_at = datetime.now(UTC)
        session.commit()
        return SeenOut(performer_id=performer_id, last_seen_at=favorite.last_seen_at)
    raise HTTPException(status_code=404, detail="not in favorites")
# ---------- Studios ----------
class FavoriteStudioOut(BaseModel):
    """One favorite studio as returned by GET /favorites/studios."""

    studio_id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
    scene_count: int  # scenes of this studio counted by list_favorite_studios
    new_count: int  # scenes created after last_seen_at
    last_seen_at: datetime
    created_at: datetime  # creation time of the favorite row, not of the studio
class FavoriteStudioListOut(BaseModel):
    """Response body of GET /favorites/studios."""

    items: list[FavoriteStudioOut]
    total: int  # number of items
    new_total: int  # sum of new_count over all items
@router.get("/studios", response_model=FavoriteStudioListOut)
def list_favorite_studios(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteStudioListOut:
    """List favorite studios with total and "new" scene counts.

    Favorites are ordered by studio name. Both counts only include scenes
    that have at least one playback source with ``dead_at IS NULL``, so the
    numbers reflect scenes that can actually be played.

    ``new_count`` is the number of such scenes whose ``Scene.created_at`` is
    later than that favorite's own ``last_seen_at``. It is computed in a
    single grouped query joined against the favorites table, instead of
    fetching one row per scene and counting in Python.

    Returns:
        Items, their count, and ``new_total`` (sum of all ``new_count``).
    """
    from sqlalchemy import and_, exists

    rows = session.execute(
        select(FavoriteStudio, Studio)
        .join(Studio, Studio.id == FavoriteStudio.studio_id)
        .order_by(Studio.name)
    ).all()
    if not rows:
        return FavoriteStudioListOut(items=[], total=0, new_total=0)

    studio_ids = [st.id for _, st in rows]

    # Correlated EXISTS: the scene has at least one live playback source.
    live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == Scene.id,
            PlaybackSource.dead_at.is_(None),
        )
    )

    scene_counts: dict = dict(
        session.execute(
            select(Scene.studio_id, func.count(Scene.id))
            .where(Scene.studio_id.in_(studio_ids))
            .where(live_playback)
            .group_by(Scene.studio_id)
        ).all()
    )

    # Joining on FavoriteStudio both restricts the result to favorite studios
    # and supplies the per-studio last_seen_at threshold. Scenes with a NULL
    # created_at never satisfy the comparison, so they are skipped, as before.
    new_counts: dict = dict(
        session.execute(
            select(Scene.studio_id, func.count(Scene.id))
            .select_from(Scene)
            .join(FavoriteStudio, FavoriteStudio.studio_id == Scene.studio_id)
            .where(Scene.created_at > FavoriteStudio.last_seen_at)
            .where(live_playback)
            .group_by(Scene.studio_id)
        ).all()
    )

    items: list[FavoriteStudioOut] = []
    new_total = 0
    for fav, st in rows:
        nc = new_counts.get(st.id, 0)
        new_total += nc
        items.append(
            FavoriteStudioOut(
                studio_id=st.id,
                name=st.name,
                slug=st.slug,
                network=st.network,
                scene_count=scene_counts.get(st.id, 0),
                new_count=nc,
                last_seen_at=fav.last_seen_at,
                created_at=fav.created_at,
            )
        )
    return FavoriteStudioListOut(items=items, total=len(items), new_total=new_total)
class FavoriteStudioAddOut(BaseModel):
    """Response body of POST /favorites/studios/{studio_id}."""

    studio_id: uuid.UUID
    created: bool  # False when the studio was already a favorite
@router.post(
    "/studios/{studio_id}",
    response_model=FavoriteStudioAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite_studio(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteStudioAddOut:
    """Add a studio to favorites; adding an existing favorite is a no-op.

    Returns:
        ``created=True`` when a new favorite row was inserted, ``False`` when
        the studio was already a favorite.

    Raises:
        HTTPException: 404 when the studio does not exist.
    """
    if session.get(Studio, studio_id) is None:
        raise HTTPException(status_code=404, detail="studio not found")

    already_favorite = session.get(FavoriteStudio, studio_id) is not None
    if not already_favorite:
        session.add(FavoriteStudio(studio_id=studio_id))
        session.commit()
    return FavoriteStudioAddOut(studio_id=studio_id, created=not already_favorite)
@router.delete("/studios/{studio_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite_studio(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a studio from favorites; succeeds even when it was not a favorite."""
    favorite = session.get(FavoriteStudio, studio_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
class SeenStudioOut(BaseModel):
    """Response body of POST /favorites/studios/{studio_id}/seen."""

    studio_id: uuid.UUID
    last_seen_at: datetime  # value stored by the mark-as-seen call
@router.post("/studios/{studio_id}/seen", response_model=SeenStudioOut)
def mark_studio_seen(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenStudioOut:
    """Set the favorite studio's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the studio is not in favorites.
    """
    favorite = session.get(FavoriteStudio, studio_id)
    if favorite is not None:
        favorite.last_seen_at = datetime.now(UTC)
        session.commit()
        return SeenStudioOut(studio_id=studio_id, last_seen_at=favorite.last_seen_at)
    raise HTTPException(status_code=404, detail="not in favorites")
# ── Favorite movies ────────────────────────────────────────────────────────
# Movies have no per-favorite child scenes (unlike performers/studios), so
# `last_seen_at` is not used for a NEW count here — it only tracks when the
# user last looked. Mobile drives the NEW badge in the /movies list from a
# SEPARATE global last_seen kept in AsyncStorage (client-side, no backend state).
class FavoriteMovieOut(BaseModel):
    """One favorite movie as returned by GET /favorites/movies."""

    movie_id: uuid.UUID
    title: str
    slug: str | None
    poster_url: str | None
    release_year: int | None
    studio_name: str | None  # None when the movie has no matching studio row
    last_seen_at: datetime
    created_at: datetime  # creation time of the favorite row, not of the movie
class FavoriteMovieListOut(BaseModel):
    """Response body of GET /favorites/movies."""

    items: list[FavoriteMovieOut]
    total: int  # number of items
@router.get("/movies", response_model=FavoriteMovieListOut)
def list_favorite_movies(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteMovieListOut:
    """List favorite movies ordered by title, each with its studio name if any."""
    query = (
        select(FavoriteMovie, Movie, Studio)
        .join(Movie, Movie.id == FavoriteMovie.movie_id)
        # Outer join: a movie without a studio still appears, with studio=None.
        .outerjoin(Studio, Studio.id == Movie.studio_id)
        .order_by(Movie.title)
    )

    items: list[FavoriteMovieOut] = []
    for favorite, movie, studio in session.execute(query).all():
        studio_name = studio.name if studio else None
        items.append(
            FavoriteMovieOut(
                movie_id=movie.id,
                title=movie.title,
                slug=movie.slug,
                poster_url=movie.poster_url,
                release_year=movie.release_year,
                studio_name=studio_name,
                last_seen_at=favorite.last_seen_at,
                created_at=favorite.created_at,
            )
        )
    return FavoriteMovieListOut(items=items, total=len(items))
class FavoriteMovieAddOut(BaseModel):
    """Response body of POST /favorites/movies/{movie_id}."""

    movie_id: uuid.UUID
    created: bool  # False when the movie was already a favorite
@router.post(
    "/movies/{movie_id}",
    response_model=FavoriteMovieAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteMovieAddOut:
    """Add a movie to favorites; adding an existing favorite is a no-op.

    Returns:
        ``created=True`` when a new favorite row was inserted, ``False`` when
        the movie was already a favorite.

    Raises:
        HTTPException: 404 when the movie does not exist.
    """
    if session.get(Movie, movie_id) is None:
        raise HTTPException(status_code=404, detail="movie not found")

    already_favorite = session.get(FavoriteMovie, movie_id) is not None
    if not already_favorite:
        session.add(FavoriteMovie(movie_id=movie_id))
        session.commit()
    return FavoriteMovieAddOut(movie_id=movie_id, created=not already_favorite)
@router.delete("/movies/{movie_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a movie from favorites; succeeds even when it was not a favorite."""
    favorite = session.get(FavoriteMovie, movie_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
class SeenMovieOut(BaseModel):
    """Response body of POST /favorites/movies/{movie_id}/seen."""

    movie_id: uuid.UUID
    last_seen_at: datetime  # value stored by the mark-as-seen call
@router.post("/movies/{movie_id}/seen", response_model=SeenMovieOut)
def mark_movie_seen(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenMovieOut:
    """Set the favorite movie's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the movie is not in favorites.
    """
    favorite = session.get(FavoriteMovie, movie_id)
    if favorite is not None:
        favorite.last_seen_at = datetime.now(UTC)
        session.commit()
        return SeenMovieOut(movie_id=movie_id, last_seen_at=favorite.last_seen_at)
    raise HTTPException(status_code=404, detail="not in favorites")

275
app/api/movies.py Normal file
View file

@ -0,0 +1,275 @@
"""GET /movies — lista i szczegóły filmów."""
from __future__ import annotations
import uuid
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy import exists, func, select
from sqlalchemy.orm import Session
from app.api.schemas import (
ExternalRefOut,
MovieChapterOut,
MovieListOut,
MovieOut,
PerformerOut,
PlaybackSourceOut,
StudioOut,
TagOut,
)
from app.auth import require_api_key
from app.db import get_session
from app.models.movie import (
Movie,
MovieChapter,
MovieExternalRef,
MoviePerformer,
MovieTag,
)
from app.models.favorite_movie import FavoriteMovie
from app.models.movie_playback_source import MoviePlaybackSource
from app.models.performer import Performer
from app.models.source import Source
from app.models.studio import Studio
from app.models.tag import Tag
# Router for the /movies endpoints; require_api_key applies to every route on it.
router = APIRouter(prefix="/movies", tags=["movies"], dependencies=[Depends(require_api_key)])
# Values accepted by the `sort` query parameter of list_movies.
_VALID_SORTS = {"created_at", "release_year", "release_date", "title", "rating"}
def _split_csv(raw: str | None) -> list[str]:
if not raw:
return []
return [s.strip() for s in raw.split(",") if s.strip()]
@router.get("", response_model=MovieListOut)
def list_movies(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Title search (trgm)"),
    studio_slugs: str | None = Query(default=None, description="Comma-separated studio slugs (OR)"),
    tags: str | None = Query(default=None, description="Comma-separated tag slugs (AND)"),
    performer_ids: str | None = Query(default=None, description="Comma-separated performer UUIDs (AND)"),
    year_from: int | None = Query(default=None, ge=1900, le=2100),
    year_to: int | None = Query(default=None, ge=1900, le=2100),
    has_playback: bool | None = Query(default=None),
    sort: str = Query(default="created_at"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> MovieListOut:
    """Return one page of movies matching the given filters.

    Filters combine with AND. ``studio_slugs`` matches any listed studio;
    ``tags`` and ``performer_ids`` require every listed value. ``q`` is a
    case-insensitive substring match on the normalized title, with LIKE
    wildcards in the user's text treated literally. ``has_playback=True``
    keeps only movies with at least one playback source whose ``dead_at`` is
    NULL; ``False`` and ``None`` apply no playback filter.

    Every sort order ends with ``Movie.id`` as a tie-breaker so that
    LIMIT/OFFSET pagination is deterministic when the leading sort keys are
    equal.

    Returns:
        The page of movies plus ``total`` (matches ignoring pagination).

    Raises:
        HTTPException: 400 for an unknown ``sort`` or a malformed performer UUID.
    """
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")

    base = select(Movie)
    if q:
        # Escape the escape character first, then the LIKE wildcards, so the
        # search text is matched literally.
        needle = q.lower().replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Movie.title_normalized.ilike(f"%{needle}%", escape="\\"))

    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug_list:
        base = base.where(
            Movie.studio_id.in_(select(Studio.id).where(Studio.slug.in_(studio_slug_list)))
        )

    # One EXISTS per tag slug: the movie must carry every requested tag.
    for slug in _split_csv(tags):
        base = base.where(
            exists(
                select(1).select_from(MovieTag).join(Tag, Tag.id == MovieTag.tag_id)
                .where(MovieTag.movie_id == Movie.id, Tag.slug == slug)
            )
        )

    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # One EXISTS per performer: the movie must feature every requested one.
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1).select_from(MoviePerformer).where(
                        MoviePerformer.movie_id == Movie.id,
                        MoviePerformer.performer_id == pid,
                    )
                )
            )

    if year_from is not None:
        base = base.where(Movie.release_year >= year_from)
    if year_to is not None:
        base = base.where(Movie.release_year <= year_to)

    if has_playback is True:
        base = base.where(
            exists(
                select(1).where(
                    MoviePlaybackSource.movie_id == Movie.id,
                    MoviePlaybackSource.dead_at.is_(None),
                )
            )
        )

    # Count before ordering/pagination are applied.
    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()

    newest_first = Movie.created_at.desc()
    orderings = {
        "created_at": (newest_first,),
        "release_year": (Movie.release_year.desc().nulls_last(), newest_first),
        "release_date": (Movie.release_date.desc().nulls_last(), newest_first),
        "title": (Movie.title_normalized.asc(),),
        "rating": (Movie.rating.desc().nulls_last(), newest_first),
    }
    base = base.order_by(*orderings.get(sort, ()), Movie.id)

    base = base.limit(per_page).offset((page - 1) * per_page)
    movies = session.execute(base).scalars().all()
    # NOTE: _movie_to_out issues several queries per movie, so a full page
    # costs a multiple of per_page round-trips.
    items = [_movie_to_out(session, m) for m in movies]
    return MovieListOut(items=items, total=total, page=page, per_page=per_page)
# Movie playback origin policy — module-level (it used to be defined inline on
# every request; code-review #19 — performance hit and cleaner code).
# Ranking established ad hoc on 2026-05-09 (extract_stream_from_hoster run on
# 5 random samples per origin). Earlier entries are preferred.
_MOVIE_PREFERRED_ORIGINS = (
    "mangoporn:luluvid",  # KVS, works
    "mangoporn:mixdrop",  # may work after the domain fix
    "mangoporn:voe",  # yt-dlp sometimes handles it
    "mangoporn",
    "streamporn",
    "pandamovies",
)
# File hosters that can NEVER be stream-extracted without a premium account —
# filtered out entirely (they cluttered the watch-options list, bug report
# 2026-05-15). Streamtape was restored on 2026-05-15 — it has a dedicated
# extractor and ~5% of its URLs are alive.
_MOVIE_DROP_ORIGINS = frozenset({
    "mangoporn:rapidgator",
    "mangoporn:nitroflare",
    "mangoporn:frdl",
})
# Raw landing origins that are hidden when sub-hosters exist (see the comment
# in _movie_to_out).
_MOVIE_LANDING_HIDE = frozenset({"mangoporn", "pandamovies", "streamporn"})
def _movie_origin_priority(origin: str) -> int:
    """Return the sort rank of a playback origin (lower sorts first).

    Origins listed in ``_MOVIE_PREFERRED_ORIGINS`` rank by their position in
    that tuple; every other origin gets the neutral rank 500.
    """
    if origin in _MOVIE_PREFERRED_ORIGINS:
        return _MOVIE_PREFERRED_ORIGINS.index(origin)
    # Neutral (paradisehill, unclassified mangoporn:* origins).
    return 500
@router.get("/{movie_id}", response_model=MovieOut)
def get_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> MovieOut:
    """Return the full details of a single movie.

    Raises:
        HTTPException: 404 when the movie does not exist.
    """
    found = session.get(Movie, movie_id)
    if found is not None:
        return _movie_to_out(session, found)
    raise HTTPException(status_code=404, detail="movie not found")
def _movie_to_out(session: Session, movie: Movie) -> MovieOut:
    """Build the API representation of a movie, loading its related rows.

    Loads studio, performers (ordered by position, NULLs last), tags (by
    name), chapters (by index), external refs and live playback sources, and
    reports whether the movie is currently a favorite.
    """
    # Studio (optional).
    studio_row = session.get(Studio, movie.studio_id) if movie.studio_id else None
    studio_out: StudioOut | None = (
        StudioOut.model_validate(studio_row) if studio_row is not None else None
    )

    # Performers together with the alias they are credited under.
    credited = session.execute(
        select(Performer, MoviePerformer.as_alias)
        .join(MoviePerformer, MoviePerformer.performer_id == Performer.id)
        .where(MoviePerformer.movie_id == movie.id)
        .order_by(MoviePerformer.position.asc().nulls_last())
    ).all()
    performers = []
    for performer, credited_as in credited:
        performers.append(
            PerformerOut(
                id=performer.id,
                canonical_name=performer.canonical_name,
                slug=performer.slug,
                gender=performer.gender.value if performer.gender else None,
                as_alias=credited_as,
            )
        )

    # Tags.
    tag_query = (
        select(Tag).join(MovieTag, MovieTag.tag_id == Tag.id)
        .where(MovieTag.movie_id == movie.id)
        .order_by(Tag.name.asc())
    )
    tags = [TagOut.model_validate(tag) for tag in session.execute(tag_query).scalars().all()]

    # Chapters.
    chapter_query = (
        select(MovieChapter).where(MovieChapter.movie_id == movie.id)
        .order_by(MovieChapter.chapter_index.asc())
    )
    chapters = [
        MovieChapterOut.model_validate(chapter)
        for chapter in session.execute(chapter_query).scalars().all()
    ]

    # External references, each labelled with its source name.
    ref_query = (
        select(MovieExternalRef, Source.name)
        .join(Source, Source.id == MovieExternalRef.source_id)
        .where(MovieExternalRef.movie_id == movie.id)
    )
    external_refs = []
    for ref, source_name in session.execute(ref_query).all():
        external_refs.append(
            ExternalRefOut(
                source=source_name,
                external_id=ref.external_id,
                url=ref.url,
                last_seen=ref.last_seen,
            )
        )

    # Live playback sources, newest first, minus origins that are always dropped.
    live_sources = session.execute(
        select(MoviePlaybackSource)
        .where(MoviePlaybackSource.movie_id == movie.id)
        .where(MoviePlaybackSource.dead_at.is_(None))
        .order_by(MoviePlaybackSource.created_at.desc())
    ).scalars().all()
    candidates = [src for src in live_sources if src.origin not in _MOVIE_DROP_ORIGINS]

    # Bug report 2026-05-16: raw landing origins (`mangoporn`/`pandamovies`/
    # `streamporn` WITHOUT a `:host` suffix) opened a WebView full of
    # full-screen ads and confused the user. Raw landing entries are hidden
    # WHEN the same movie has at least one sub-host entry (origin contains
    # `:`). If the movie has no sub-hosters (because the theme HTML changed or
    # the regex missed), the landing entry is kept as a last resort.
    if any(":" in src.origin for src in candidates):
        candidates = [src for src in candidates if src.origin not in _MOVIE_LANDING_HIDE]

    # Stable sort: within one priority the newest-first order is preserved.
    candidates.sort(key=lambda src: _movie_origin_priority(src.origin))
    playback_sources = [PlaybackSourceOut.model_validate(src) for src in candidates]

    favorite_row = session.get(FavoriteMovie, movie.id)

    return MovieOut(
        id=movie.id,
        title=movie.title,
        slug=movie.slug,
        release_year=movie.release_year,
        release_date=movie.release_date,
        duration_sec=movie.duration_sec,
        description=movie.description,
        director=movie.director,
        country=movie.country,
        rating=movie.rating,
        poster_url=movie.poster_url,
        backdrop_url=movie.backdrop_url,
        studio=studio_out,
        performers=performers,
        tags=tags,
        chapters=chapters,
        external_refs=external_refs,
        playback_sources=playback_sources,
        created_at=movie.created_at,
        is_favorite=favorite_row is not None,
    )

540
app/api/playback.py Normal file
View file

@ -0,0 +1,540 @@
"""POST /scenes/{scene_id}/playback/{playback_id}/resolve — rozwiązuje stream URL.
Mobile apka woła ten endpoint na klik "Watch" backend ekstraktuje świeży
stream URL (m3u8/mp4) z page tube'a i zwraca go. Mobile otwiera URL przez
Linking.openURL Android player chooser (MX Player / VLC / browser).
Stream URLs podpisane/expire (zwykle ~kilka godzin) nie cache'ujemy ich
w DB, tylko resolve on-demand. Logika ekstrakcji per-tube w `app.extractors`.
**Dead-link detection**: gdy hoster embed page mówi "Video deleted/not found",
oznaczamy `PlaybackSource.dead_at = now()` API dalej go nie listuje, mobile
nie pokaże martwego buttonu.
"""
from __future__ import annotations
import logging
import re
import uuid
from datetime import UTC, datetime
from typing import Annotated, Any
from fastapi import APIRouter, Depends, HTTPException, Request, status
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.api.schemas import PlaybackSourceOut
from app.auth import require_api_key
from app.db import get_session
from app.extractors import (
HosterDead,
StreamSource,
TubePageError,
extract_stream_from_hoster,
try_extract,
)
from app.models.playback_source import PlaybackSource
# Module-level logger.
log = logging.getLogger(__name__)
# Router for the /scenes playback endpoints; require_api_key applies to every route on it.
router = APIRouter(prefix="/scenes", tags=["playback"], dependencies=[Depends(require_api_key)])
# CDN-domain allowlist for mobile direct fetch — the token IS time-bound (not IP-bound),
# verified with a cross-IP curl test on 2026-05-18. Mobile ExoPlayer downloads the
# manifest + segments directly from the CDN, **zero VPS bandwidth**. Critical for a
# public release (TB+/month).
#
# Verified time-bound:
# - xvideos-cdn.com, xnxx-cdn.com (WGCZ Holding) — signed token in path + exp_time
# - phncdn.com (pornhub), ypncdn.com (youporn), rdtcdn.com (redtube) — validfrom+validto+hash
# - privatehost.com (pornhat CDN) — sign + exp_time, no Referer requirement
# - sxyprn.com — signed path
# - eporner.com CDN — IP literal in the path, but the CDN ignores it
#
# NOT in the allowlist (IP-bound, require the proxy):
# - premilkyway.com (latestpornvideo) — 403 cross-IP
# - tnmr.org (mypornerleak) — 403 cross-IP
# - porntrex.com/get_file — single-use token (410 on reuse)
# - freshporno.org/get_file — cv= signed token, IP-bound
# - sn.porn-xp.com, porn00.org — force_proxy explicit
_TIME_BOUND_CDN_RE = re.compile(
    r"\b(?:"
    r"xvideos-cdn|xnxx-cdn|phncdn|ypncdn|rdtcdn"  # mainstream
    r"|privatehost"  # pornhat
    r")\.[a-z]{2,4}"
    r"|(?:^|/)(?:sxyprn\.com|[\w\-]+\.eporner\.com)/",
    re.IGNORECASE,
)
# IP-BOUND CDN signature — the token is bound to the requester's IP, so a cross-IP
# fetch returns 403. Requires the mobile WebView fallback (mobile extracts from the
# phone's session, not from the VPS). Shared KVS infrastructure across multiple
# hosters (luluvid movies, mypornerleak, latestpornvideo) — they all use the same
# CDN pool.
_IP_BOUND_CDN_RE = re.compile(
    r"\b(?:"
    r"premilkyway\.com"  # latestpornvideo
    r"|tnmr\.org"  # mypornerleak legacy + luluvid movies (cdn-tnmr.org)
    r"|acek-cdn\.com"  # mypornerleak current
    r")\b",
    re.IGNORECASE,
)
class StreamLink(BaseModel):
    """A single stream-URL variant (different quality / container).

    `stream_url` = URL of the video file (mp4/m3u8/webm) — a proxy-wrapped URL
    served through the backend VPS (`/proxy/{token}/play.ext`). Safe fallback when
    the CDN binds the URL to the extractor's IP (e.g. fpo.xxx with the
    kt_remote_ips cookie). Bandwidth goes through the VPS.

    `direct_url` + `headers` = raw CDN URL plus the headers for fetching it
    directly from the device. Most tube CDNs (xhamster, redtube, watchporn,
    eporner) return the content correctly when the mobile player sends the
    `Referer` + `User-Agent` from `headers`. Mobile tries direct FIRST; when the
    CDN returns 403/410 (IP-bound), it falls back to `stream_url` (proxy). This
    yields zero VPS bandwidth for most scenes.

    `embed_url` = URL of an embed/hoster page (HTML, e.g. StreamWish, doodporn) —
    mobile opens it in a WebView. Type: 'hoster'.
    """

    stream_url: str | None = None
    embed_url: str | None = None
    direct_url: str | None = None
    headers: dict[str, str] | None = None
    quality: str | None = None
    type: str | None = None  # mime/ext, e.g. 'video/mp4', 'application/x-mpegURL'
    raw: dict[str, Any] | None = None
class ResolveOut(BaseModel):
    """Response body of the playback resolve endpoints."""

    source: PlaybackSourceOut
    best: StreamLink | None = None
    links: list[StreamLink] = []
# Separate router for movie playback routes under /movies; require_api_key
# applies to every route on it.
movies_router = APIRouter(
    prefix="/movies", tags=["movies-playback"], dependencies=[Depends(require_api_key)]
)
@movies_router.post("/{movie_id}/playback/{playback_id}/resolve", response_model=ResolveOut)
def resolve_movie_playback(
    movie_id: uuid.UUID,
    playback_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveOut:
    """Resolve a movie playback source into playable links.

    Movie analogue of `/scenes/{id}/playback/{pb}/resolve`.

    Origin patterns:
    - 'paradisehill': only `page_url` is usable (the Yii2 player requires a
      login session), so the mobile client receives `page_url` as `embed_url`
      and opens it in a WebView.
    - 'mangoporn:host', 'streamporn:host', 'pandamovies:host': `embed_url` is the
      hoster's embed URL (doodstream/lulustream/rpmplay/etc.). We try to
      extract a direct stream URL locally with the generic packer
      (`extract_stream_from_hoster`) and fall back to embed-only when that
      fails. The mobile PlayerScreen.WebViewMode then extracts the URL with JS
      (same as for scenes).

    Raises:
        HTTPException 404: the source does not exist or belongs to another movie.
        HTTPException 410: the source is already marked dead, or the hoster
            reported the video as removed (the source is marked dead first).
        HTTPException 502: no link could be produced.
    """
    from app.models.movie_playback_source import MoviePlaybackSource

    pb = session.get(MoviePlaybackSource, playback_id)
    if pb is None or pb.movie_id != movie_id:
        raise HTTPException(status_code=404, detail="movie playback source not found")
    if pb.dead_at is not None:
        raise HTTPException(
            status_code=410,
            detail=f"playback dead: {pb.dead_reason or 'unknown'}",
        )
    referer = pb.page_url
    links: list[StreamLink] = []
    if pb.origin == "paradisehill":
        # WebView fallback only — the paradisehill player needs a login session
        # to serve the stream.
        links = [
            StreamLink(
                stream_url=None,
                embed_url=pb.page_url,
                quality=pb.quality,
                type="hoster",
                raw={"origin": pb.origin},
            )
        ]
    else:
        # dooplay mirror sources: try to extract a direct stream from the hoster URL.
        target = pb.embed_url or pb.page_url
        stream: str | None = None
        try:
            stream = extract_stream_from_hoster(target, referer=referer)
        except HosterDead as e:
            # The hoster explicitly says "video deleted" — mark the source dead
            # and do NOT offer the embed fallback (mobile ExoPlayer would get a
            # 404 HTML page and try to save it as a .bin file; bug report
            # 2026-05-16 "streamtape downloads .bin files in bulk").
            pb.dead_at = datetime.now(UTC)
            pb.dead_reason = str(e)[:512]
            # Commit before raising so the dead flag is persisted.
            session.commit()
            log.info("marked movie playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, e)
            raise HTTPException(status_code=410, detail=f"playback dead: {e}") from e
        except Exception as e:
            # Best effort: any other extraction failure just leaves `stream` unset
            # and we continue with the embed fallback below.
            log.warning("movie hoster extract failed for %s: %s", target, e)
        if stream and _IP_BOUND_CDN_RE.search(stream):
            # IP-bound CDN (luluvid → cdn-tnmr.org, etc.) — the token is bound to
            # the VPS IP, so a direct fetch from mobile returns 403. Drop the
            # stream and fall back to embed_url (mobile WebView).
            log.info(
                "movie playback %s: stream URL IP-bound CDN — skip, WebView fallback",
                pb.id,
            )
            stream = None
        if stream:
            type_hint = "m3u8" if ".m3u8" in stream.lower() else "mp4"
            # Hosters whose CDN requires a Chrome JA3 fingerprint (mxcontent for
            # mixdrop): the proxy MUST use curl_cffi impersonation, otherwise 403.
            # `proxy_impersonate=True` travels through `raw`, where
            # `_proxify_link` reads it and passes it to the token.
            cdn_needs_impersonate = "mxcontent.net" in stream.lower()
            raw_meta: dict = {"origin": pb.origin, "host": target}
            if cdn_needs_impersonate:
                raw_meta["proxy_impersonate"] = True
                # Mixdrop: same-session cookies + Chrome JA3 are required for the
                # mp4. The backend extract already closed its session, so the
                # proxy has to re-fetch the embed page in a fresh curl_cffi
                # session and re-extract the mp4 with current cookies.
                raw_meta["refetch_url"] = target
                raw_meta["refetch_hoster"] = "mixdrop"
            links.append(
                StreamLink(
                    stream_url=stream,
                    embed_url=None,
                    quality=pb.quality,
                    type=type_hint,
                    raw=raw_meta,
                )
            )
        # Always add the embed as a fallback — the mobile WebView may be able to
        # pick the URL up from JS.
        if pb.embed_url:
            links.append(
                StreamLink(
                    stream_url=None,
                    embed_url=pb.embed_url,
                    quality=pb.quality,
                    type="hoster",
                    raw={"origin": pb.origin},
                )
            )
    if not links:
        raise HTTPException(status_code=502, detail="no playable links")
    # Links without a stream_url pass through `_proxify_link` unchanged.
    links = [_proxify_link(link, referer) for link in links]
    best = _pick_best(links) if links else None
    return ResolveOut(
        source=PlaybackSourceOut.model_validate(pb),
        best=best,
        links=links,
    )
def _requester_tag(request: Request) -> str:
"""Audit tag dla mark-dead: IP (X-Forwarded-For preferred dla nginx proxy)
+ skrócony User-Agent. Zapisywane w dead_reason + log dla post-mortem
gdyby leaked APK key był używany do masowego psucia danych."""
fwd = request.headers.get("x-forwarded-for", "")
ip = fwd.split(",")[0].strip() if fwd else (request.client.host if request.client else "?")
ua = (request.headers.get("user-agent") or "")[:40]
return f"ip={ip} ua={ua}"
@router.post(
    "/{scene_id}/playback/{playback_id}/mark-dead",
    status_code=status.HTTP_204_NO_CONTENT,
)
def mark_playback_dead(
    scene_id: uuid.UUID,
    playback_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Mark a scene playback source dead on user request (mobile long-press).

    Background (bug report 2026-05-12, dd17c709): the backend mark-dead flow
    only covered resolve failures (HosterDead/TubePageError). This endpoint
    lets the user flag links that look fine to the backend (200 OK) but are
    broken in practice, e.g. a wrongly matched scene, an ad redirect, or a
    hoster serving a placeholder.

    Audit: the requester's IP and User-Agent are written into `dead_reason`
    and the log, so a leaked APK key cannot silently destroy data in bulk
    without leaving a trail. A source that is already dead is left untouched.

    Raises:
        HTTPException 404: the source does not exist or belongs to another scene.
    """
    source = session.get(PlaybackSource, playback_id)
    belongs_to_scene = source is not None and source.scene_id == scene_id
    if not belongs_to_scene:
        raise HTTPException(status_code=404, detail="playback source not found for scene")
    if source.dead_at is not None:
        # Already dead: keep the original timestamp and reason.
        return
    audit_tag = _requester_tag(request)
    source.dead_at = datetime.now(UTC)
    reason = f"user-marked dead (mobile long-press) {audit_tag}"
    source.dead_reason = reason[:512]
    session.commit()
    log.info("user marked playback %s dead (origin=%s %s)", source.id, source.origin, audit_tag)
@movies_router.post(
    "/{movie_id}/playback/{playback_id}/mark-dead",
    status_code=status.HTTP_204_NO_CONTENT,
)
def mark_movie_playback_dead(
    movie_id: uuid.UUID,
    playback_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Mark a movie playback source dead on user request (long-press in MovieDetail).

    The requester's IP and User-Agent are stored in `dead_reason` and logged.
    A source that is already dead is left untouched.

    Raises:
        HTTPException 404: the source does not exist or belongs to another movie.
    """
    from app.models.movie_playback_source import MoviePlaybackSource

    source = session.get(MoviePlaybackSource, playback_id)
    belongs_to_movie = source is not None and source.movie_id == movie_id
    if not belongs_to_movie:
        raise HTTPException(status_code=404, detail="movie playback source not found")
    if source.dead_at is not None:
        # Already dead: keep the original timestamp and reason.
        return
    audit_tag = _requester_tag(request)
    source.dead_at = datetime.now(UTC)
    reason = f"user-marked dead (mobile long-press) {audit_tag}"
    source.dead_reason = reason[:512]
    session.commit()
    log.info(
        "user marked movie playback %s dead (origin=%s %s)", source.id, source.origin, audit_tag
    )
@router.post("/{scene_id}/playback/{playback_id}/resolve", response_model=ResolveOut)
def resolve_playback(
    scene_id: uuid.UUID,
    playback_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveOut:
    """Resolve a scene playback source into playable links.

    Runs the extractor selected by the source's origin (`tube:<sitetag>` or the
    legacy `pornapp:<sitetag>`) against `page_url`, then passes every resulting
    link through `_proxify_link`.

    Raises:
        HTTPException 404: the source does not exist or belongs to another scene.
        HTTPException 410: the source is already dead, the hoster reported the
            video as removed, or the tube page returned 404/410 (in the last
            two cases the source is marked dead first).
        HTTPException 501: the origin has no resolver.
        HTTPException 502: the tube page fetch failed with another HTTP status.
        HTTPException 503: the extractor returned nothing (treated as transient).
    """
    pb = session.get(PlaybackSource, playback_id)
    if pb is None or pb.scene_id != scene_id:
        raise HTTPException(status_code=404, detail="playback source not found for scene")
    if pb.dead_at is not None:
        raise HTTPException(
            status_code=410,
            detail=f"playback source marked dead: {pb.dead_reason or 'unknown'}",
        )
    page_url = pb.page_url
    sitetag: str | None = None
    if pb.origin.startswith("pornapp:"):
        # Legacy origin format from before the pornapp-removal migration. After
        # phase 2 only `tube:` will remain.
        sitetag = pb.origin.split(":", 1)[1]
    elif pb.origin.startswith("tube:"):
        sitetag = pb.origin.split(":", 1)[1]
    if sitetag is None:
        raise HTTPException(
            status_code=501,
            detail=f"resolve not implemented for origin '{pb.origin}'",
        )
    try:
        sources = try_extract(sitetag, page_url)
    except HosterDead as e:
        pb.dead_at = datetime.now(UTC)
        pb.dead_reason = str(e)[:512]
        # Commit before raising so the dead flag is persisted.
        session.commit()
        log.info("marked playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, e)
        raise HTTPException(status_code=410, detail=f"playback dead: {e}") from e
    except TubePageError as e:
        # Tube page is gone (404/410) — mark dead and propagate as 410. Any
        # other status becomes a 502.
        if e.status_code in (404, 410):
            reason = f"tube page {e.status_code} {pb.page_url}"
            pb.dead_at = datetime.now(UTC)
            pb.dead_reason = reason[:512]
            session.commit()
            log.info("marked playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, reason)
            raise HTTPException(status_code=410, detail=f"playback dead: {reason}") from e
        log.warning("tube fetch http error %s for %s", e.status_code, pb.page_url)
        raise HTTPException(
            status_code=502,
            detail=f"tube fetch failed: HTTP {e.status_code}",
        ) from e
    if not sources:
        # Extractor returned nothing — a TRANSIENT failure (network glitch, tube
        # briefly returning 503, changed ad-network response, race condition).
        # We do NOT set `dead_at`, because doing so used to produce
        # false-positive permanent deaths for freshporno scenes that worked on
        # the next attempt (bug report 2026-05-12).
        #
        # Permanent death comes ONLY from explicit signals:
        #   - HosterDead exception (hoster page says "video deleted")
        #   - TubePageError 404/410 (page does not exist)
        # Everything else is transient: mobile gets 503 and the user can retry.
        log.info(
            "extractor None for playback %s (origin=%s) — transient, not marking dead",
            pb.id, pb.origin,
        )
        # 503 (not 410!) so that mobile does NOT show "the tube removed this
        # video" — that code is reserved for permanent removal.
        # Sentry filters HTTPException 502/503/504 in `_sentry_before_send`
        # (main.py); without that, GOON-3 flooded the issue list (16 events/5h
        # for an expected case).
        raise HTTPException(
            status_code=503,
            detail="extraction failed temporarily — retry possible",
        )
    # Per-source referer: some extractors (yt-dlp, embed-iframe) return a stream
    # URL whose CDN expects the Referer of the embed page (the iframe host), not
    # of the original tube page. E.g. 0dayxx page → watchporn.to/embed iframe →
    # the stream URL wants `Referer: watchporn.to/` (with `Referer: 0dayxx.com`
    # the CDN returns 410). `StreamSource.referer` carries that information;
    # fall back to page_url when the extractor did not set it.
    proxified: list[StreamLink] = []
    for s in sources:
        link = _stream_source_to_link(s)
        proxified.append(_proxify_link(link, s.referer or page_url))
    links = proxified
    best = _pick_best(links) if links else None
    return ResolveOut(
        source=PlaybackSourceOut.model_validate(pb),
        best=best,
        links=links,
    )
# User-Agent that `_proxify_link` puts into a link's `headers`, for the mobile
# player to send when it fetches `direct_url` itself.
DEFAULT_PLAYER_UA = (
    "Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Mobile Safari/537.36"
)
def _proxify_link(link: StreamLink, referer: str) -> StreamLink:
    """Enrich a StreamLink with proxy/direct URLs and fetch headers.

    The returned link carries:
    - `stream_url`: the proxy URL (fallback when the direct fetch fails)
    - `direct_url`: the raw CDN URL (preferred — zero VPS bandwidth), or the
      proxy URL again when a direct fetch is known not to work
    - `headers`: Referer + User-Agent for the direct fetch

    The mobile player tries direct FIRST and falls back to `stream_url` after a
    403/410 error. Links without a `stream_url` (embed-only) are returned
    unchanged.

    Args:
        link: link produced by an extractor; flags are read from `link.raw`.
        referer: Referer baked into the proxy token and the returned headers.
    """
    if not link.stream_url:
        return link
    from app.api.stream_proxy import make_token

    raw_url = link.stream_url
    # Extractor flags in `raw`:
    #   - proxy_impersonate: curl_cffi Chrome JA3 (mxcontent etc.)
    #   - refetch_url: embed URL to re-extract from when the token has expired
    #     (same-session cookie binding for mixdrop). Without it: mp4 token +
    #     missing cookies → 403.
    use_impersonate = bool(link.raw and link.raw.get("proxy_impersonate"))
    # force_proxy=True (extractor flag) → direct_url is the proxied URL right
    # away. For CDNs where the token IS bound to the VPS IP (porn00 v-acctoken,
    # pornxp sv.porn-xp.com signed path) a direct fetch from mobile ALWAYS gets
    # 403, so there is no point trying. Without this every playback would
    # "blink" (direct fail → fallback to proxy).
    force_proxy = bool(link.raw and link.raw.get("force_proxy"))
    # mobile_direct_ok=True (extractor flag) → the m3u8 may go direct to mobile
    # because the CDN URL has a time-bound (not IP-bound) signed token. Mobile
    # ExoPlayer fetches manifest + segments straight from the CDN, zero VPS
    # bandwidth.
    mobile_direct_ok = bool(link.raw and link.raw.get("mobile_direct_ok"))
    # Auto-detect time-bound CDNs by domain, so extractors need not set the flag.
    # Critical for the public release: all mainstream tubes (xvideos/xnxx/
    # pornhub/youporn/redtube + pornhat) return time-bound URLs that work
    # cross-IP.
    if not mobile_direct_ok and raw_url and _TIME_BOUND_CDN_RE.search(raw_url):
        mobile_direct_ok = True
    refetch_url = (link.raw or {}).get("refetch_url")
    refetch_hoster = (link.raw or {}).get("refetch_hoster")
    token = make_token(
        raw_url, referer, impersonate=use_impersonate,
        refresh=refetch_url, refresh_hoster=refetch_hoster,
    )
    # The extension is decided from link.type (trusting the extractor), with a
    # path hint as fallback. Pornhat: the raw URL is
    # `.../get_file/.../<id>.mp4/` but the CDN 302-redirects to an HLS manifest.
    # The extractor marks type='m3u8' so ExoPlayer uses HlsMediaSource (without
    # it the `.mp4` path confused the player → "no extractors").
    type_lower = (link.type or "").lower()
    if type_lower in {"m3u8", "hls", "mpd"}:
        ext = "m3u8" if type_lower in {"m3u8", "hls"} else "mpd"
    elif ".m3u8" in raw_url.lower():
        ext = "m3u8"
    elif ".mpd" in raw_url.lower():
        ext = "mpd"
    else:
        ext = "mp4"
    proxied = f"/proxy/{token}/play.{ext}"
    # `direct_url`: the raw CDN URL — mobile tries it FIRST (zero VPS bandwidth).
    # BUT: for type=m3u8/hls/mpd the manifest URL must be rewritten so segments
    # and keys also go through the proxy (another IP may hit rate-limit/token
    # issues too), and ExoPlayer picks its extractor by URL extension — `.mp4`
    # in a pornhat direct URL → Mp4Extractor → failure, because the content is
    # HLS. For m3u8/mpd we therefore return the proxied URL AS direct (mobile
    # uses it as-is; one hop through the VPS, but it is the only way to keep
    # manifest + segments consistent and make ExoPlayer choose HlsMediaSource).
    # For CDNs that require Chrome JA3 (mxcontent) direct_url also always goes
    # through the proxy — otherwise the mobile direct fetch with the OkHttp JA3
    # gets 403 → proxy fallback → extra round-trip + ExoPlayer "no extractors"
    # before the retry.
    # mobile_direct_ok overrides the m3u8 default-to-proxy: when the CDN token is
    # time-bound (not IP-bound), mobile ExoPlayer can fetch the manifest
    # directly without the VPS proxy.
    # NOTE(review): this check looks only at link.type; a link whose `ext` came
    # from the URL path hint above is not treated as a manifest here — confirm
    # that this is intended.
    is_manifest_type = type_lower in {"m3u8", "hls", "mpd"}
    if use_impersonate or force_proxy or (is_manifest_type and not mobile_direct_ok):
        direct_for_mobile = proxied
    else:
        direct_for_mobile = raw_url
    return StreamLink(
        stream_url=proxied,
        embed_url=link.embed_url,
        direct_url=direct_for_mobile,
        headers={"Referer": referer, "User-Agent": DEFAULT_PLAYER_UA},
        quality=link.quality,
        type=link.type,
        raw=link.raw,
    )
def _stream_source_to_link(s: StreamSource) -> StreamLink:
    """Convert an extractor's StreamSource into the API's StreamLink.

    A source of type "hoster" (case-insensitive) becomes an embed-only link
    that mobile opens in a WebView. Any other type (mp4/m3u8/mpd) becomes a
    stream link that mobile plays in the native player via /proxy.
    """
    kind = (s.type or "").lower()
    if kind == "hoster":
        stream, embed = None, s.link
    else:
        stream, embed = s.link, None
    return StreamLink(
        stream_url=stream,
        embed_url=embed,
        quality=s.quality,
        type=s.type,
        raw=s.raw,
    )
def _pick_best(links: list[StreamLink]) -> StreamLink | None:
    """Pick the preferred link from `links`, or None when nothing is usable.

    Preference order:
    1. Links with a `stream_url` (direct video) win; embed-only links are
       considered only when no link has one (mobile then shows "Open in browser").
    2. Higher quality, parsed to an int from '720p' / '1080p' / '4k'.
    3. At equal quality, mp4 beats m3u8 (mp4 is easier for MX Player).

    Ties go to the earliest link in `links`.
    """
    candidates = [item for item in links if item.stream_url]
    if not candidates:
        candidates = [item for item in links if item.embed_url]
    if not candidates:
        return None

    def rank(candidate: StreamLink) -> tuple[int, int]:
        location = (candidate.stream_url or candidate.embed_url or "").lower()
        kind = (candidate.type or "").lower()
        mp4_like = ".mp4" in location or "mp4" in kind or "direct" in kind
        return (_quality_to_int(candidate.quality), int(mp4_like))

    return max(candidates, key=rank)
_QUALITY_DIGITS_RE = re.compile(r"\d+")
def _quality_to_int(q: str | None) -> int:
"""Wyciąga liczbę pikseli z różnych formatów: '720p', '1080p Full HD', '4K', 'HD'."""
if not q:
return 0
s = q.lower().strip()
if "4k" in s or "uhd" in s:
return 2160
if "2k" in s or "qhd" in s:
return 1440
m = _QUALITY_DIGITS_RE.search(s)
if m:
return int(m.group(0))
if "fhd" in s:
return 1080
if "hd" in s:
return 720
if "sd" in s:
return 480
return 0

View file

@ -0,0 +1,83 @@
"""Scene favorites — ulubione sceny (single-user, równolegle do /favorites/performers).
Endpointy:
GET /scene-favorites lista ulubionych scen (pełen SceneOut)
POST /scene-favorites/{scene_id} dodaj (idempotent)
DELETE /scene-favorites/{scene_id} usuń
"""
from __future__ import annotations
import uuid
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.api.scenes import _build_scene_out
from app.api.schemas import SceneOut
from app.auth import require_api_key
from app.db import get_session
from app.models.favorite_scene import FavoriteScene
from app.models.scene import Scene
# Router for the scene-favorites endpoints mounted under /scene-favorites.
# Every route registered on it runs the `require_api_key` dependency first.
router = APIRouter(
    prefix="/scene-favorites",
    tags=["scene-favorites"],
    dependencies=[Depends(require_api_key)],
)
class SceneFavoriteListOut(BaseModel):
    """Response body of `GET /scene-favorites`."""

    # Favorited scenes as full SceneOut payloads.
    items: list[SceneOut]
    # Number of entries in `items` (the endpoint does not paginate).
    total: int
class SceneFavoriteToggleOut(BaseModel):
    """Response body of `POST /scene-favorites/{scene_id}`."""

    # Scene the request referred to.
    scene_id: uuid.UUID
    # Favorite state after the request.
    favorited: bool
@router.get("", response_model=SceneFavoriteListOut)
def list_scene_favorites(
    session: Annotated[Session, Depends(get_session)],
) -> SceneFavoriteListOut:
    """Return all favorited scenes, most recently favorited first.

    Each scene is expanded to a full SceneOut through `_build_scene_out`.
    """
    statement = (
        select(Scene, FavoriteScene)
        .join(FavoriteScene, FavoriteScene.scene_id == Scene.id)
        .order_by(FavoriteScene.created_at.desc())
    )
    # Fetch every row up front; `_build_scene_out` runs further queries on the
    # same session.
    favorite_rows = session.execute(statement).all()
    items = []
    for scene, _favorite in favorite_rows:
        items.append(_build_scene_out(session, scene))
    return SceneFavoriteListOut(items=items, total=len(items))
@router.post(
    "/{scene_id}",
    response_model=SceneFavoriteToggleOut,
    status_code=status.HTTP_201_CREATED,
)
def add_scene_favorite(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneFavoriteToggleOut:
    """Add a scene to favorites (idempotent).

    Favoriting an already favorited scene adds nothing and returns the same
    response.

    Raises:
        HTTPException 404: the scene does not exist.
    """
    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")
    already_favorited = session.get(FavoriteScene, scene_id) is not None
    if not already_favorited:
        # NOTE(review): there is no explicit commit here — presumably
        # `get_session` commits on success; confirm.
        session.add(FavoriteScene(scene_id=scene_id))
    return SceneFavoriteToggleOut(scene_id=scene_id, favorited=True)
@router.delete("/{scene_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_scene_favorite(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a scene from favorites; a no-op when it was not favorited."""
    favorite = session.get(FavoriteScene, scene_id)
    if favorite is None:
        return
    # NOTE(review): there is no explicit commit here — presumably `get_session`
    # commits on success; confirm.
    session.delete(favorite)

960
app/api/scenes.py Normal file
View file

@ -0,0 +1,960 @@
"""GET /scenes — lista i szczegóły scen z bazy kanonicznej."""
from __future__ import annotations
import logging
import re
import uuid
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel
from sqlalchemy import distinct, exists, func, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.api.schemas import (
ExternalRefOut,
PerformerOut,
PlaybackSourceOut,
SceneListOut,
SceneOut,
StudioOut,
TagOut,
)
from app.db import get_session
from app.models.favorite_scene import FavoriteScene
from app.models.performer import Performer
from app.models.play_progress import ScenePlayProgress
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, SceneExternalRef, ScenePerformer, SceneTag
from app.models.source import Source, SourceKind
from app.models.studio import Studio
from app.models.tag import Tag
# Module logger.
log = logging.getLogger(__name__)
# Router for the /scenes endpoints; every route registered on it runs the
# `require_api_key` dependency first.
router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(require_api_key)])
# Values accepted by the `sort` query parameter of `list_scenes`.
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}
def _split_csv(raw: str | None) -> list[str]:
if not raw:
return []
return [s.strip() for s in raw.split(",") if s.strip()]
@router.get("", response_model=SceneListOut)
def list_scenes(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Wyszukiwanie po title_normalized (trgm)"),
    studio_slug: str | None = Query(default=None, description="DEPRECATED — użyj studio_slugs"),
    studio_slugs: str | None = Query(
        default=None, description="Comma-separated studio slugs (OR)"
    ),
    tags: str | None = Query(
        default=None,
        description="Comma-separated tag slugs (AND — scena musi mieć wszystkie wybrane tagi)",
    ),
    performer_ids: str | None = Query(
        default=None,
        description="Comma-separated performer UUIDs (AND — scena musi mieć wszystkich wybranych performerów)",
    ),
    has_playback: bool | None = Query(
        default=None, description="True: tylko sceny z ≥1 playback_source"
    ),
    has_animated_thumbnail: bool | None = Query(
        default=None,
        description="True: tylko sceny z ≥1 playback_source z animated_thumbnail_url (hold-to-preview)",
    ),
    min_duration_sec: int | None = Query(default=None, ge=0),
    max_duration_sec: int | None = Query(default=None, ge=0),
    released_within_days: int | None = Query(
        default=None, ge=1,
        description="Tylko sceny released w ostatnich N dniach",
    ),
    min_quality_p: int | None = Query(
        default=None, ge=1,
        description=(
            "Minimum quality (pixele wysokości — 2160 = 4K, 1080 = FullHD). Filtruje "
            "po PlaybackSource.quality (string typu '720p' / '1080p Full HD')."
        ),
    ),
    include_stubs: bool = Query(
        default=False,
        description=(
            "False (default): ukrywa sceny-szkielety bez release_date, < 10min, "
            "z jedynym playback z hqporner (~7-min Brazzers trailer clipy zalewają katalog)."
        ),
    ),
    sort: str = Query(default="created_at", description="created_at|release_date|title|studio"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> SceneListOut:
    """List scenes with optional filters, sorting and pagination.

    Filters combine with AND. Multiple tags and multiple performers are also
    AND-ed (a scene must have all of them); multiple studio slugs are OR-ed.
    Scenes touching a blacklisted performer, studio or tag are always excluded.

    `total` is exact whenever a narrowing filter is active. For the plain
    default listing (no filter, stubs hidden) it is a cheap approximation: the
    number of scenes with at least one live playback source.

    Raises:
        HTTPException 400: unknown `sort` value or malformed performer UUID.
    """
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")
    base = select(Scene)
    if q:
        base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))
    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug:
        studio_slug_list.append(studio_slug)
    if studio_slug_list:
        base = base.where(
            Scene.studio_id.in_(
                select(Studio.id).where(Studio.slug.in_(studio_slug_list))
            )
        )
    tag_slug_list = _split_csv(tags)
    # AND between tags: the scene must have ALL selected tags. Each slug gets its
    # own exists() — ticking more filters narrows the results, as users expect.
    for slug in tag_slug_list:
        base = base.where(
            exists(
                select(1)
                .select_from(SceneTag)
                .join(Tag, Tag.id == SceneTag.tag_id)
                .where(SceneTag.scene_id == Scene.id, Tag.slug == slug)
            )
        )
    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # AND between performers (same as for tags).
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1)
                    .select_from(ScenePerformer)
                    .where(
                        ScenePerformer.scene_id == Scene.id,
                        ScenePerformer.performer_id == pid,
                    )
                )
            )
    if has_playback is True:
        # Only scenes with at least one LIVE playback_source.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    elif has_playback is False:
        base = base.where(
            ~exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    # Blacklists — global exclusions. A scene is dropped when it has ANY
    # blacklisted performer, belongs to a blacklisted studio, or has ANY
    # blacklisted tag.
    from app.models.blacklist import (
        BlacklistedPerformer,
        BlacklistedStudio,
        BlacklistedTag,
    )

    base = base.where(
        ~exists(
            select(1)
            .select_from(ScenePerformer)
            .join(BlacklistedPerformer, BlacklistedPerformer.performer_id == ScenePerformer.performer_id)
            .where(ScenePerformer.scene_id == Scene.id)
        )
    )
    base = base.where(
        ~Scene.studio_id.in_(select(BlacklistedStudio.studio_id))
    )
    base = base.where(
        ~exists(
            select(1)
            .select_from(SceneTag)
            .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
            .where(SceneTag.scene_id == Scene.id)
        )
    )
    if has_animated_thumbnail:
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.animated_thumbnail_url.isnot(None),
                )
            )
        )
    if min_duration_sec is not None:
        base = base.where(Scene.duration_sec >= min_duration_sec)
    if max_duration_sec is not None:
        base = base.where(Scene.duration_sec <= max_duration_sec)
    if released_within_days is not None:
        from datetime import date, timedelta

        cutoff = date.today() - timedelta(days=released_within_days)
        base = base.where(Scene.release_date >= cutoff)
    if min_quality_p is not None:
        # PlaybackSource.quality is a free-form string — we take the first run
        # of digits ('1080p', '1080p Full HD', '2160p'). Heuristic: one live
        # playback with a numeric quality >= min is enough. '4K'/'UHD' are
        # aliased to 2160.
        from sqlalchemy import Integer, cast, or_

        numeric_q = cast(
            func.coalesce(func.substring(PlaybackSource.quality, r"\d+"), "0"),
            Integer,
        )
        conds = [numeric_q >= min_quality_p]
        if min_quality_p <= 2160:
            conds.append(PlaybackSource.quality.ilike("%4k%"))
            conds.append(PlaybackSource.quality.ilike("%uhd%"))
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.quality.isnot(None),
                    or_(*conds),
                )
            )
        )
    if not include_stubs:
        # Stub heuristic: a tube-only scene with NO release_date AND NO canonical
        # (TPDB/StashDB) ref AND NO ScenePerformer link. The continuous worker
        # adds ScenePerformer (search-by-name forces the link), so a
        # per-performer search result is NEVER a stub. This only filters
        # anonymous tube-only scenes from the newUrl/categories ingest that
        # were never linked to a performer.
        canonical_exists = exists(
            select(1)
            .select_from(SceneExternalRef)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id == Scene.id)
            .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
        )
        has_performer = exists(
            select(1).where(ScenePerformer.scene_id == Scene.id)
        )
        # NOT a stub when: has a canonical ref OR a release_date OR a performer.
        base = base.where(
            Scene.release_date.is_not(None) | canonical_exists | has_performer
        )
    # Count: on large databases (~400k scenes) the full count with nested EXISTS
    # takes ~5s, so the plain default listing uses a simplified count instead
    # (live playback only, no stub/blacklist filters). That number is an
    # approximate upper bound for the pagination header; the items query below
    # still applies every filter.
    #
    # The shortcut is valid ONLY when no narrowing filter is active. Previously
    # it also fired with has_playback=False or the duration / release-window /
    # quality / animated-thumbnail filters set, returning a `total` unrelated to
    # the filtered list.
    narrowing_filter_active = bool(
        q
        or studio_slug_list
        or tag_slug_list
        or perf_id_strings
        or has_playback is False
        or has_animated_thumbnail
        or min_duration_sec is not None
        or max_duration_sec is not None
        or released_within_days is not None
        or min_quality_p is not None
    )
    if not include_stubs and not narrowing_filter_active:
        # Fast path: a single EXISTS on live playback sources.
        count_query = select(func.count()).select_from(
            select(Scene.id).where(
                exists(
                    select(1).where(
                        PlaybackSource.scene_id == Scene.id,
                        PlaybackSource.dead_at.is_(None),
                    )
                )
            ).subquery()
        )
        total = session.execute(count_query).scalar_one()
    else:
        total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()
    # Sort: always tie-break so pagination is deterministic.
    if sort == "release_date":
        ordered = base.order_by(
            Scene.release_date.desc().nullslast(), Scene.created_at.desc()
        )
    elif sort == "title":
        ordered = base.order_by(Scene.title_normalized.asc(), Scene.created_at.desc())
    elif sort == "studio":
        # Scenes without a studio go last; within a studio, newest first.
        ordered = (
            base.outerjoin(Studio, Studio.id == Scene.studio_id)
            .order_by(
                Studio.name_normalized.asc().nullslast(),
                Scene.release_date.desc().nullslast(),
                Scene.created_at.desc(),
            )
        )
    else:  # created_at
        ordered = base.order_by(
            Scene.created_at.desc(), Scene.release_date.desc().nullslast()
        )
    rows = (
        session.execute(ordered.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )
    items = _build_scenes_out_batch(session, list(rows))
    return SceneListOut(items=items, total=total, page=page, per_page=per_page)
@router.get("/{scene_id}", response_model=SceneOut)
def get_scene(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneOut:
    """Return one scene with all of its relations.

    Raises:
        HTTPException 404: no scene has this id.
    """
    found = session.get(Scene, scene_id)
    if found is not None:
        return _build_scene_out(session, found)
    raise HTTPException(status_code=404, detail="scene not found")
def _needs_proxy(url: str) -> bool:
"""Wszystkie thumbnaile z playback_sources są proxowane przez backend.
Większość CDN-ów porn-tube'ów wymaga Refera (hqporner, mypornerleak/58img,
inne sxyprn/eporner CDN-y) expo-image nie wysyła Referera.
Self-hosted lub backend-internal URL-e (zaczynające się od `/`) skipujemy."""
return url.startswith("http") and not url.startswith("/proxy/")
def _wrap_image_proxy(url: str, referer: str) -> str:
"""Wraps a thumbnail URL through /proxy/img/{token}/img.jpg. Klient nie musi
znać sekretu Referer backend wstawi sam. Long TTL (30d) bo thumby
stabilne, krótkie ttl by tylko niepotrzebnie zaśmiecało cache."""
from app.api.stream_proxy import make_token
token = make_token(url, referer, ttl_sec=30 * 24 * 3600)
# Path zachowuje rozszerzenie żeby HTTP Content-Type był rozpoznany.
import os as _os
ext = _os.path.splitext(url.split("?")[0])[1].lstrip(".") or "jpg"
return f"/proxy/img/{token}/img.{ext}"
def _build_scenes_out_batch(session: Session, scenes: list[Scene]) -> list[SceneOut]:
    """Batch-load all relations for N scenes in 7 queries (instead of 7×N).

    Removes the N+1 pattern of `_build_scene_out` from scene lists —
    `/scenes?per_page=24` went from ~9.6s to <500ms. A single scene
    (`/scenes/{id}`) still uses `_build_scene_out`, because the batching
    overhead makes no sense for N=1.

    Args:
        session: database session used for all lookups.
        scenes: scenes to serialize; the output keeps their order.

    Returns:
        One SceneOut per input scene (an empty list for empty input).
    """
    from collections import defaultdict

    if not scenes:
        return []
    scene_ids = [s.id for s in scenes]
    # De-duplicated studio ids; scenes without a studio are skipped.
    studio_ids = list({s.studio_id for s in scenes if s.studio_id is not None})
    # 1) Studios
    studios_by_id: dict = {}
    if studio_ids:
        for st in session.execute(
            select(Studio).where(Studio.id.in_(studio_ids))
        ).scalars():
            studios_by_id[st.id] = st
    # 2) Performers, ordered by their position (NULL positions last)
    perf_rows = session.execute(
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id.in_(scene_ids))
        .order_by(ScenePerformer.position.asc().nullslast())
    ).all()
    performers_by_scene: dict = defaultdict(list)
    for sp, p in perf_rows:
        performers_by_scene[sp.scene_id].append(
            PerformerOut(
                id=p.id,
                canonical_name=p.canonical_name,
                slug=p.slug,
                gender=p.gender.value if p.gender else None,
                as_alias=sp.as_alias,
            )
        )
    # 3) Tags
    tag_rows = session.execute(
        select(SceneTag.scene_id, Tag)
        .join(Tag, Tag.id == SceneTag.tag_id)
        .where(SceneTag.scene_id.in_(scene_ids))
    ).all()
    tags_by_scene: dict = defaultdict(list)
    for sid, t in tag_rows:
        tags_by_scene[sid].append(TagOut.model_validate(t))
    # 4) External refs + sources
    ref_rows = session.execute(
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id.in_(scene_ids))
    ).all()
    refs_by_scene: dict = defaultdict(list)
    for ref, src in ref_rows:
        refs_by_scene[ref.scene_id].append(
            ExternalRefOut(
                source=src.name,
                external_id=ref.external_id,
                url=ref.url,
                last_seen=ref.last_seen,
            )
        )
    # 5) Playback sources (live ones only; dead links are hidden)
    pb_rows = session.execute(
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id.in_(scene_ids),
            PlaybackSource.dead_at.is_(None),
        )
        .order_by(PlaybackSource.origin.asc())
    ).scalars().all()
    pb_by_scene: dict = defaultdict(list)
    for p in pb_rows:
        out = PlaybackSourceOut.model_validate(p)
        # Route thumbnails through the backend image proxy, using the source's
        # page_url as the Referer.
        if out.thumbnail_url and _needs_proxy(out.thumbnail_url):
            out.thumbnail_url = _wrap_image_proxy(out.thumbnail_url, p.page_url)
        if out.animated_thumbnail_url and _needs_proxy(out.animated_thumbnail_url):
            out.animated_thumbnail_url = _wrap_image_proxy(out.animated_thumbnail_url, p.page_url)
        pb_by_scene[p.scene_id].append(out)
    # 6) Progress
    progress_by_scene: dict = {}
    for prog in session.execute(
        select(ScenePlayProgress).where(ScenePlayProgress.scene_id.in_(scene_ids))
    ).scalars():
        progress_by_scene[prog.scene_id] = prog
    # 7) Favorites
    fav_scene_ids: set = set(
        session.execute(
            select(FavoriteScene.scene_id).where(
                FavoriteScene.scene_id.in_(scene_ids)
            )
        ).scalars()
    )
    # Assemble the output in input order. Note: `out` is rebound here from the
    # per-playback model used above to the result list.
    out: list[SceneOut] = []
    for scene in scenes:
        studio_out = None
        if scene.studio_id is not None and scene.studio_id in studios_by_id:
            studio_out = StudioOut.model_validate(studios_by_id[scene.studio_id])
        progress = progress_by_scene.get(scene.id)
        out.append(
            SceneOut(
                id=scene.id,
                title=scene.title,
                slug=scene.slug,
                release_date=scene.release_date,
                duration_sec=scene.duration_sec,
                description=scene.description,
                code=scene.code,
                director=scene.director,
                studio=studio_out,
                performers=performers_by_scene.get(scene.id, []),
                tags=tags_by_scene.get(scene.id, []),
                external_refs=refs_by_scene.get(scene.id, []),
                playback_sources=pb_by_scene.get(scene.id, []),
                created_at=scene.created_at,
                # Scenes without a progress row report "never played".
                last_played_at=progress.last_played_at if progress else None,
                finished=progress.finished if progress else False,
                position_sec=progress.position_sec if progress else 0,
                is_favorite=scene.id in fav_scene_ids,
            )
        )
    return out
def _build_scene_out(session: Session, scene: Scene) -> SceneOut:
    """Assemble the full ``SceneOut`` API payload for a single scene.

    Issues one query per relation (studio, performers, tags, external refs,
    playback sources, play progress, favorite flag) and combines the results.

    Args:
        session: Open SQLAlchemy session used for every lookup.
        scene: The already-loaded scene row to serialize.

    Returns:
        A ``SceneOut`` with all nested collections populated. Playback sources
        whose ``dead_at`` is set are left out.
    """
    # Studio is optional: the FK may be NULL or point at a missing row.
    studio_out: StudioOut | None = None
    if scene.studio_id is not None:
        studio_row = session.get(Studio, scene.studio_id)
        studio_out = (
            StudioOut.model_validate(studio_row) if studio_row is not None else None
        )

    # Performers, ordered by their position in the scene (NULL positions last).
    performer_query = (
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id == scene.id)
        .order_by(ScenePerformer.position.asc().nullslast())
    )
    performers_out: list[PerformerOut] = [
        PerformerOut(
            id=person.id,
            canonical_name=person.canonical_name,
            slug=person.slug,
            gender=person.gender.value if person.gender else None,
            as_alias=link.as_alias,
        )
        for link, person in session.execute(performer_query).all()
    ]

    # Tags attached through the scene_tags association.
    tag_query = (
        select(Tag)
        .join(SceneTag, SceneTag.tag_id == Tag.id)
        .where(SceneTag.scene_id == scene.id)
    )
    tags_out = [
        TagOut.model_validate(tag_row)
        for tag_row in session.execute(tag_query).scalars().all()
    ]

    # External references together with the name of the source they came from.
    ref_query = (
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id == scene.id)
    )
    refs_out = []
    for ext_ref, origin_source in session.execute(ref_query).all():
        refs_out.append(
            ExternalRefOut(
                source=origin_source.name,
                external_id=ext_ref.external_id,
                url=ext_ref.url,
                last_seen=ext_ref.last_seen,
            )
        )

    # Playback sources: hide dead links, stable ordering by origin.
    playback_query = (
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id == scene.id,
            PlaybackSource.dead_at.is_(None),
        )
        .order_by(PlaybackSource.origin.asc())
    )
    playback_out: list[PlaybackSourceOut] = []
    for source_row in session.execute(playback_query).scalars().all():
        item = PlaybackSourceOut.model_validate(source_row)
        # Route thumbnail URLs through the backend image proxy when the CDN
        # requires a Referer (hqporner: fastporndelivery answers 403 without a
        # Referer header and expo-image does not send one by default). The
        # token has a 30-day TTL because thumbnails are stable.
        for field_name in ("thumbnail_url", "animated_thumbnail_url"):
            current = getattr(item, field_name)
            if current and _needs_proxy(current):
                setattr(
                    item,
                    field_name,
                    _wrap_image_proxy(current, source_row.page_url),
                )
        playback_out.append(item)

    # Per-scene watch state and favorite flag (both keyed by the scene id).
    progress = session.get(ScenePlayProgress, scene.id)
    favorite_row = session.get(FavoriteScene, scene.id)
    if progress:
        played_at = progress.last_played_at
        is_finished = progress.finished
        resume_at = progress.position_sec
    else:
        played_at, is_finished, resume_at = None, False, 0

    return SceneOut(
        id=scene.id,
        title=scene.title,
        slug=scene.slug,
        release_date=scene.release_date,
        duration_sec=scene.duration_sec,
        description=scene.description,
        code=scene.code,
        director=scene.director,
        studio=studio_out,
        performers=performers_out,
        tags=tags_out,
        external_refs=refs_out,
        playback_sources=playback_out,
        created_at=scene.created_at,
        last_played_at=played_at,
        finished=is_finished,
        position_sec=resume_at,
        is_favorite=favorite_row is not None,
    )
@router.delete("/{scene_id}/tags/{tag_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_tag_from_scene(
    scene_id: uuid.UUID,
    tag_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene<->tag relation (e.g. the user decided the tag is wrong
    for this scene).

    Idempotent: a missing relation still counts as success. The ``Tag`` row
    itself is never deleted because other scenes may use it; it stays in the
    tag dictionary.
    """
    link_query = select(SceneTag).where(
        SceneTag.scene_id == scene_id, SceneTag.tag_id == tag_id
    )
    link = session.execute(link_query).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
@router.delete(
    "/{scene_id}/performers/{performer_id}", status_code=status.HTTP_204_NO_CONTENT
)
def remove_performer_from_scene(
    scene_id: uuid.UUID,
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene<->performer relation (a false-match in dedup attached
    the wrong person).

    Idempotent. The ``Performer`` row itself is kept. Useful e.g. when a fuzzy
    alias match on "Bella" pulled Anna Bella's scenes under Bad Bella, or when
    Miss Teela on xnxx was assigned to scenes she does not appear in (reports
    from 2026-05-10).
    """
    from app.models.scene import ScenePerformer

    link_query = select(ScenePerformer).where(
        ScenePerformer.scene_id == scene_id,
        ScenePerformer.performer_id == performer_id,
    )
    link = session.execute(link_query).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
# Response body of POST /{scene_id}/enrich-tags.
class EnrichTagsOut(BaseModel):
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Number of scene_tags rows actually inserted by this call.
    added: int
    # Site tag of the tube source that was scraped; None when no usable source existed.
    tube_used: str | None
    # Tag names scraped from the page (empty when nothing was fetched or extracted).
    tags: list[str]
@router.post("/{scene_id}/enrich-tags", response_model=EnrichTagsOut)
def enrich_tags_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichTagsOut:
    """Fetch the page HTML of one of the scene's tube playback sources, scrape
    its tags (categories/tags) and add the missing ones to scene_tags.

    Mobile calls this when opening SceneDetail if the scene has 0 tags AND has
    a tube source with a supported extractor (porntrex/youporn/xvideos/xnxx/
    redtube/xhamster/eporner).

    Idempotent: calling again with the same tags changes nothing (UNIQUE PK on
    scene_tags). The concrete tube source is picked from a priority list
    (mainstream sites are more reliable than aggregators).

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS, extract_tags
    from app.models.playback_source import PlaybackSource
    from app.models.tag import Tag
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag

    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Priority: mainstream tubes (rich metadata) > niche (fewer tags or garbage).
    site_priority = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
                     "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
    live_sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    ).scalars().all()

    # First source that matches the priority list, in priority order.
    # NOTE(review): this pick does not check EXTRACTORS, only the fallback
    # below does - confirm every priority site has an extractor.
    chosen: PlaybackSource | None = next(
        (
            candidate
            for site in site_priority
            for candidate in live_sources
            if candidate.origin == f"tube:{site}"
        ),
        None,
    )
    if chosen is None:
        # Fallback: any tube source that has an extractor.
        chosen = next(
            (
                candidate
                for candidate in live_sources
                if candidate.origin.startswith("tube:")
                and candidate.origin.split(":", 1)[1] in EXTRACTORS
            ),
            None,
        )
    if chosen is None:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=None, tags=[])

    sitetag = chosen.origin.split(":", 1)[1]
    try:
        page = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        page.raise_for_status()
    except (TubePageError, Exception) as exc:
        log.warning("enrich-tags fetch failed for %s: %s", chosen.page_url, exc)
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    tag_names = extract_tags(sitetag, page.text)
    if not tag_names:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    # Upsert: for every tag create/find the Tag row and attach a SceneTag
    # idempotently. PostgreSQL INSERT ... ON CONFLICT DO NOTHING is used instead
    # of ORM session.add() because `resolve_tag` calls session.flush() inside
    # the loop, emitting pending SceneTag INSERTs from earlier iterations. When
    # two concurrent enrich-tags calls collide on the same (scene_id, tag_id),
    # the second flush got a UniqueViolation (GOON-H, 4 events in 10h despite
    # the earlier seen_tag_ids fix). ON CONFLICT skips those silently.
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    inserted = 0
    handled_tag_ids: set = set()
    for raw_name in tag_names:
        normalized = NormalizedTag(
            name=raw_name, slug=slugify(raw_name), external_id=None
        )
        resolved = resolve_tag(session, norm=normalized)
        if resolved is None or resolved.id in handled_tag_ids:
            continue
        handled_tag_ids.add(resolved.id)
        outcome = session.execute(
            pg_insert(SceneTag.__table__)
            .values(scene_id=scene_id, tag_id=resolved.id, source_id=None)
            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
        )
        # rowcount == 1 when the row was really inserted, 0 when ON CONFLICT skipped it.
        if outcome.rowcount and outcome.rowcount > 0:
            inserted += 1
    session.commit()
    return EnrichTagsOut(
        scene_id=scene_id, added=inserted, tube_used=sitetag, tags=tag_names
    )
# Response body of POST /{scene_id}/enrich-duration.
class EnrichDurationOut(BaseModel):
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Duration now stored on the scene; None when no tube page yielded one.
    duration_sec: int | None
    # Site tag of the tube page the duration was read from; None when nothing
    # was scraped (including the case where the duration was already set).
    tube_used: str | None
@router.post("/{scene_id}/enrich-duration", response_model=EnrichDurationOut)
def enrich_duration_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichDurationOut:
    """Extract the duration from any tube playback source of the scene. All
    known tubes expose it on the detail page (og:video:duration or LD-JSON
    ISO 8601).

    Mobile calls this when opening SceneDetail if scene.duration_sec is null
    AND the scene has a tube source. For dedup, duration is the strongest
    single signal; without it, scenes with a weak title-only score are capped
    at 0.85 (review queue).

    Idempotent: returns the current duration_sec when it is already set.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.duration_extract import extract_duration_sec
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")
    if scene.duration_sec is not None:
        return EnrichDurationOut(
            scene_id=scene_id, duration_sec=scene.duration_sec, tube_used=None
        )

    tube_sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin.like("tube:%"),
        )
    ).scalars().all()

    # Try sources one by one until a page yields a positive duration.
    for candidate in tube_sources:
        try:
            page = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            page.raise_for_status()
        except (TubePageError, Exception) as exc:
            log.debug("enrich-duration fetch failed for %s: %s", candidate.page_url, exc)
            continue
        seconds = extract_duration_sec(page.text)
        if seconds is None or not seconds > 0:
            continue
        scene.duration_sec = seconds
        # Also store it on the playback source for parity (useful if
        # per-source duration mismatch detection is added later).
        if candidate.duration_sec is None:
            candidate.duration_sec = seconds
        session.commit()
        _prefix, separator, site = candidate.origin.partition(":")
        return EnrichDurationOut(
            scene_id=scene_id,
            duration_sec=seconds,
            tube_used=site if separator else None,
        )
    return EnrichDurationOut(scene_id=scene_id, duration_sec=None, tube_used=None)
# Response body of POST /{scene_id}/enrich-studio.
class EnrichStudioOut(BaseModel):
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Studio now linked to the scene; None when none could be determined.
    studio_id: uuid.UUID | None
    # Name of that studio; None when there is no studio (or its row is missing).
    studio_name: str | None
    # Site tag of the scraped tube ("pornhatcom"); None when no page was fetched.
    tube_used: str | None
@router.post("/{scene_id}/enrich-studio", response_model=EnrichStudioOut)
def enrich_studio_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichStudioOut:
    """Extract the studio (DVD/series) from the scene's pornhat page.

    Pornhat exposes the studio as
    `class="info-video js-ajax-dvd" data-setup='{"title": "Adult Time", ...}'`.
    Other tubes will be supported once their pattern is known; for now only
    pornhat is scraped (cleanest studio metadata among the free tubes).

    If the scene already has a studio, that studio is returned unchanged and
    nothing is fetched. Otherwise the studio is looked up by slug, then by
    name, and created when neither matches.

    Args:
        scene_id: Scene to enrich.
        session: Request-scoped SQLAlchemy session.

    Returns:
        ``EnrichStudioOut`` with the linked studio, or with ``studio_id`` /
        ``studio_name`` set to None when nothing usable was found.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    import html as _html
    import json as _json

    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.models.playback_source import PlaybackSource
    from app.models.studio import Studio
    from app.normalize.text import slugify

    def _no_studio(tube: str | None) -> EnrichStudioOut:
        """Build the 'nothing found' response for the given tube marker."""
        return EnrichStudioOut(
            scene_id=scene_id, studio_id=None, studio_name=None, tube_used=tube
        )

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")
    if scene.studio_id is not None:
        existing = session.get(Studio, scene.studio_id)
        return EnrichStudioOut(
            scene_id=scene_id,
            studio_id=scene.studio_id,
            studio_name=existing.name if existing else None,
            tube_used=None,
        )

    chosen = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin == "tube:pornhatcom",
        )
    ).scalars().first()
    if chosen is None:
        return _no_studio(None)

    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except (TubePageError, Exception) as e:
        log.warning("enrich-studio fetch failed for %s: %s", chosen.page_url, e)
        return _no_studio("pornhatcom")

    m = re.search(
        r"class=\"info-video js-ajax-dvd[^\"]*\"[^>]*data-setup='([^']+)'",
        r.text, re.IGNORECASE,
    )
    if m is None:
        return _no_studio("pornhatcom")
    try:
        data = _json.loads(m.group(1))
    except _json.JSONDecodeError:
        return _no_studio("pornhatcom")
    # Scraped JSON is untrusted: anything other than an object (list, string,
    # number) has no "title" and previously crashed on `.get` with a 500.
    if not isinstance(data, dict):
        return _no_studio("pornhatcom")

    raw_title = data.get("title")
    # The value comes from an HTML attribute, so entities such as `&amp;` must
    # be decoded to get the real studio name. Non-string titles are ignored.
    name = _html.unescape(raw_title).strip() if isinstance(raw_title, str) else ""
    if not name:
        return _no_studio("pornhatcom")
    raw_dir = data.get("dir")
    slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or slugify(name)

    # Match an existing studio by slug first, then by exact name.
    studio = session.execute(
        select(Studio).where(Studio.slug == slug)
    ).scalar_one_or_none()
    if studio is None:
        studio = session.execute(
            select(Studio).where(Studio.name == name)
        ).scalar_one_or_none()
    if studio is None:
        studio = Studio(name=name, slug=slug)
        session.add(studio)
        # Flush so the new row gets its id before it is linked to the scene.
        session.flush()
    scene.studio_id = studio.id
    session.commit()
    return EnrichStudioOut(
        scene_id=scene_id, studio_id=studio.id, studio_name=studio.name, tube_used="pornhatcom"
    )
# Response body of POST /{scene_id}/enrich-thumbnail.
class EnrichThumbOut(BaseModel):
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Thumbnail found (or already present on a source); None when nothing was found.
    thumbnail_url: str | None
    # Site tag of the tube page the thumbnail was scraped from; None when no scrape happened.
    tube_used: str | None
    # How many playback sources had their thumbnail_url filled in by this call.
    sources_updated: int
@router.post("/{scene_id}/enrich-thumbnail", response_model=EnrichThumbOut)
def enrich_thumbnail_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichThumbOut:
    """Fetch the detail page of a tube playback source and extract a thumbnail
    (og:image / twitter:image / LD-JSON thumbnailUrl / KVS html5player).

    When none of the scene's live tube sources has a thumbnail yet, the first
    one found is written to all of them, so later list views show a thumbnail
    regardless of which source is picked. If any source already has one, that
    value is returned and nothing is changed.

    Mobile auto-calls this when opening a SceneDetail without a thumbnail
    (same pattern as duration).

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource

    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")

    tube_sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin.like("tube:%"),
        )
    ).scalars().all()

    already_known = next(
        (row.thumbnail_url for row in tube_sources if row.thumbnail_url), None
    )
    if already_known is not None:
        # Already have one - idempotent return.
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=already_known,
            tube_used=None,
            sources_updated=0,
        )

    for candidate in tube_sources:
        try:
            page = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            page.raise_for_status()
        except (TubePageError, Exception) as exc:
            log.debug("enrich-thumbnail fetch failed for %s: %s", candidate.page_url, exc)
            continue
        found = extract_thumbnail_url(page.text)
        if not found:
            continue
        # Store it on every source lacking a thumbnail (saves duplicate fetches).
        lacking = [row for row in tube_sources if not row.thumbnail_url]
        for row in lacking:
            row.thumbnail_url = found
        session.commit()
        _prefix, separator, site = candidate.origin.partition(":")
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=found,
            tube_used=site if separator else None,
            sources_updated=len(lacking),
        )
    return EnrichThumbOut(
        scene_id=scene_id, thumbnail_url=None, tube_used=None, sources_updated=0
    )

127
app/api/schemas.py Normal file
View file

@ -0,0 +1,127 @@
"""Pydantic schemas eksportowane przez API."""
from __future__ import annotations
import uuid
from datetime import date, datetime
from pydantic import BaseModel, ConfigDict
# API representation of a studio.
class StudioOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
# API representation of a performer.
class PerformerOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    canonical_name: str
    slug: str
    gender: str | None = None
    as_alias: str | None = None
# API representation of a tag.
class TagOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
# Reference to a record in an external metadata source.
class ExternalRefOut(BaseModel):
    # Name of the source that owns the reference.
    source: str
    # Identifier of the record within that source.
    external_id: str
    url: str | None = None
    last_seen: datetime | None = None
# One place a scene or movie can be played from.
class PlaybackSourceOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    # Origin marker of the source.
    origin: str
    page_url: str
    embed_url: str | None = None
    stream_url: str | None = None
    quality: str | None = None
    duration_sec: int | None = None
    thumbnail_url: str | None = None
    animated_thumbnail_url: str | None = None
# Full API representation of a scene, including nested relations and
# per-scene watch state.
class SceneOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    title: str
    slug: str | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    description: str | None = None
    code: str | None = None
    director: str | None = None
    studio: StudioOut | None = None
    performers: list[PerformerOut] = []
    tags: list[TagOut] = []
    external_refs: list[ExternalRefOut] = []
    playback_sources: list[PlaybackSourceOut] = []
    # When the scene entered the database (ingest). Used by mobile to mark a
    # scene card as "NEW" in PerformerScenesScreen / StudioScenesScreen: when
    # `created_at > last_seen_at` (favorite), a badge is shown.
    created_at: datetime | None = None
    # Watched indicator (from `scene_play_progress`): mobile dims the tile when
    # `finished=True` and shows a progress bar when `position_sec > 0`.
    last_played_at: datetime | None = None
    finished: bool = False
    position_sec: int = 0
    is_favorite: bool = False
# One page of scenes plus pagination metadata.
class SceneListOut(BaseModel):
    items: list[SceneOut]
    total: int
    page: int
    per_page: int
# One chapter of a movie.
class MovieChapterOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    chapter_index: int
    title: str | None = None
    start_sec: int | None = None
    end_sec: int | None = None
    # Optional scene reference for this chapter.
    scene_id: uuid.UUID | None = None
# Full API representation of a movie, including nested relations.
class MovieOut(BaseModel):
    # from_attributes lets model_validate() read fields off an ORM object.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    title: str
    slug: str | None = None
    release_year: int | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    description: str | None = None
    director: str | None = None
    country: str | None = None
    rating: float | None = None
    poster_url: str | None = None
    backdrop_url: str | None = None
    studio: StudioOut | None = None
    performers: list[PerformerOut] = []
    tags: list[TagOut] = []
    chapters: list[MovieChapterOut] = []
    external_refs: list[ExternalRefOut] = []
    playback_sources: list[PlaybackSourceOut] = []
    # Used by mobile MoviesScreen NEW badge (created_at > client-stored seenSince)
    # and MovieDetail favorite star.
    created_at: datetime | None = None
    is_favorite: bool = False
# One page of movies plus pagination metadata.
class MovieListOut(BaseModel):
    items: list[MovieOut]
    total: int
    page: int
    per_page: int

553
app/api/stream_proxy.py Normal file
View file

@ -0,0 +1,553 @@
"""Stream proxy — pomost VPS↔phone dla podpisanych URL-i CDN-ów.
Many hosters (luluvids/medixiru/cdnvids/bigcdn) bind the signed URL to the IP of
the client that fetched the embed page. When the backend extracts the URL from
the VPS, the signature is verified against the VPS IP, so the phone gets a 403.
The player on the phone therefore routes its requests *through the backend*
(the same IP that did the extraction); the CDN then validates the signature
correctly and serves the content.
Flow:
1. /resolve packs (url, referer) into an HMAC-signed token.
2. Mobile receives `stream_url = /proxy/{token}/master.m3u8` (or `.mp4`).
3. ExoPlayer calls the backend, which streams the content from the origin URL.
4. HLS: the m3u8 manifest is rewritten so that child segments/playlists also go
through the proxy (chained tokens).
Token: `<body>.<sig>`, where body = base64url(json({u: url, r: referer, e: unix_ts}))
and sig = base64url(HMAC-SHA256(body JSON)) keyed with a shared secret from the
env (`STREAM_PROXY_SECRET`, falling back to `API_KEYS`). The TTL is 4h so the
player can watch longer scenes and pause/seek without hitting an expired token.
"""
from __future__ import annotations
import base64
import hashlib
import hmac
import json
import logging
import os
import re
import time
from typing import Annotated
from urllib.parse import urljoin, urlparse
import httpx
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import Response, StreamingResponse
from app.auth import require_api_key
router = APIRouter(prefix="/proxy", tags=["proxy"])
log = logging.getLogger(__name__)
# In-memory bandwidth counter - bytes-out per CDN domain per hour bucket.
# Restarting the API resets the counter (acceptable - this is an operational
# metric, not billing). Critical for seeing where VPS bandwidth leaks before a
# Hetzner overage.
from collections import defaultdict
from threading import Lock
# Maps CDN host -> {hour bucket (unix time // 3600) -> bytes sent}.
_bw_counters: dict[str, dict[int, int]] = defaultdict(lambda: defaultdict(int))
# Guards every read and write of _bw_counters.
_bw_lock = Lock()
def _record_proxy_bytes(target_url: str, n_bytes: int) -> None:
    """Add ``n_bytes`` to the current hour bucket of the target's CDN host.

    Non-positive byte counts are ignored. A URL whose host cannot be
    determined is accounted under ``"unknown"``. Buckets of the touched host
    that are older than 7 days (168 hours) are dropped on every call. All
    counter access happens under ``_bw_lock``.
    """
    if n_bytes <= 0:
        return
    try:
        cdn_host = urlparse(target_url).hostname
    except Exception:
        cdn_host = None
    cdn_host = cdn_host or "unknown"

    current_bucket = int(time.time() // 3600)
    oldest_kept = current_bucket - 168
    with _bw_lock:
        per_hour = _bw_counters[cdn_host]
        per_hour[current_bucket] += n_bytes
        # Prune buckets older than 7 days to keep the counter map small.
        for stale_bucket in [hr for hr in per_hour if hr < oldest_kept]:
            del per_hour[stale_bucket]
def get_bandwidth_stats(hours: int = 24) -> dict[str, int]:
    """Return ``{cdn_domain: bytes_out}`` for the last ``hours`` hour buckets.

    Domains with no traffic in the window are omitted. The mapping is ordered
    by byte count, largest first.
    """
    threshold = int(time.time() // 3600) - hours
    with _bw_lock:
        totals = {
            cdn: sum(sent for bucket, sent in buckets.items() if bucket > threshold)
            for cdn, buckets in _bw_counters.items()
        }
    active = [(cdn, sent) for cdn, sent in totals.items() if sent > 0]
    # sorted() is stable even with reverse=True, so ties keep insertion order.
    active.sort(key=lambda pair: pair[1], reverse=True)
    return dict(active)
# Browser-like User-Agent sent on every upstream request made by the proxy.
DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# Default lifetime of a proxy token, in seconds.
TOKEN_TTL_SEC = 4 * 60 * 60  # 4h
# Lower-cased upstream response headers that are NOT copied to the client.
# NOTE(review): content-encoding and content-length are not hop-by-hop headers
# in the RFC sense; presumably they are dropped because the body is re-streamed
# by the proxy - confirm.
HOP_BY_HOP = {
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailers",
    "transfer-encoding",
    "upgrade",
    "content-encoding",
    "content-length",
}
def _secret() -> bytes:
s = os.environ.get("STREAM_PROXY_SECRET") or os.environ.get("API_KEYS", "")
if not s:
raise RuntimeError("STREAM_PROXY_SECRET (or API_KEYS) must be set")
return s.encode("utf-8")
def make_token(
    url: str,
    referer: str | None = None,
    ttl_sec: int = TOKEN_TTL_SEC,
    *,
    refresh: str | None = None,
    refresh_hoster: str | None = None,
    impersonate: bool = False,
) -> str:
    """Build a signed proxy token of the form ``<body>.<signature>``.

    Args:
        url: Upstream URL the proxy should fetch (stored as ``u``).
        referer: Referer to send upstream (stored as ``r``; empty string when None).
        ttl_sec: Lifetime in seconds; the expiry timestamp is stored as ``e``.
        refresh: Embed page URL to re-fetch when ``url`` returns 4xx. The proxy
            obtains a fresh stream URL from the embed (e.g. mixdrop
            MDCore.wurl) when the original one has expired. Stored as ``rf``.
        refresh_hoster: Hoster name for the refresh logic (mixdrop / etc.), so
            the proxy can dispatch to dedicated re-extract code. Stored as ``rh``.
        impersonate: Use curl_cffi chrome120 instead of httpx (for hosters with
            JA3 bot detection: mxcontent, cloudflare-protected). Stored as ``i``.

    Returns:
        Unpadded base64url JSON body and unpadded base64url HMAC-SHA256
        signature of that JSON, joined by a dot.
    """
    def _b64(data: bytes) -> str:
        return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")

    claims: dict = {"u": url, "r": referer or "", "e": int(time.time()) + ttl_sec}
    # Optional claims are only included when set, in this fixed key order.
    optional_claims = (
        ("rf", refresh),
        ("rh", refresh_hoster),
        ("i", 1 if impersonate else None),
    )
    for key, value in optional_claims:
        if value:
            claims[key] = value

    serialized = json.dumps(claims, separators=(",", ":")).encode("utf-8")
    encoded_body = _b64(serialized)
    signature = hmac.new(_secret(), serialized, hashlib.sha256).digest()
    return f"{encoded_body}.{_b64(signature)}"
def parse_token(token: str) -> dict:
    """Verify a proxy token produced by ``make_token`` and return its payload.

    The token arrives in a URL path segment and is attacker-controlled, so
    every malformed shape is mapped to an HTTP error instead of an unhandled
    exception.

    Args:
        token: ``<base64url body>.<base64url signature>``.

    Returns:
        The decoded JSON payload (keys ``u``, ``r``, ``e`` and optionally
        ``rf``, ``rh``, ``i``).

    Raises:
        HTTPException: 400 when the token has no ``.`` separator or the body
            is not decodable base64; 403 when the signature does not match;
            410 when the token has expired.
    """
    try:
        body_b64, sig_b64 = token.split(".", 1)
    except ValueError:
        raise HTTPException(status_code=400, detail="malformed token") from None
    try:
        # "==" restores the padding stripped by make_token; surplus padding is ignored.
        raw = base64.urlsafe_b64decode(body_b64 + "==")
    except ValueError:
        # binascii.Error (bad length) and the non-ASCII error are both ValueErrors.
        raise HTTPException(status_code=400, detail="malformed token") from None
    expected = base64.urlsafe_b64encode(
        hmac.new(_secret(), raw, hashlib.sha256).digest()
    ).rstrip(b"=")
    # Compare as bytes: compare_digest raises TypeError for str arguments that
    # contain non-ASCII characters, which a crafted signature could trigger.
    if not hmac.compare_digest(expected, sig_b64.encode("utf-8")):
        raise HTTPException(status_code=403, detail="bad token sig")
    payload = json.loads(raw)
    if int(payload.get("e", 0)) < int(time.time()):
        raise HTTPException(status_code=410, detail="token expired")
    return payload
def _ascii_safe_url(url: str) -> str:
"""Encode non-ASCII chars w URL path/query, zachowując reserved chars dla URI.
httpx wymaga ASCII headers Referer z polskim/cyrillic/unicode (np. hqporner
`Honies_2.html`) wcześniej throw'ował UnicodeEncodeError (GOON-A). `quote`
z `safe=":/?#[]@!$&'()*+,;=%"` zostawia URI structure nietkniętą, tylko
enkoduje znaki spoza ASCII."""
try:
from urllib.parse import quote
return quote(url, safe=":/?#[]@!$&'()*+,;=%~")
except Exception:
return url
def _build_headers(referer: str | None) -> dict[str, str]:
    """Build the request headers sent to the upstream CDN.

    Always includes a browser-like User-Agent, Accept and Accept-Language.
    When ``referer`` is given, an ASCII-safe ``Referer`` is added, plus an
    ``Origin`` of the form ``https://<referer host>`` if a host can be parsed
    from it; a parsing failure simply leaves ``Origin`` out.
    """
    headers = {
        "User-Agent": DEFAULT_UA,
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
    }
    if not referer:
        return headers

    headers["Referer"] = _ascii_safe_url(referer)
    try:
        referer_host = urlparse(referer).hostname
        if referer_host:
            headers["Origin"] = _ascii_safe_url("https://" + referer_host)
    except Exception:
        # Best effort: an unparsable referer only costs the Origin header.
        pass
    return headers
# Matches a quoted URI="..." attribute inside an HLS tag line.
_M3U8_URI_RE = re.compile(r'(URI=")([^"]+)(")', re.IGNORECASE)


def _rewrite_m3u8(content: str, base_url: str, referer: str | None) -> str:
    """Rewrite an m3u8 manifest so that every sub-resource goes through the proxy.

    An HLS manifest contains:
    - URI lines (``.ts`` segments / ``.m3u8`` sub-playlists), relative or absolute
    - tags such as `#EXT-X-KEY:METHOD=AES-128,URI="key.bin"`, which need the
      same rewrite

    Each URL is resolved against ``base_url``, wrapped in a fresh token and
    replaced with ``/proxy/{token}/seg[.<ext>]``. Blank lines are passed
    through unchanged. The result always ends with a newline.
    """
    def _proxied_attr(match: re.Match) -> str:
        # Rewrites URI="..." inside #EXT-X-KEY / #EXT-X-MEDIA / etc.
        resolved = urljoin(base_url, match.group(2))
        token = make_token(resolved, referer)
        return f'{match.group(1)}/proxy/{token}/seg{match.group(3)}'

    rewritten: list[str] = []
    for original_line in content.splitlines():
        stripped = original_line.strip()
        if not stripped:
            rewritten.append(original_line)
        elif stripped.startswith("#"):
            rewritten.append(_M3U8_URI_RE.sub(_proxied_attr, original_line))
        else:
            # Resource URI line.
            resource = urljoin(base_url, stripped)
            token = make_token(resource, referer)
            # Keep the extension so ExoPlayer can recognise the content type.
            suffix = os.path.splitext(urlparse(resource).path)[1].lstrip(".") or "ts"
            rewritten.append(f"/proxy/{token}/seg.{suffix}")
    return "\n".join(rewritten) + "\n"
@router.get("/sign")
def sign_url(
    _api: Annotated[None, Depends(require_api_key)],
    url: str = Query(...),
    referer: str | None = Query(default=None),
) -> dict:
    """Helper endpoint that lets mobile obtain a fresh token (e.g. after expiry).

    Normally /resolve already returns a proxy URL; this is the fallback.
    Requires a valid API key. Returns the token and its lifetime in seconds.
    """
    fresh_token = make_token(url, referer)
    return {"token": fresh_token, "expires_in": TOKEN_TTL_SEC}
@router.get("/img/{token}/{_basename:path}")
async def proxy_image(
    token: str,
    _basename: str,
    request: Request,
) -> Response:
    """Image proxy for thumbnails hosted on CDNs that require a Referer
    (hqporner and other sources). Mobile expo-image does not send a Referer by
    default, so the CDN answers 403; the backend adds the Referer and returns
    the image.

    Successful responses carry ``Cache-Control: public, max-age=86400``, since
    thumbnails are stable and the client may cache them. Upstream failures
    are answered with an empty body so mobile can render a placeholder.

    Raises:
        HTTPException: From ``parse_token`` for bad tokens, or 502 when the
            fetch fails with an unexpected error.
    """
    claims = parse_token(token)
    target = claims["u"]
    upstream_headers = _build_headers(claims["r"] or None)
    limits = httpx.Timeout(connect=10.0, read=30.0, write=15.0, pool=5.0)

    async with httpx.AsyncClient(timeout=limits, follow_redirects=True) as client:
        try:
            upstream = await client.get(target, headers=upstream_headers)
        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout) as exc:
            # CDN connect/timeout is transient (e.g. Cloudflare 523, origin
            # unreachable while the upstream host is off). Log at INFO and
            # return 503; mobile renders a placeholder. Without this, Sentry
            # received hundreds of ERRORs (GOON-D/6) for every broken tube,
            # drowning out real issues.
            log.info("img proxy connect/timeout for %s: %s", target, exc)
            return Response(content=b"", status_code=503, media_type="image/jpeg")
        except Exception as exc:
            log.warning("img proxy fetch failed for %s: %s", target, exc)
            raise HTTPException(status_code=502, detail=f"img fetch failed: {exc}") from exc

    upstream_status = upstream.status_code
    if upstream_status >= 400:
        # Upstream 4xx/5xx for a thumbnail: degrade instead of raising
        # (placeholder on mobile). GOON-5 (Cloudflare 523) and GOON-D were
        # pointless Sentry noise; an info log plus a status pass-through
        # without an exception is better.
        log.info("img proxy upstream %d for %s", upstream_status, target)
        degraded_status = 502 if upstream_status >= 500 else upstream_status
        return Response(content=b"", status_code=degraded_status, media_type="image/jpeg")

    return Response(
        content=upstream.content,
        media_type=upstream.headers.get("content-type", "image/jpeg"),
        headers={"Cache-Control": "public, max-age=86400"},
    )
async def _refetch_mixdrop_url(session: "AsyncSession", embed_url: str) -> str | None:
    """Re-fetch a mixdrop embed page, decode its P.A.C.K.E.R. script and
    extract a fresh ``MDCore.wurl``.

    Cookies persist in ``session`` and are reused afterwards for the mp4 GET
    (same-session bind). UA + Accept are required; without them mixdrop
    returns a minimal body (no packer script).

    Returns:
        The mp4 URL (protocol-relative URLs are given an ``https:`` scheme), or
        None on a non-200 response, a missing script/URL, or any exception.
    """
    import re
    from yt_dlp.utils import decode_packed_codes

    browser_headers = {
        "User-Agent": DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    try:
        page = await session.get(
            embed_url,
            headers=browser_headers,
            impersonate="chrome120",
            timeout=15,
            allow_redirects=True,
        )
        if page.status_code != 200:
            return None
        packed_script = re.search(
            r"eval\(function\(p,a,c,k,e,d\)\{.+?\}\(.+?\)\)", page.text, re.DOTALL
        )
        if packed_script is None:
            return None
        unpacked = decode_packed_codes(packed_script.group(0))
        wurl_match = re.search(r'MDCore\.wurl\s*=\s*"([^"]+\.mp4[^"]*)"', unpacked)
        if wurl_match is None:
            return None
        fresh_url = wurl_match.group(1)
        return "https:" + fresh_url if fresh_url.startswith("//") else fresh_url
    except Exception as exc:
        log.warning("refetch mixdrop failed for %s: %s", embed_url, exc)
        return None
async def _curl_cffi_stream(
    target: str,
    headers: dict,
    *,
    refetch_url: str | None = None,
    refetch_hoster: str | None = None,
) -> Response:
    """Stream ``target`` to the client through curl_cffi with Chrome impersonation.

    Used for hosters that detect the plain httpx JA3 fingerprint as a bot
    (mxcontent, cloudflare-protected CDNs); the request is sent with
    ``impersonate="chrome120"``.

    When ``refetch_url`` is set and ``refetch_hoster`` is ``"mixdrop"``, the
    embed page is re-fetched FIRST, in the same session, to refresh cookies and
    obtain a new mp4 URL (same-session bind for mxcontent). If that yields a
    URL it replaces ``target``; otherwise the original ``target`` is used.

    Args:
        target: Upstream URL to fetch.
        headers: Request headers to send upstream.
        refetch_url: Embed page URL used for the mixdrop pre-fetch.
        refetch_hoster: Hoster name; only ``"mixdrop"`` triggers the pre-fetch.

    Returns:
        A ``StreamingResponse`` relaying the upstream body, or the result of
        ``_upstream_error_response`` when upstream answers with status >= 400.

    Raises:
        HTTPException: 502 when anything in the fetch/setup raises.
    """
    from curl_cffi.requests import AsyncSession
    session = AsyncSession()
    try:
        # For mixdrop: ALWAYS re-fetch the embed FIRST (before the mp4) so the
        # session has fresh cookies. An initial mp4 attempt with an expired/old
        # token and no cookies = 403 + an anti-bot flag in the cookies, which
        # then blocks the retry as well.
        if refetch_url and refetch_hoster == "mixdrop":
            new_mp4 = await _refetch_mixdrop_url(session, refetch_url)
            if new_mp4:
                target = new_mp4
                log.info("mixdrop fresh-extract mp4 %s", new_mp4[:80])
        upstream = await session.get(
            target,
            headers=headers,
            impersonate="chrome120",
            stream=True,
            timeout=120,
            allow_redirects=True,
        )
        # NOTE(review): this message says "mixdrop" but is logged for every
        # hoster that takes this path.
        log.info("mixdrop mp4 fetch %s%d", target[:60], upstream.status_code)
        if upstream.status_code >= 400:
            # No body will be streamed, so the session is closed right away.
            await session.close()
            return _upstream_error_response(upstream.status_code, dict(upstream.headers), target)
        # Forward upstream headers except those listed in HOP_BY_HOP.
        out_headers = {
            k: v for k, v in upstream.headers.items() if k.lower() not in HOP_BY_HOP
        }
        async def streamer():
            # Relays chunks and counts them; the session is closed and the
            # byte count recorded when iteration ends, however it ends.
            bytes_out = 0
            try:
                async for chunk in upstream.aiter_content():
                    bytes_out += len(chunk)
                    yield chunk
            finally:
                await session.close()
                _record_proxy_bytes(target, bytes_out)
        return StreamingResponse(
            streamer(),
            status_code=upstream.status_code,
            headers=out_headers,
            media_type=upstream.headers.get("content-type", "application/octet-stream"),
        )
    except Exception as e:
        # Best-effort cleanup: a failure while closing must not mask the
        # original error.
        try:
            await session.close()
        except Exception:
            pass
        log.warning("curl_cffi proxy failed for %s: %s", target, e)
        raise HTTPException(status_code=502, detail=f"proxy error: {e}") from e
@router.get("/{token}/{_basename:path}")
async def proxy_stream(
    token: str,
    _basename: str,
    request: Request,
) -> Response:
    """Proxy the upstream URL encoded in ``token`` back to the caller.

    The token payload (decoded by ``parse_token``) supplies the target URL
    (``u``), the referer (``r``) and optional flags: ``i`` (use the curl_cffi
    path), ``rf``/``rh`` (refetch URL and hoster, forwarded to
    ``_curl_cffi_stream``). ``_basename`` is accepted as part of the route but
    is not read by this function.

    Flow on the httpx path:
    - upstream status >= 400 is mapped by ``_upstream_error_response``;
    - an m3u8 manifest (by path suffix or content-type) is buffered, rewritten
      with ``_rewrite_m3u8`` and returned as ``application/vnd.apple.mpegurl``;
    - anything else is streamed through with hop-by-hop headers removed.

    Raises:
        HTTPException: 502 when the manifest rewrite fails or on an unexpected
            proxy error (and whatever ``_upstream_error_response`` raises).
    """
    payload = parse_token(token)
    target = payload["u"]
    referer = payload["r"] or None
    use_impersonate = bool(payload.get("i"))
    refetch_url = payload.get("rf")
    refetch_hoster = payload.get("rh")
    # Forward the Range header (HLS/MP4 players issue byte-range fetches for seek/preload).
    headers = _build_headers(referer)
    range_h = request.headers.get("range")
    if range_h:
        headers["Range"] = range_h
    method = "GET"  # ExoPlayer mostly issues GET; HEAD is not needed — the proxy returns full responses
    # Hosters that require a Chrome JA3 fingerprint (mxcontent / cloudflare-protected
    # CDNs) — go straight to curl_cffi instead of httpx to avoid a 403→retry round-trip.
    # The token's `i=1` flag is set by the extractor for these hosts (mixdrop.py).
    if use_impersonate:
        return await _curl_cffi_stream(
            target, headers,
            refetch_url=refetch_url, refetch_hoster=refetch_hoster,
        )
    # Short connect/pool timeouts, but a long read timeout so streaming is not cut off.
    timeout = httpx.Timeout(connect=15.0, read=120.0, write=30.0, pool=10.0)
    parsed = urlparse(target)
    path_lower = parsed.path.lower()
    # The path is only a hint; the FINAL decision is made from the response content-type.
    # Reason: pornhat `get_file/.../<id>.mp4/` 302-redirects to a CDN m3u8 manifest despite
    # `.mp4` in the path. Without the content-type check the proxy would treat it as binary
    # and mobile would receive an m3u8 with RAW CDN URLs (IP-bound to the VPS) →
    # "no extractors" in ExoPlayer.
    path_suggests_m3u8 = path_lower.endswith(".m3u8")
    # The client is created outside a context manager on purpose: on the streaming
    # path it must outlive this function and is closed by `streamer()`'s finally.
    client = httpx.AsyncClient(timeout=timeout, follow_redirects=True)
    try:
        # Open a streaming request FIRST — inspect content-type from the headers,
        # then decide: rewrite manifest vs stream binary.
        upstream = await client.send(
            client.build_request(method, target, headers=headers),
            stream=True,
            follow_redirects=True,
        )
        if upstream.status_code >= 400:
            # Copy what we need before closing; the helper may return or raise.
            status = upstream.status_code
            ups_headers = dict(upstream.headers)
            await upstream.aclose()
            await client.aclose()
            return _upstream_error_response(status, ups_headers, target)
        ct = (upstream.headers.get("content-type") or "").lower()
        # Note: the third test is subsumed by the second ("mpegurl" is a substring
        # of "application/x-mpegurl").
        is_m3u8 = (
            path_suggests_m3u8
            or "mpegurl" in ct
            or "application/x-mpegurl" in ct
        )
        if is_m3u8:
            # Manifest content — buffer fully, rewrite, return as m3u8.
            body = await upstream.aread()
            await upstream.aclose()
            await client.aclose()
            try:
                # base_url is the post-redirect URL so relative entries resolve
                # against where the manifest was actually served from.
                rewritten = _rewrite_m3u8(body.decode("utf-8", errors="replace"),
                                          base_url=str(upstream.url), referer=referer)
            except Exception as e:
                log.warning("m3u8 rewrite failed for %s: %s", target, e)
                raise HTTPException(status_code=502, detail="manifest rewrite failed") from e
            return Response(
                content=rewritten,
                media_type="application/vnd.apple.mpegurl",
                headers={"Cache-Control": "no-store"},
            )
        out_headers = {
            k: v for k, v in upstream.headers.items() if k.lower() not in HOP_BY_HOP
        }

        async def streamer():
            # Relays raw upstream bytes; the finally block releases the upstream
            # response and the client once iteration ends or the generator is
            # closed, then reports the byte count to `_record_proxy_bytes`.
            bytes_out = 0
            try:
                async for chunk in upstream.aiter_raw():
                    bytes_out += len(chunk)
                    yield chunk
            finally:
                await upstream.aclose()
                await client.aclose()
                _record_proxy_bytes(target, bytes_out)

        return StreamingResponse(
            streamer(),
            status_code=upstream.status_code,
            headers=out_headers,
            media_type=upstream.headers.get("content-type", "application/octet-stream"),
        )
    except HTTPException:
        # Raised above (manifest rewrite / upstream error mapping); the client may
        # already be closed at this point, in which case this closes it again.
        await client.aclose()
        raise
    except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout) as e:
        # CDN connect failure / timeout — transient; log at INFO (not ERROR to Sentry).
        # Return 503 + Retry-After instead of 502 so mobile can retry without panicking.
        await client.aclose()
        log.info("proxy connect/timeout for %s: %s", target, e)
        return Response(
            content=f"upstream unreachable: {type(e).__name__}",
            status_code=503,
            headers={"Retry-After": "5"},
            media_type="text/plain",
        )
    except Exception as e:
        await client.aclose()
        log.warning("proxy failed for %s: %s", target, e)
        raise HTTPException(status_code=502, detail=f"proxy error: {e}") from e
def _upstream_error_response(
    status: int,
    upstream_headers: dict,
    target: str,
) -> Response:
    """Map an upstream HTTP error status onto the response we send to the client.

    Rationale per status:
    - **429 Too Many Requests**: CDN rate limit (e.g. fpo.xxx when the proxy hammers
      get_file/). Pass through 429 + Retry-After so mobile backs off.
      Logged at INFO (not ERROR) — expected CDN behaviour, not our bug.
    - **404/410**: video deleted / token expired. Passed through so the player knows.
    - **5xx upstream**: originates from the CDN, not from our code. Logged at INFO,
      answered with 502.
    - **other 4xx**: 502 via HTTPException — may be our fault (bad referer etc.).

    Args:
        status: upstream HTTP status code (callers pass values >= 400).
        upstream_headers: upstream response headers as a plain dict; the
            Retry-After lookup is case-insensitive on the header name.
        target: upstream URL, used only for logging.

    Returns:
        A plain-text ``Response`` for 429, 404, 410 and 5xx.

    Raises:
        HTTPException: 502 for any other status.
    """
    # HTTP header names are case-insensitive, and a plain dict keeps whatever
    # casing the HTTP client produced — so match on the lowered name instead of
    # probing two fixed spellings. Empty values are ignored, as before.
    retry_after = next(
        (
            value
            for name, value in upstream_headers.items()
            if str(name).lower() == "retry-after" and value
        ),
        None,
    )
    if status == 429:
        log.info("proxy upstream 429 for %s (Retry-After=%s)", target, retry_after)
        out_headers: dict[str, str] = {"Cache-Control": "no-store"}
        if retry_after:
            out_headers["Retry-After"] = str(retry_after)
        else:
            # No upstream hint — fall back to a fixed 10 s back-off.
            out_headers["Retry-After"] = "10"
        return Response(
            content="upstream rate limited",
            status_code=429,
            headers=out_headers,
            media_type="text/plain",
        )
    if status in (404, 410):
        log.info("proxy upstream %d for %s", status, target)
        return Response(
            content=f"upstream {status}",
            status_code=status,
            media_type="text/plain",
        )
    if 500 <= status < 600:
        # CDN-side error (e.g. Cloudflare 523 — origin unreachable). Answer with
        # 502 but log at INFO because it is not our fault.
        log.info("proxy upstream %d for %s", status, target)
        return Response(
            content=f"upstream {status}",
            status_code=502,
            headers={"Retry-After": "5"},
            media_type="text/plain",
        )
    # Other 4xx (403 etc.) — raise so Sentry records it (may be a bug in our code).
    raise HTTPException(status_code=502, detail=f"upstream {status}")

597
app/api/taxonomies.py Normal file
View file

@ -0,0 +1,597 @@
"""GET /tags, /performers, /studios — listy taxonomies do filtrów na mobile.
Every endpoint supports:
- q: case-insensitive substring search (ILIKE; the original note mentions a trgm
  fallback) — on name_normalized for performers, on name for tags and studios
- order: 'name' (alphabetical) | 'popular' or 'scene_count' (by scene count, descending)
- page/per_page
Also returns scene_count so the UI can show "(123)" next to each tag/performer/studio.
"""
from __future__ import annotations
import uuid
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, ConfigDict
from sqlalchemy import and_, exists, func, select
from sqlalchemy.orm import Session
from app.auth import require_api_key
from app.db import get_session
from app.models.movie import Movie, MovieTag
from app.models.movie_playback_source import MoviePlaybackSource
from app.models.performer import Performer
from app.models.playback_source import PlaybackSource
from app.models.scene import ScenePerformer, SceneTag
from app.models.studio import Studio
from app.models.tag import Tag
router = APIRouter(tags=["taxonomies"], dependencies=[Depends(require_api_key)])
# ---- Schemas ----------------------------------------------------------
# One tag row plus its usage count, as returned by GET /tags.
class TagCount(BaseModel):
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
    # Number of scenes (or movies, when the endpoint is called with
    # for_movies=true) with live playback that carry this tag; 0 when none.
    scene_count: int = 0
# Paginated envelope for GET /tags.
class TagListOut(BaseModel):
    items: list[TagCount]
    # Total number of rows matching the filters (not just this page).
    total: int
    page: int
    per_page: int
# One performer row plus its scene count, as returned by GET /performers.
class PerformerCount(BaseModel):
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    canonical_name: str
    slug: str
    # The endpoint fills this from `performer.gender.value`; None when unset.
    gender: str | None = None
    # Number of the performer's scenes that have at least one live playback source.
    scene_count: int = 0
# Paginated envelope for GET /performers.
class PerformerListOut(BaseModel):
    items: list[PerformerCount]
    # Total number of rows matching the filters (not just this page).
    total: int
    page: int
    per_page: int
# One studio row plus its usage count, as returned by GET /studios.
class StudioCount(BaseModel):
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
    # Number of scenes (or movies, when the endpoint is called with
    # for_movies=true) with live playback that belong to this studio.
    scene_count: int = 0
# Paginated envelope for GET /studios.
class StudioListOut(BaseModel):
    items: list[StudioCount]
    # Total number of rows matching the filters (not just this page).
    total: int
    page: int
    per_page: int
# ---- Endpoints --------------------------------------------------------
@router.get("/tags", response_model=TagListOut)
def list_tags(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None),
    order: str = Query(default="popular", description="popular|name"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
    for_movies: bool = Query(
        default=False,
        description=(
            "True: zlicza wystąpienia tagu w movies (z live MoviePlaybackSource) "
            "zamiast w scenes. UI używa do filtrowania movie genres."
        ),
    ),
    only_with_content: bool = Query(
        default=False,
        description=(
            "True: ukrywa tagi z 0 wystąpieniami w wybranym typie (scenes/movies)."
            " Filtruje krótkie listy filtrów żeby nie pokazywać tagów-sierot."
        ),
    ),
) -> TagListOut:
    """List tags with a per-tag usage count, paginated.

    Args:
        session: database session (injected).
        q: optional case-insensitive substring filter on the tag name. The text
            is matched literally — `%`, `_` and `\\` are escaped.
        order: 'popular' / 'scene_count' (count descending, then name) or 'name'.
        page: 1-based page number.
        per_page: page size (1-500).
        for_movies: count tag occurrences in movies with a live
            MoviePlaybackSource instead of in scenes.
        only_with_content: drop tags whose count for the selected type is zero.

    Returns:
        TagListOut with the page of tags and the total number of matches.

    Raises:
        HTTPException: 400 when `order` is not one of the accepted values.
    """
    if order not in ("popular", "scene_count", "name"):
        raise HTTPException(status_code=400, detail="order must be 'popular' or 'name'")
    if for_movies:
        # Movie tag count — only movies with >=1 live MoviePlaybackSource are counted.
        # A tag with no such movie ends up with 0 (outer join + coalesce below).
        _movie_live = exists().where(
            and_(
                MoviePlaybackSource.movie_id == MovieTag.movie_id,
                MoviePlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(MovieTag.tag_id, func.count(MovieTag.movie_id).label("c"))
            .where(_movie_live)
            .group_by(MovieTag.tag_id)
            .subquery()
        )
    else:
        # has_live_playback filter — count only scenes the user can actually watch
        # (TPDB/StashDB metadata-only stubs exist for merging, not for viewing).
        _live_playback = exists().where(
            and_(
                PlaybackSource.scene_id == SceneTag.scene_id,
                PlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(SceneTag.tag_id, func.count(SceneTag.scene_id).label("c"))
            .where(_live_playback)
            .group_by(SceneTag.tag_id)
            .subquery()
        )
    base = (
        select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.tag_id == Tag.id)
    )
    if q:
        # `q` is user input: escape the LIKE metacharacters so a search for
        # "100%" or "a_b" matches those characters literally instead of acting
        # as wildcards. The backslash itself is escaped first.
        escaped_q = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Tag.name.ilike(f"%{escaped_q}%", escape="\\"))
    if only_with_content:
        # The outer join keeps tags that have no counted rows; requiring a
        # non-NULL joined tag_id keeps only tags present in the count subquery.
        base = base.where(count_sub.c.tag_id.is_not(None))
    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()
    if order in ("popular", "scene_count"):
        ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc())
    else:
        ordered = base.order_by(Tag.name.asc())
    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()
    items = [
        TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
        for t, c in rows
    ]
    return TagListOut(items=items, total=total, page=page, per_page=per_page)
@router.get("/performers", response_model=PerformerListOut)
def list_performers(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="substring po name_normalized"),
    order: str = Query(default="scene_count", description="scene_count|name"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
) -> PerformerListOut:
    """List performers with their live-playback scene count, paginated.

    Args:
        session: database session (injected).
        q: optional substring filter on `name_normalized` (lower-cased before
            matching). The text is matched literally — `%`, `_` and `\\` are escaped.
        order: 'scene_count' / 'popular' (count descending, then canonical
            name) or 'name'.
        page: 1-based page number.
        per_page: page size (1-500).

    Returns:
        PerformerListOut with the page of performers and the total number of matches.

    Raises:
        HTTPException: 400 when `order` is not one of the accepted values.
    """
    if order not in ("scene_count", "popular", "name"):
        raise HTTPException(status_code=400, detail="order must be 'scene_count' or 'name'")
    # Count only scenes that have at least one playback source not marked dead.
    _perf_live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == ScenePerformer.scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    )
    count_sub = (
        select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id).label("c"))
        .where(_perf_live_playback)
        .group_by(ScenePerformer.performer_id)
        .subquery()
    )
    base = (
        select(Performer, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.performer_id == Performer.id)
    )
    if q:
        # `q` is user input: escape the LIKE metacharacters so `%` and `_` are
        # matched literally rather than as wildcards (backslash escaped first).
        escaped_q = (
            q.lower().replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        base = base.where(
            Performer.name_normalized.ilike(f"%{escaped_q}%", escape="\\")
        )
    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()
    if order in ("scene_count", "popular"):
        ordered = base.order_by(
            func.coalesce(count_sub.c.c, 0).desc(), Performer.canonical_name.asc()
        )
    else:
        ordered = base.order_by(Performer.canonical_name.asc())
    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()
    items = [
        PerformerCount(
            id=p.id,
            canonical_name=p.canonical_name,
            slug=p.slug,
            gender=p.gender.value if p.gender else None,
            scene_count=int(c),
        )
        for p, c in rows
    ]
    return PerformerListOut(items=items, total=total, page=page, per_page=per_page)
@router.get("/studios", response_model=StudioListOut)
def list_studios(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None),
    order: str = Query(default="name", description="name|scene_count"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
    for_movies: bool = Query(
        default=False,
        description="True: zlicza tylko studia mające ≥1 movie z live playback.",
    ),
    only_with_content: bool = Query(
        default=False,
        description="True: ukrywa studia z 0 wystąpieniami w wybranym typie.",
    ),
) -> StudioListOut:
    """List studios with a per-studio usage count, paginated.

    Args:
        session: database session (injected).
        q: optional case-insensitive substring filter on the studio name. The
            text is matched literally — `%`, `_` and `\\` are escaped.
        order: 'name' (by `name_normalized`) or 'scene_count' / 'popular'
            (count descending, then name).
        page: 1-based page number.
        per_page: page size (1-500).
        for_movies: count movies with live playback instead of scenes.
        only_with_content: drop studios whose count for the selected type is zero.

    Returns:
        StudioListOut with the page of studios and the total number of matches.

    Raises:
        HTTPException: 400 when `order` is not one of the accepted values.
    """
    from app.models.scene import Scene  # local import — Scene has an FK to Studio

    if order not in ("name", "scene_count", "popular"):
        raise HTTPException(status_code=400, detail="order must be 'name' or 'scene_count'")
    if for_movies:
        _movie_live = exists().where(
            and_(
                MoviePlaybackSource.movie_id == Movie.id,
                MoviePlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(Movie.studio_id, func.count(Movie.id).label("c"))
            .where(Movie.studio_id.is_not(None))
            .where(_movie_live)
            .group_by(Movie.studio_id)
            .subquery()
        )
    else:
        # Count only scenes that have at least one playback source not marked dead.
        _studio_live_playback = exists().where(
            and_(
                PlaybackSource.scene_id == Scene.id,
                PlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(Scene.studio_id, func.count(Scene.id).label("c"))
            .where(Scene.studio_id.is_not(None))
            .where(_studio_live_playback)
            .group_by(Scene.studio_id)
            .subquery()
        )
    base = (
        select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.studio_id == Studio.id)
    )
    if q:
        # `q` is user input: escape the LIKE metacharacters so `%` and `_` are
        # matched literally rather than as wildcards (backslash escaped first).
        escaped_q = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Studio.name.ilike(f"%{escaped_q}%", escape="\\"))
    if only_with_content:
        # The outer join keeps studios without counted rows; a non-NULL joined
        # studio_id means the studio is present in the count subquery.
        base = base.where(count_sub.c.studio_id.is_not(None))
    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()
    if order in ("scene_count", "popular"):
        ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc())
    else:
        ordered = base.order_by(Studio.name_normalized.asc())
    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()
    items = [
        StudioCount(
            id=s.id,
            name=s.name,
            slug=s.slug,
            network=s.network,
            scene_count=int(c),
        )
        for s, c in rows
    ]
    return StudioListOut(items=items, total=total, page=page, per_page=per_page)
# ---- Performer refresh on-demand --------------------------------------
# Response body of POST /performers/{performer_id}/refresh.
class PerformerRefreshOut(BaseModel):
    performer_id: uuid.UUID
    canonical_name: str
    # Per-source counters as reported by the performer-driven search run.
    counters: dict[str, dict[str, int]]
    # Sum of the "new" counter across all sources.
    new_scenes: int
    # ISO-8601 string of the performer's last_searched_at, or None when unset.
    last_searched_at: str | None
# Response body of POST /performers/{performer_id}/rescrape.
class PerformerRescrapeOut(BaseModel):
    performer_id: uuid.UUID
    canonical_name: str
    # Number of scenes selected for this run (bounded by the per-request scene cap).
    scenes_total: int
    # Number of selected scenes actually visited before the run ended.
    scenes_processed: int
    thumbs_added: int
    # Number of scene-tag rows actually inserted (conflicts are not counted).
    tags_added: int
    failures: int
    # True when the run stopped early because the wall-clock budget was reached.
    capped: bool
    cap_reason: str | None = None
# Hard caps so the request does not hang and nginx (60 s read timeout) does not
# return 504 on partial commits: a wall-clock budget (_RESCRAPE_WALL_SEC) plus at
# most 50 scenes per request. The original note budgeted "45s ... ~12 fetches × 3s";
# NOTE(review): that figure predates the current 55 s value — confirm.
# Larger rescrapes can be triggered repeatedly (idempotent thanks to the
# has_thumb / tag_count checks).
_RESCRAPE_WALL_SEC = 55.0  # nginx read timeout 60 s — 5 s margin for building the response
_RESCRAPE_MAX_SCENES = 50
# Re-fetch tags for scenes with fewer than N tags. Some performers legitimately
# have 1-2 tags (niche); no harm in checking the first time, and repeated calls
# are idempotent because of INSERT ... ON CONFLICT DO NOTHING.
_TAG_RESCRAPE_THRESHOLD = 3
# Mainstream tubes, in priority order for tag extraction — rich metadata.
_TAG_PRIORITY = [
    "xhamstercom", "porntrexcom", "epornercom", "youporncom",
    "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
]
@router.post("/performers/{performer_id}/rescrape", response_model=PerformerRescrapeOut)
def rescrape_performer_scenes(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> PerformerRescrapeOut:
    """Re-scrape thumbnails + tags from tube pages for a performer's scenes (bulk).

    Bug-report 2026-05-16 (6fcaa5f4): per-scene enrich works on demand, but for a
    whole list (e.g. 200 xhamster scenes) the user would have to click each one.

    Capped at `_RESCRAPE_MAX_SCENES` scenes or `_RESCRAPE_WALL_SEC` seconds of
    wall-clock time so the nginx 60 s read timeout does not 504 a partial commit.
    Each scene is committed on its own, so work done before a cap is kept.

    Idempotent: a scene that already has a thumbnail and at least
    `_TAG_RESCRAPE_THRESHOLD` tags triggers no fetches.

    NOTE(review): the scene query has no ORDER BY and does not exclude scenes
    that are already complete, so repeated calls may keep selecting the same
    `_RESCRAPE_MAX_SCENES` scenes — confirm this is acceptable for performers
    with more scenes than the cap.

    Args:
        performer_id: performer whose scenes are re-scraped.
        session: database session (injected).

    Returns:
        PerformerRescrapeOut with the counters for this run.

    Raises:
        HTTPException: 404 when the performer does not exist.
    """
    import logging as _logging
    import time as _time

    import httpx as _httpx

    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS as TAG_EXTRACTORS, extract_tags
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource
    from app.models.scene import Scene, SceneTag
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    # This function logs on every failure path; bind the logger locally so those
    # paths cannot fail with NameError (no module-level `log` is set up next to
    # this module's imports).
    log = _logging.getLogger(__name__)

    perf = session.get(Performer, performer_id)
    if perf is None:
        raise HTTPException(status_code=404, detail="performer not found")

    # 1) ID-only query — scenes with >=1 alive tube playback source.
    scene_ids = session.execute(
        select(Scene.id)
        .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
        .where(ScenePerformer.performer_id == performer_id)
        .where(
            exists().where(
                PlaybackSource.scene_id == Scene.id,
                PlaybackSource.dead_at.is_(None),
                PlaybackSource.origin.like("tube:%"),
            )
        )
        .limit(_RESCRAPE_MAX_SCENES)
    ).scalars().all()
    scenes_total = len(scene_ids)
    if not scene_ids:
        return PerformerRescrapeOut(
            performer_id=performer_id,
            canonical_name=perf.canonical_name,
            scenes_total=0, scenes_processed=0,
            thumbs_added=0, tags_added=0, failures=0,
            capped=False,
        )

    # 2) Batch fetch: all alive tube playback sources for these scenes in one query.
    pb_rows = session.execute(
        select(PlaybackSource)
        .where(PlaybackSource.scene_id.in_(scene_ids))
        .where(PlaybackSource.dead_at.is_(None))
        .where(PlaybackSource.origin.like("tube:%"))
    ).scalars().all()
    sources_by_scene: dict = {}
    for s in pb_rows:
        sources_by_scene.setdefault(s.scene_id, []).append(s)

    # 3) Batch fetch tag counts per scene (one query instead of N).
    tag_counts = dict(session.execute(
        select(SceneTag.scene_id, func.count())
        .where(SceneTag.scene_id.in_(scene_ids))
        .group_by(SceneTag.scene_id)
    ).all())

    thumbs_added = 0
    tags_added = 0
    failures = 0
    scenes_processed = 0
    capped = False
    cap_reason: str | None = None
    started = _time.monotonic()
    # Narrow exception set for the page fetches — only expected network/parse
    # failures are treated as "skip this source"; anything else propagates to
    # the per-scene handler below.
    NET_EXC = (TubePageError, _httpx.HTTPError, OSError, ValueError)

    for scene_id in scene_ids:
        if _time.monotonic() - started > _RESCRAPE_WALL_SEC:
            capped = True
            cap_reason = f"wall-clock {_RESCRAPE_WALL_SEC}s reached"
            break
        sources = sources_by_scene.get(scene_id, [])
        if not sources:
            continue
        scenes_processed += 1
        has_thumb = any(s.thumbnail_url for s in sources)
        existing_tag_count = tag_counts.get(scene_id, 0)
        # SAVEPOINT — failure isolation. A single scene failing (e.g. an FK
        # violation on the SceneTag insert) is rolled back on its own; without
        # the nested rollback every scene after it would hit PendingRollbackError.
        sp = session.begin_nested()
        try:
            if not has_thumb:
                thumb_added_here = False
                for src in sources:
                    try:
                        r = browser_get(src.page_url, timeout=10.0, follow_redirects=True)
                    except NET_EXC as e:
                        log.debug("rescrape thumb fetch fail %s: %s", src.page_url, e)
                        continue
                    if r.status_code >= 400:
                        continue
                    thumb = extract_thumbnail_url(r.text)
                    if thumb:
                        # Update only the source the thumbnail came from (single
                        # playback row). Applying it to all siblings caused
                        # wrong-CDN cross-attribution (e.g. an xhamster thumb on
                        # a porntrex entry). Per the original note the UI takes
                        # the first source that has a thumb, so one is enough.
                        session.execute(
                            PlaybackSource.__table__.update()
                            .where(PlaybackSource.id == src.id)
                            .where(PlaybackSource.thumbnail_url.is_(None))
                            .values(thumbnail_url=thumb)
                        )
                        thumbs_added += 1
                        thumb_added_here = True
                        break
                if not thumb_added_here:
                    failures += 1

            if existing_tag_count < _TAG_RESCRAPE_THRESHOLD:
                # Pick the source to read tags from: first by site priority ...
                chosen = None
                for site in _TAG_PRIORITY:
                    for src in sources:
                        if src.origin == f"tube:{site}":
                            chosen = src
                            break
                    if chosen:
                        break
                # ... otherwise any source whose site has a tag extractor.
                if chosen is None:
                    for src in sources:
                        sitetag_part = src.origin.split(":", 1)[1]
                        if sitetag_part in TAG_EXTRACTORS:
                            chosen = src
                            break
                if chosen is not None:
                    sitetag_part = chosen.origin.split(":", 1)[1]
                    try:
                        r = browser_get(chosen.page_url, timeout=10.0, follow_redirects=True)
                        if r.status_code < 400:
                            tag_names = extract_tags(sitetag_part, r.text)
                        else:
                            tag_names = []
                    except NET_EXC as e:
                        log.debug("rescrape tags fetch fail %s: %s", chosen.page_url, e)
                        tag_names = []
                    # Several scraped names can resolve to the same Tag; insert each once.
                    seen_tag_ids: set = set()
                    for name in tag_names:
                        norm = NormalizedTag(name=name, slug=slugify(name), external_id=None)
                        tag = resolve_tag(session, norm=norm)
                        if tag is None or tag.id in seen_tag_ids:
                            continue
                        seen_tag_ids.add(tag.id)
                        stmt = (
                            pg_insert(SceneTag.__table__)
                            .values(scene_id=scene_id, tag_id=tag.id)
                            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
                        )
                        result = session.execute(stmt)
                        if result.rowcount:
                            tags_added += 1
            # Release the savepoint, then commit so this scene's work survives
            # a later cap or failure.
            sp.commit()
            session.commit()
        except Exception as e:
            sp.rollback()
            log.warning("rescrape scene %s failed: %s", scene_id, e)
            failures += 1

    return PerformerRescrapeOut(
        performer_id=performer_id,
        canonical_name=perf.canonical_name,
        scenes_total=scenes_total,
        scenes_processed=scenes_processed,
        thumbs_added=thumbs_added,
        tags_added=tags_added,
        failures=failures,
        capped=capped,
        cap_reason=cap_reason,
    )
@router.post("/performers/{performer_id}/refresh", response_model=PerformerRefreshOut)
def refresh_performer(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> PerformerRefreshOut:
    """Run an on-demand search across all tubes for a single performer.

    Synchronous — the request blocks until the search finishes; mobile shows a
    spinner.

    Rate guard: when the performer was searched less than 60 s ago the request
    is rejected with HTTP 429 and a `Retry-After` header giving the remaining
    seconds. Per the original note, the continuous background worker also
    refreshes performers, so hitting the guard is common.

    Args:
        performer_id: performer to refresh.
        session: database session (injected).

    Returns:
        PerformerRefreshOut with per-source counters and the number of new scenes.

    Raises:
        HTTPException: 404 when the performer does not exist, 429 when it was
            searched less than 60 s ago.
    """
    from datetime import UTC as _UTC, datetime as _dt, timedelta as _td

    perf = session.get(Performer, performer_id)
    if perf is None:
        raise HTTPException(status_code=404, detail="performer not found")
    if perf.last_searched_at is not None:
        elapsed = _dt.now(_UTC) - perf.last_searched_at
        if elapsed < _td(seconds=60):
            elapsed_sec = int(elapsed.total_seconds())
            raise HTTPException(
                status_code=429,
                detail=f"recently searched {elapsed_sec}s ago, try in a bit",
                # Tell the client exactly how long to back off instead of
                # leaving it to guess; never advertise less than 1 s.
                headers={"Retry-After": str(max(1, 60 - elapsed_sec))},
            )
    # Lazy import — performer_driven pulls in a heavy connector tree.
    from app.scheduler.performer_driven import run_performer_driven

    # NOTE: this call blocks the API request thread for 30-90 s (search across
    # ~25 tubes). Acceptable for a self-hosted single-user setup; add a task
    # queue if that stops being true.
    counters_obj = run_performer_driven(
        performer_ids=[performer_id],
        top_n=0,
        per_performer_limit=200,
    )
    # Update last_searched_at + run counter (same as the continuous worker does).
    perf.last_searched_at = _dt.now(_UTC)
    perf.search_run_count = (perf.search_run_count or 0) + 1
    session.commit()
    new_total = sum(s.get("new", 0) for s in counters_obj.per_source.values())
    return PerformerRefreshOut(
        performer_id=performer_id,
        canonical_name=perf.canonical_name,
        counters=counters_obj.per_source,
        new_scenes=new_total,
        last_searched_at=perf.last_searched_at.isoformat() if perf.last_searched_at else None,
    )

159
app/api/watch.py Normal file
View file

@ -0,0 +1,159 @@
"""Watch history + continue watching.
Single-user. The mobile app pings POST /scenes/{id}/progress on:
- Tapping Watch (position_sec=0) — pulls the scene into the recently-watched list
- Returning from MX with ACTION_RESULT (when EXTRA_RETURN_RESULT is enabled), with the actual position
"Continue watching" rail on home: GET /watch/recent?limit=10 — top scenes by last_played_at;
finished scenes (>=95% watched or the finished flag) are filtered out. Mobile shows a progress bar
(position_sec / duration_sec).
"""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.api.scenes import _build_scene_out
from app.api.schemas import SceneOut
from app.auth import require_api_key
from app.db import get_session
from app.models.play_progress import ScenePlayProgress
from app.models.scene import Scene
router = APIRouter(tags=["watch"], dependencies=[Depends(require_api_key)])
# Request body of POST /scenes/{scene_id}/progress.
class ProgressIn(BaseModel):
    # Playback position in seconds; the endpoint clamps negative values to 0.
    position_sec: int = 0
    # Total duration in seconds; when omitted the stored value is kept.
    duration_sec: int | None = None
    # Explicit "finished" flag; the endpoint also derives it at >= 95% of duration.
    finished: bool = False
# Response body of POST /scenes/{scene_id}/progress — the stored progress row.
class ProgressOut(BaseModel):
    scene_id: uuid.UUID
    position_sec: int
    duration_sec: int | None
    finished: bool
    last_played_at: datetime
@router.post("/scenes/{scene_id}/progress", response_model=ProgressOut)
def upsert_progress(
    scene_id: uuid.UUID,
    body: ProgressIn,
    session: Annotated[Session, Depends(get_session)],
) -> ProgressOut:
    """Create or update the play-progress row for a scene.

    Args:
        scene_id: scene the progress belongs to.
        body: reported position / duration / finished flag.
        session: database session (injected).

    Returns:
        ProgressOut reflecting the row as stored by the upsert.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")
    # PostgreSQL upsert — removes the race when mobile sends progress in parallel
    # (e.g. two player instances, or auto-save + manual save). The earlier
    # `get → add → commit` raised IntegrityError(pk_scene_play_progress) on
    # concurrent writes.
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    now = datetime.now(UTC)
    position_sec = max(0, body.position_sec)
    # Finished when flagged explicitly, or when the position reaches 95% of a
    # known, positive duration.
    finished = body.finished or (
        bool(body.duration_sec)
        and body.duration_sec > 0
        and position_sec >= int(body.duration_sec * 0.95)
    )
    stmt = (
        pg_insert(ScenePlayProgress)
        .values(
            scene_id=scene_id,
            position_sec=position_sec,
            duration_sec=body.duration_sec,
            finished=finished,
            last_played_at=now,
        )
        .on_conflict_do_update(
            index_elements=["scene_id"],
            set_={
                "position_sec": position_sec,
                # duration_sec: keep the existing value when the body omits it.
                "duration_sec": (
                    body.duration_sec
                    if body.duration_sec is not None
                    else ScenePlayProgress.duration_sec
                ),
                "finished": finished,
                "last_played_at": now,
            },
        )
        # Read the stored values back from the same statement. A separate
        # re-read after commit could find the row already removed by a
        # concurrent DELETE, and the `assert` that guarded it disappears
        # under `python -O`.
        .returning(
            ScenePlayProgress.position_sec,
            ScenePlayProgress.duration_sec,
            ScenePlayProgress.finished,
            ScenePlayProgress.last_played_at,
        )
    )
    # Unpack positionally so the result does not depend on how result columns
    # are keyed; `.one()` consumes the row before the commit.
    stored_position, stored_duration, stored_finished, stored_played_at = (
        session.execute(stmt).one()
    )
    session.commit()
    return ProgressOut(
        scene_id=scene_id,
        position_sec=stored_position,
        duration_sec=stored_duration,
        finished=stored_finished,
        last_played_at=stored_played_at,
    )
@router.delete(
    "/scenes/{scene_id}/progress",
    status_code=status.HTTP_204_NO_CONTENT,
)
def remove_progress(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Delete the stored play progress for a scene; a missing row is a no-op."""
    progress = session.get(ScenePlayProgress, scene_id)
    if progress is not None:
        session.delete(progress)
        session.commit()
# One item of the "continue watching" list: the scene plus its stored progress.
class WatchEntry(BaseModel):
    scene: SceneOut
    position_sec: int
    duration_sec: int | None
    finished: bool
    last_played_at: datetime
# Response body of GET /watch/recent.
class WatchListOut(BaseModel):
    items: list[WatchEntry]
@router.get("/watch/recent", response_model=WatchListOut)
def list_recent(
    session: Annotated[Session, Depends(get_session)],
    limit: int = Query(default=10, ge=1, le=50),
    include_finished: bool = Query(default=False),
) -> WatchListOut:
    """Return the most recently played scenes, newest first.

    At most `limit` entries are returned. Finished scenes are left out unless
    `include_finished` is set, since a "continue watching" rail should not show
    scenes that were already watched to the end.
    """
    query = select(ScenePlayProgress, Scene).join(
        Scene, Scene.id == ScenePlayProgress.scene_id
    )
    if not include_finished:
        query = query.where(ScenePlayProgress.finished.is_(False))
    query = query.order_by(desc(ScenePlayProgress.last_played_at)).limit(limit)

    entries = [
        WatchEntry(
            scene=_build_scene_out(session, scene_row),
            position_sec=progress.position_sec,
            duration_sec=progress.duration_sec,
            finished=progress.finished,
            last_played_at=progress.last_played_at,
        )
        for progress, scene_row in session.execute(query).all()
    ]
    return WatchListOut(items=entries)

46
app/auth.py Normal file
View file

@ -0,0 +1,46 @@
"""API key authentication.
The key is accepted from the `X-API-Key` header or from `Authorization: Bearer <key>`.
When `settings.api_keys` is empty, auth is disabled (dev mode).
Additionally (anti-tamper): when `ALLOWED_APP_SIG_HASH` is set, every request
must include `X-App-Signature` with the SHA256 (hex) of the APK signing certificate. A mismatch returns 403.
Re-packaging the APK with a different keystore (e.g. debug instead of release) is detected immediately.
"""
from __future__ import annotations
from fastapi import Header, HTTPException, status
from app.config import get_settings
def require_api_key(
    x_api_key: str | None = Header(default=None, alias="X-API-Key"),
    authorization: str | None = Header(default=None),
    x_app_signature: str | None = Header(default=None, alias="X-App-Signature"),
) -> None:
    """FastAPI dependency enforcing the app-signature and API-key checks.

    Args:
        x_api_key: value of the `X-API-Key` header, if any.
        authorization: value of the `Authorization` header; only the
            `Bearer <key>` form is considered, and only when `X-API-Key` is absent.
        x_app_signature: value of the `X-App-Signature` header, if any.

    Raises:
        HTTPException: 403 when the signature check is enabled and the signature
            is missing or not whitelisted; 401 when auth is enabled and the API
            key is missing or unknown.
    """
    import hmac

    settings = get_settings()
    if settings.app_sig_check_enabled:
        # Normalise the same way as the configured whitelist: lower-case hex
        # without colon separators.
        sig = (x_app_signature or "").strip().lower().replace(":", "")
        if not sig or sig not in settings.allowed_app_sig_hashes:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="invalid or missing app signature",
            )
    if not settings.auth_enabled:
        return  # local/dev — everything is open
    candidate: str | None = None
    if x_api_key:
        candidate = x_api_key.strip()
    elif authorization and authorization.lower().startswith("bearer "):
        candidate = authorization[7:].strip()

    # Compare the secret in constant time. Every configured key is checked
    # (no early exit), and bytes are compared because `compare_digest`
    # rejects non-ASCII `str` arguments.
    candidate_bytes = (candidate or "").encode("utf-8")
    authorized = False
    for key in settings.api_keys:
        if hmac.compare_digest(candidate_bytes, key.encode("utf-8")):
            authorized = True
    if not candidate or not authorized:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="invalid or missing API key",
            headers={"WWW-Authenticate": "Bearer"},
        )

116
app/config.py Normal file
View file

@ -0,0 +1,116 @@
from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)
database_url: str = Field(
default="postgresql+psycopg://goon:goon@localhost:5432/goon",
validation_alias="DATABASE_URL",
)
tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
tpdb_base_url: str = Field(
default="https://api.theporndb.net", validation_alias="TPDB_BASE_URL"
)
stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
stashdb_graphql_url: str = Field(
default="https://stashdb.org/graphql", validation_alias="STASHDB_GRAPHQL_URL"
)
log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")
# Sentry observability — pusty DSN = init no-op (devel/local). Cloud free tier
# 5k errors/mies wystarczy dla 1-user app.
sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
sentry_traces_sample_rate: float = Field(
default=0.1, validation_alias="SENTRY_TRACES_SAMPLE_RATE"
)
api_keys_raw: str = Field(default="", validation_alias="API_KEYS")
"""Lista API keys oddzielona przecinkami. Pusta = auth wyłączony (tylko dev/local)."""
allowed_app_sig_hashes_raw: str = Field(default="", validation_alias="ALLOWED_APP_SIG_HASH")
"""Whitelist SHA256 (hex) podpisów APK akceptowane przez backend. Każdy request mobile
wysyła `X-App-Signature` z hashem signing certu (PackageManager.GET_SIGNING_CERTIFICATES).
Pusta = check wyłączony (dev/wstępny rollout). Lista = comma-separated lowercase hex.
Re-packaging APK innym keystorem zmienia hash 403."""
auto_merge_threshold: float = 0.92
review_threshold: float = 0.75
fingerprint_hamming_max: int = 5
title_token_set_min: int = 88
date_window_days: int = 7
# APScheduler (M5). Każdy 0/None = job wyłączony.
sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
sched_performer_driven_hours: int = Field(
default=12, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS"
)
sched_performer_driven_top_n: int = Field(
default=20, validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N"
)
# Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
# = max(15, real_tick_duration). Real tick ~50-80s przy full coverage. Set to 0 to disable.
sched_performer_continuous_seconds: int = Field(
default=15, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS"
)
sched_performer_continuous_refresh_days: int = Field(
default=30, validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS"
)
# Movie ingest — paradisehill (primary) + dooplay mirrory (mangoporn/streamporn/
# pandamovies). Każdy connector zapisuje swój `Source` i robi delta od ostatniego
# successful run. Set to 0 to disable. Domyślnie 24h: movie sites rosną wolniej
# niż tube'y (~5-30 nowych dziennie), nie ma sensu wymiatać częściej.
sched_movie_ingest_hours: int = Field(
default=24, validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS"
)
# Browse-latest scheduler: freshporno/porn00/pornxp newest scenes raz dziennie.
sched_browse_latest_hours: int = Field(
default=24, validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS"
)
sched_browse_latest_max_pages: int = Field(
default=5, validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES"
)
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).
# Free traffic per server: CX22=20TB, CPX21=20TB itd. Overage = €1/TB.
hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
# Alert thresholds (% of included_traffic) — Sentry severity levels.
hetzner_alert_info_pct: int = Field(default=50, validation_alias="HETZNER_ALERT_INFO_PCT")
hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")
@property
def api_keys(self) -> set[str]:
    """Set of API keys parsed from the comma-separated ``api_keys_raw`` setting.

    Whitespace around each entry is trimmed and empty entries are dropped.
    """
    parsed: set[str] = set()
    for chunk in self.api_keys_raw.split(","):
        cleaned = chunk.strip()
        if cleaned:
            parsed.add(cleaned)
    return parsed
@property
def auth_enabled(self) -> bool:
    """True when at least one API key is configured."""
    return len(self.api_keys) > 0
@property
def allowed_app_sig_hashes(self) -> set[str]:
    """Normalised set of allowed app-signature hashes.

    Parses the comma-separated ``allowed_app_sig_hashes_raw`` setting. Each
    entry is trimmed, lower-cased and stripped of ``:`` separators, so
    ``AB:CD`` and ``abcd`` normalise to the same value.

    Entries that are empty *after* normalisation (blank items, or items made
    only of colons/whitespace such as a stray ``:``) are dropped. Without
    that, the set would contain ``""`` and ``app_sig_check_enabled`` (which
    is ``bool()`` of this set) would report the check as configured even
    though no usable hash was supplied.
    """
    normalised = (
        item.strip().lower().replace(":", "").strip()
        for item in self.allowed_app_sig_hashes_raw.split(",")
    )
    return {sig for sig in normalised if sig}
@property
def app_sig_check_enabled(self) -> bool:
    """True when at least one allowed app-signature hash is configured."""
    return len(self.allowed_app_sig_hashes) > 0
# Cached settings accessor: `lru_cache` on a zero-argument function means
# `Settings()` is constructed on the first call and the same instance is
# returned by every later call.
@lru_cache
def get_settings() -> Settings:
return Settings()

View file

@ -0,0 +1,48 @@
"""Connector registry helpers.
Lazy factories — the connector imports only run inside `get_movie_connectors()`
to avoid circular imports (models/db). Each entry is `(name, class)` in ingest
order (primary FIRST, mirrors afterwards — `resolve_movie` then has something to
attach the mirror playback sources to).
## How to add a new movie site
1. Write a `DooplayConnector` subclass in `app/connectors/dooplay.py` (if the site uses
the dooplay/PsyPlay WP theme) — `name` + `base_url` is enough. For any other theme,
write a separate connector implementing `BaseMovieConnector.fetch_movies()`.
2. Add an entry to the list returned by `get_movie_connectors()` below.
3. The backend job `_job_movie_ingest` in `app/scheduler/jobs.py` picks up the
new connector automatically on its next tick (24h by default).
4. For an ad-hoc backfill: `python -m app.scheduler.worker --once --strategy=movies
--performers=<new_name>`.
## Why paradisehill goes first
Paradisehill is the only source with chapter markers and full metadata (director,
rating, country) — an ideal canonical record. Dooplay mirrors rarely have chapters and
their release_year is usually empty. The `resolve_movie` resolver matches a
mirror → the primary paradisehill record by title similarity, adding only playback
sources (mangoporn:luluvid, :voe, …) which unpack to a direct stream URL via
`extract_stream_from_hoster`.
"""
from __future__ import annotations
def get_movie_connectors() -> list[tuple[str, type]]:
    """Return the movie connectors as ``(name, ConnectorCls)`` pairs in ingest order.

    The connector modules are imported here, at call time, rather than at
    module import time, to avoid a circular import (the connectors pull in
    db/models). The primary source (paradisehill) comes first, mirrors after.
    """
    from app.connectors.dooplay import (
        MangopornConnector,
        PandamoviesConnector,
        StreampornConnector,
    )
    from app.connectors.paradisehill import ParadisehillConnector

    ordered: list[tuple[str, type]] = []
    ordered.append(("paradisehill", ParadisehillConnector))
    ordered.append(("streamporn", StreampornConnector))
    ordered.append(("pandamovies", PandamoviesConnector))
    ordered.append(("mangoporn", MangopornConnector))
    return ordered

187
app/connectors/base.py Normal file
View file

@ -0,0 +1,187 @@
"""Kontrakt connectora źródła + neutralne DTO surowych rekordów.
Connector odpowiada za: paginację, retry, autoryzację, deltę. Zwraca strumień RawScene
(z ewentualnymi pre-rozwiniętymi performerami/studiem/tagami w polach inline). Cała
mechanika DB i normalizacji żyje wyżej w pipeline'ie ingest.
"""
from __future__ import annotations
import abc
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
from app.models.source import SourceKind
# Raw tag as reported by a source connector. Only `name` is required.
class RawTag(BaseModel):
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
external_id: str | None = None
name: str
slug: str | None = None
# Raw studio as reported by a source connector. Only `name` is required; the
# parent/network fields are optional and default to None.
class RawStudio(BaseModel):
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
external_id: str | None = None
name: str
slug: str | None = None
parent_external_id: str | None = None
parent_name: str | None = None
network: str | None = None
homepage_url: str | None = None
# Raw performer as reported by a source connector. Only `name` is required.
class RawPerformer(BaseModel):
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
external_id: str | None = None
name: str
# default_factory gives every instance its own empty list.
aliases: list[str] = Field(default_factory=list)
gender: str | None = None
birth_date: date | None = None
country: str | None = None
as_alias_in_scene: str | None = None # name used in this particular scene (e.g. "Mia M.")
# A single content fingerprint: `kind` names the algorithm, `value` holds the hash.
class RawFingerprint(BaseModel):
kind: str # phash | oshash | md5
value: str
class RawPlaybackSource(BaseModel):
"""A link for playing a scene back from a specific tube/aggregator."""
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
# Short source name, e.g. 'tube:hqpornercom', 'mangoporn:doodstream'.
origin: str
"""Krótka nazwa źródła, np. 'tube:hqpornercom', 'mangoporn:doodstream'."""
# URL of the tube page that hosts the player (deep link).
page_url: str
"""URL strony tube'a z player'em (deep link)."""
# Everything below is optional and defaults to None.
embed_url: str | None = None
stream_url: str | None = None
quality: str | None = None
duration_sec: int | None = None
thumbnail_url: str | None = None
animated_thumbnail_url: str | None = None
# Source-neutral raw scene record yielded by connectors. `external_id` and
# `title` are required; everything else is optional or defaults to empty.
class RawScene(BaseModel):
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
external_id: str
title: str
description: str | None = None
release_date: date | None = None
duration_sec: int | None = None
code: str | None = None
director: str | None = None
url: str | None = None
studio: RawStudio | None = None
# Collection fields use default_factory so each instance gets its own container.
performers: list[RawPerformer] = Field(default_factory=list)
tags: list[RawTag] = Field(default_factory=list)
fingerprints: list[RawFingerprint] = Field(default_factory=list)
playback_sources: list[RawPlaybackSource] = Field(default_factory=list)
# Mapping source_name → external_id declared by this source. Per the original
# note it is used for "path 2" in the resolver (cross-source UUID match) and the
# key matches `Source.name` in the DB (e.g. 'tpdb', 'stashdb').
cross_source_refs: dict[str, str] = Field(default_factory=dict)
"""Mapowanie source_name → external_id deklarowane przez to źródło. Używane do path 2
w resolverze (cross-source UUID match). Klucz zgadza się z `Source.name` w DB
(np. 'tpdb', 'stashdb')."""
# Original API payload — per the original note it is stored in external_records.raw.
raw: dict[str, Any] = Field(default_factory=dict)
"""Oryginalny payload z API — leci do external_records.raw."""
class BaseConnector(abc.ABC):
"""Base class every scene source inherits from. `kind` maps 1:1 to SourceKind in the DB."""
# Both attributes are only annotated here; concrete connectors must set them.
kind: SourceKind
name: str
@abc.abstractmethod
def fetch_scenes(
self,
*,
since: datetime | None = None,
limit: int | None = None,
) -> Iterator[RawScene]:
"""Yield one scene at a time. `since` is an optional delta filter (falls back to a full fetch)."""
raise NotImplementedError
# ---------------------------------------------------------------------------
# Movies — odrębny encja od scen, ale ten sam wzorzec connectorów
# ---------------------------------------------------------------------------
class RawMovieChapter(BaseModel):
"""A single movie chapter (movies are sometimes split into "Part 1/2/3" etc.).
Chapter identifiers are not canonicalised across sources — they are local to the movie
and indexed by `chapter_index`. A chapter may link to a separate scene (if that scene is
known from TPDB/StashDB) — that is handled by the normaliser further up the pipeline."""
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
chapter_index: int
title: str | None = None
start_sec: int | None = None
end_sec: int | None = None
class RawMovie(BaseModel):
"""Raw movie from a connector — the RawScene counterpart for movies.
Performers / studio / tags reuse RawPerformer / RawStudio / RawTag (the same
types in both pipelines). Playback sources are the list of playback mirrors
(paradisehill as primary, possibly other tubes).
"""
# extra="allow": fields not declared below are accepted and kept on the model.
model_config = ConfigDict(extra="allow")
external_id: str
title: str
description: str | None = None
release_year: int | None = None
release_date: date | None = None
duration_sec: int | None = None
director: str | None = None
country: str | None = None
rating: float | None = None
poster_url: str | None = None
backdrop_url: str | None = None
url: str | None = None
studio: RawStudio | None = None
# Collection fields use default_factory so each instance gets its own container.
performers: list[RawPerformer] = Field(default_factory=list)
tags: list[RawTag] = Field(default_factory=list)
chapters: list[RawMovieChapter] = Field(default_factory=list)
playback_sources: list[RawPlaybackSource] = Field(default_factory=list)
cross_source_refs: dict[str, str] = Field(default_factory=dict)
raw: dict[str, Any] = Field(default_factory=dict)
class BaseMovieConnector(abc.ABC):
"""Connector for a movie source (paradisehill, psyplay, wp_movies).
Symmetric to BaseConnector but yields RawMovie. Each source knows its own
pagination and ID format — the converter above (resolver) handles dedup across sources.
"""
# Both attributes are only annotated here; concrete connectors must set them.
kind: SourceKind
name: str
@abc.abstractmethod
def fetch_movies(
self,
*,
since: datetime | None = None,
limit: int | None = None,
) -> Iterator[RawMovie]:
"""Yield one movie at a time. `since` is optional; falls back to a full crawl."""
raise NotImplementedError

View file

@ -0,0 +1,166 @@
"""Direct tube scrapers.
Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit
budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej
`Source(name='pornapp')` (legacy nazwa kept for DB compat) z external_id
`f"{sitetag}:{url}"`. Resolver mergeuje idempotentnie po tym kluczu.
Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`
overrides w pornapp connector był specyficzny dla porn-app API i zostanie usunięty.
UWAGA speculative scrapers: większość aggregator + special tubes (xmoviesforyou,
watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak,
porndittcom, perverzija, fpoxxx, ...) ma URL templates + regex'y oparte na typowych
WordPress conventions. Wymagają post-deploy verification gdy któryś nie zwraca
wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku.
"""
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.connectors.direct_scrapers.eporner import EpornerScraper
from app.connectors.direct_scrapers.fpoxxx import FpoxxxScraper
from app.connectors.direct_scrapers.hdporn92 import HDPorn92Scraper # noqa: F401 — kept for backref; disabled
from app.connectors.direct_scrapers.hqporner import HQPornerScraper
from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper
from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
from app.connectors.direct_scrapers.pornditt import PornDittScraper
from app.connectors.direct_scrapers.porndish import PornDishScraper
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
from app.connectors.direct_scrapers.pornhub import PornHubScraper
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
from app.connectors.direct_scrapers.redtube import RedTubeScraper
from app.connectors.direct_scrapers.siska import SiskaScraper
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
from app.connectors.direct_scrapers.watchporn import WatchPornScraper
from app.connectors.direct_scrapers.xhamster import XHamsterScraper
from app.connectors.direct_scrapers.xmoviesforyou import XMoviesForYouScraper
from app.connectors.direct_scrapers.xnxx import XnxxScraper
from app.connectors.direct_scrapers.xvideos import XVideosScraper
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
from app.connectors.direct_scrapers.youporn import YouPornScraper
from app.connectors.direct_scrapers.zerodayxx import ZeroDayXXScraper
# Search-mode scrapers that are active for ingest. Disabled scrapers are kept as
# commented-out notes (with the reason and date) rather than as list entries.
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# Existing 4 (verified, in production)
HQPornerScraper,
# HDPorn92Scraper — disabled 2026-05-18. Scene pages are an SEO shell: ZERO player iframes
# (only happyleafmotion ads), JS hijacks every click → `go.rmishe.com/smartpop/...`
# popunder redirect. Mobile WebView page-as-hoster shows the ad redirect instead of video.
# 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
SxyLandScraper,
# ZeroDayXXScraper — disabled 2026-05-12 (source quality report): 25,596 scenes, 0.1% canonical
# match. Slug-concatenated titles (`bella reese big butt ready to be filled with cum analized`)
# without a `[Studio]` or `Studio - Perf - Title` prefix (parse rate 3%) → the resolver has no
# signal to match on. Wraps watchporn but inherits stripped metadata. Solo orphans removed
# (~21k scenes) — the scraper file + extractor stay (existing playback_sources still
# resolve).
# Mainstream (URL templates well-known)
# PornHubScraper — disabled 2026-05-12 (source analysis): 23,750 scenes scraped,
# only 105 (0.4%) matched TPDB/StashDB. PH mostly hosts its own shortened
# clips + amateur uploads — they never match studio canonical content. The file
# stays (the `pornhubcom` extractor uses it in playback resolve for existing
# playback_sources).
# RedTubeScraper — disabled 2026-05-12 (source analysis): 20,127 scenes, 82 matches
# (0.4%). Same reasons as PH (shortened clips + amateur uploads).
XVideosScraper,
XnxxScraper,
XHamsterScraper,
YouPornScraper,
PornTrexScraper,
EpornerScraper,
# Aggregators (WordPress-like ?s= search; speculative — verify post-deploy)
# XMoviesForYouScraper — disabled 2026-05-12 (post audit fix). 100% of scenes are served by
# streamtape (DEAD_HOSTER_RE — malware drive-by .reg) + optionally playmogo/mixdrop.
# Mixdrop rebranded to m1xdrop.bz, yt-dlp is out of date, packer/JS extract = fail.
# Playmogo = DoodStream CAPTCHA. Porn-app itself ignores xmoviesforyou (no handler in
# jadx). 1,321 solo-orphan scenes.
# WatchPornScraper — disabled 2026-05-12 (user bug report). All iframes are
# DoodStream variants (playmogo/d0000d/dooood/mivalyo) behind a CAPTCHA gate. WebView on
# mobile = black screen (player JS does not initialise through Turnstile). 16% of
# scenes are solo (no backup tube), 84% multi-source — the user can use another tube. yt-dlp
# does not support DoodStream ("Piracy"); a custom resolver is TBD if worthwhile.
# SiskaScraper — disabled 2026-05-16 (filemoon shutdown). Every siska scene
# embeds a filemoon iframe; since ~2026-05 filemoon.to/sx/nl serve a placeholder
# "Byse Frontend" SPA without player JS. 14,839 playback_sources mass-marked dead.
# The scraper file + extractor stay (mobile tries to resolve → DEAD_HOSTER_RE
# filemoon blacklist → None → 503 — fine, those scenes are also dead_at-filtered).
# SiskaScraper,
# Porn4DaysScraper — disabled 2026-05-12 (post audit fix). 100% of scenes are streamtape
# only (DEAD_HOSTER_RE blacklist - malware drive-by .reg downloads). SERVER1_URL =
# streamtape, no SERVER2/SERVER3 backup. Porn-app itself ignores porn4days. 10,346
# solo-orphan scenes.
PornDishScraper,
# XxxFreeWatchScraper — disabled 2026-05-18. 790 scenes, 0% canonical match, 100% solo-orphan.
# Cloudflare 403 from the VPS IP; mobile WebView works in theory but 0/790 scenes had any
# match to TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
LatestPornVideoScraper,
# LatestLeaksScraper — disabled 2026-05-12 (source quality report): 16,438 scenes, 0.0%
# canonical match. Slug-concatenated titles, no studio/duration/date signals. Solo orphans
# removed (~15k scenes).
MyPornerLeakScraper,
# Added 2026-05-12 (theporndude survey): one of the 14 free tubes on the list that
# returns consistent search results. KVS engine, slug-aware scene URLs. Mostly
# orphan ingest (auto-screenshots, no canonical phash match — checked), but it
# may catch scenes of popular performers that we do not yet have in TPDB.
# NOTE(review): the note above no longer precedes an active list entry; it appears to
# describe PornHatScraper (disabled directly below) — confirm and merge or remove.
# PornHatScraper — disabled 2026-05-18. 9,799 scenes, 0.2% canonical match, 100% solo-orphan.
# Pure orphan factory — auto-screenshot thumbs do not phash-match canonical, slug titles
# do not match via rapidfuzz, no duration/date signals. KEEP the `pornhatcom` extractor and the
# existing playback_sources alive — mobile can play them; only future ingest is disabled.
# PornDittScraper — disabled 2026-05-12 (bug report 64356e9b). Every link
# produced a new Scene row instead of matching an existing canonical one
# (TPDB/StashDB) because pornditt has a weak signal: title + sometimes a performer, no
# fingerprint/duration/date → composite_score always below the auto_merge
# threshold (0.92). The scraper file + extractor stay (existing playback_sources
# still resolve; _REGISTRY in app/extractors/__init__.py dispatches
# `porndittcom` → _embed_iframe.extract). Re-enabling requires either an
# "alternative-source mode" in the resolver (match-only, never create new),
# or richer metadata extraction (duration + fingerprint).
# Special
SxyPrnScraper,
PerverzijaScraper,
FpoxxxScraper,
]
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
# raz dziennie, pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
# `app/scheduler/browse_latest.py`.
#
# **Pilot results (2026-05-12):**
# - ShyfapScraper: 0/23 match (0%) — robi własne thumbnails ≠ canonical
# (phash Hamming 12-16). Plus rebranduje tytuły. **Wyłączony.**
# - FreshpornoScraper: 39/59 match (66%) — hot-linkuje studio thumbnaile
# (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.**
from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402
from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
# Browse-mode scrapers that are active for ingest.
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
FreshpornoScraper,
# PornXPScraper — pilot 2026-05-17 (20 scenes): studio 100%, performer 95%,
# release_date 100%, duration 100%, stream_url 100%, phash 100%. Best
# signals among the browse-mode scrapers. Direct mp4 stream (sv.porn-xp.com),
# 360/720 quality. Release year from `Released: <year>` on the detail page.
PornXPScraper,
# Porn00Scraper — pilot 2026-05-17 (16 scenes): no studio (0%) + no release
# date (0%) BUT performer 100%, duration 100%, stream_url 100% (KVS video_alt_url
# 720p). The title keeps the studio prefix ("Studio Title - Scene Name") → title
# fuzzy match (rapidfuzz token_set_ratio) may catch the canonical scene. Monitor.
Porn00Scraper,
# ShyfapScraper — disabled 2026-05-12 (pilot fail, 0% match — orphan factory).
# Follow-up: add these tubes and check their phash distance:
# - fullmovies.xxx (channel/network/pornstars/categories, no duration)
# - 4k69.com + hdporn.gg (freshporno clones — probably the same phash hit rate)
]
# Public names exported by this package.
__all__ = [
"BaseDirectTubeScraper",
"BaseBrowseScraper",
"ALL_DIRECT_SCRAPERS",
"ALL_BROWSE_SCRAPERS",
]

View file

@ -0,0 +1,195 @@
"""BaseBrowseScraper — latest-vids browse mode (vs search-by-performer).
Wzorzec: tube'y typu shyfap/freshporno/porn00/fullmovies/pornxp mają bogatą
metadata (title, studio, performers, tags, duration, release_date, description)
na detail page'u — wystarczy do canonical fuzzy match w resolverze. Browse mode
iteruje "latest" page (sorted by upload date) i fetchuje detail per scene.
Różnica vs `BaseSearchScraper`:
- **search**: tube wyszukuje sceny po performer name (dla performer-driven
backfill). Wymaga znanego performera.
- **browse**: tube listuje newest scenes (latest-vids endpoint). Nie wymaga
żadnego query chodzi o świeże sceny independent of performer state.
Browse is complementary to search:
- search catches scenes for **known performers** (TPDB/StashDB → tube)
- browse catches **fresh scenes** whose performer may be new to us
(a newcomer not yet in TPDB → we get the scene from browse → a later
canonical TPDB ingest merges it)
Subclass dostarcza HTML parsing (listing scene URLs + detail RawScene).
"""
from __future__ import annotations
import abc
import io
import logging
import re
from collections.abc import Iterator
import httpx
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
    """Browse-mode scraper base; a subclass supplies the listing/detail parsing.

    Flow implemented by `latest_scenes()`:
      1. for each page 1..max_pages, fetch `_listing_url(page)`
      2. extract the scene URLs from the listing HTML
      3. for each scene URL, fetch the detail page
      4. parse it into a RawScene and yield it
    """

    #: HTTP timeout (seconds) passed to every `browser_get` call.
    _timeout: float = 30.0

    @abc.abstractmethod
    def _listing_url(self, page: int) -> str:
        """URL of the 'latest-vids' listing page (page 1 = newest)."""

    @abc.abstractmethod
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Absolute scene URLs found in the listing HTML, newest first."""

    @abc.abstractmethod
    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene with metadata.

        Return None when the scene is unavailable or cannot be parsed; the
        caller then skips that URL instead of aborting the whole browse.
        """

    def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]:
        """Yield scenes newest-first, walking listing pages 1..max_pages.

        A failed listing fetch or an empty listing page stops the walk. A
        failed detail fetch or a parse error only skips that one scene.
        Per the original notes, dedup by external_id happens later in the
        resolver, so yielding the same scenes on several runs is harmless.
        """

        def _download(target: str):
            # browser_get may hand back either a response object or the body
            # itself; prefer `.text` when it is there.
            response = browser_get(target, timeout=self._timeout)
            if hasattr(response, "text"):
                return response.text
            return response

        page_no = 0
        while page_no < max_pages:
            page_no += 1
            listing_url = self._listing_url(page_no)
            try:
                listing_html = _download(listing_url)
            except Exception as exc:
                log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page_no, exc)
                return
            scene_urls = self._extract_scene_urls(listing_html)
            if not scene_urls:
                log.info("%s browse: empty listing page %d, stopping", self.sitetag, page_no)
                return
            log.info("%s browse page %d: %d scene URLs", self.sitetag, page_no, len(scene_urls))
            for detail_url in scene_urls:
                try:
                    detail_html = _download(detail_url)
                except Exception as exc:
                    log.info("%s detail fetch failed %s: %s", self.sitetag, detail_url, exc)
                    continue
                try:
                    scene = self._parse_detail(detail_url, detail_html)
                except Exception as exc:
                    log.warning("%s detail parse failed %s: %s", self.sitetag, detail_url, exc)
                    continue
                if scene is None:
                    continue
                yield scene

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Stub that satisfies the abstract `BaseDirectTubeScraper.search`.

        Browse-only tubes do not support performer-driven search, so this
        returns an empty iterator; the real work goes through
        `latest_scenes()`. A tube that wants both modes overrides this.
        """
        return iter(())
# Module-level cache of compiled <meta> patterns, filled lazily by meta_content()
# and keyed by "prop:<property>" / "name:<name>".
_META_RE_CACHE: dict[str, re.Pattern[str]] = {}
# Desktop Chrome User-Agent sent by compute_thumbnail_phash() when downloading thumbnails.
_PHASH_UA = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
def compute_thumbnail_phash(thumbnail_url: str, *, referer: str | None = None, timeout: float = 15.0) -> str | None:
"""Download thumbnail + return 64-bit perceptual hash (16-char hex) lub None.
Format pasuje do `SceneFingerprint.value` w DB (TPDB/StashDB importują ten sam
8x8 phash). Resolver Path 3 `find_by_phash_within` matchuje Hamming 5 (default).
Wymaga lazy importu `imagehash`/`PIL` żeby moduł browse_base importował się
nawet gdy te lib-y niedostępne (graceful degradation: phash=None resolver
spadnie do composite scoring, jak gdyby fingerprintu nie było).
"""
try:
from PIL import Image
import imagehash
except ImportError:
log.warning("imagehash/Pillow nie zainstalowane — phash skipped")
return None
headers = {"User-Agent": _PHASH_UA}
if referer:
headers["Referer"] = referer
try:
with httpx.Client(timeout=timeout, follow_redirects=True) as c:
r = c.get(thumbnail_url, headers=headers)
if r.status_code != 200 or not r.content:
return None
img = Image.open(io.BytesIO(r.content))
# phash domyślnie hash_size=8 → 64-bit hash → 16 hex chars. Mode 'L' (greyscale)
# robi to wewnętrznie, ale niektóre webp/animated mogą mieć multi-frame —
# convert() bierze pierwszą klatkę, którą imagehash i tak zredukuje do grey.
return str(imagehash.phash(img.convert("RGB")))
except Exception as e:
log.info("phash compute failed for %s: %s", thumbnail_url, e)
return None
def meta_content(html: str, *, property: str | None = None, name: str | None = None) -> str | None:
"""Wyciąga zawartość <meta property=X content=Y> lub <meta name=X content=Y>.
Standardowy helper dla scraperów które używają OpenGraph / ya:ovs / itp.
Cache compiled regex w module-scope dict (te same selectory powtarzają się).
NB: separate patterns dla `"..."` i `'...'` content quote wcześniej jeden
`[^"\']*` regex tnął title po wewnętrznym apostrofie (np. `<meta content="She's So Insatiable">`
`She`, bug-report 2026-05-20). Teraz matchujemy dokładnie ten sam quote co opening.
"""
key = f"prop:{property}" if property else f"name:{name}"
if key not in _META_RE_CACHE:
attr = "property" if property else "name"
val = re.escape(property or name or "")
# double-quoted content (HTML standard) — preferred
# Pattern: <meta property="X" content="...inner..." > — inner allows apostrophes
_META_RE_CACHE[key] = re.compile(
rf'<meta[^>]+{attr}=["\']{val}["\'][^>]*?content="([^"]*)"'
rf'|<meta[^>]+{attr}=["\']{val}["\'][^>]*?content=\'([^\']*)\'',
re.IGNORECASE,
)
m = _META_RE_CACHE[key].search(html)
if not m:
return None
val = m.group(1) if m.group(1) is not None else m.group(2)
return val.strip() if val else None

View file

@ -0,0 +1,238 @@
"""BaseSearchScraper — shared search-page HTML scraping logika.
Pattern shared by all tube discovery scrapers:
1. Build the search URL from `_search_url_template` (formatted with query + page).
2. Fetch the HTML via `browser_get` (curl_cffi).
3. Match `_scene_url_re` (regex with a `url` group, or group(1), as the scene URL,
and optionally a `slug` group used to derive the title).
4. Filter results by query tokens (the slug must contain EVERY query token of at least
`_query_token_min_len` chars) — tube fuzzy search often returns unrelated results.
5. Yield RawScene with `external_id=f"{sitetag}:{scene_url}"`.
Subclass overrides:
- `sitetag: str` — e.g. "pornhubcom"
- `_search_url_template: str` — with `{query}` and `{page}` placeholders
- `_scene_url_re: re.Pattern[str]` — regex with a named group `url` (scene URL)
- `_title_from_slug(slug) -> str` — optional override (default: derive the title from the URL slug)
- `_slug_from_match(match, scene_url) -> str` — text tested against the query tokens (default: `slug` group, else the last URL path segment)
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
# Image src extraction: matches src, data-src, data-original, data-lazy-src, data-lazy
# (lazy-load library variants). An image extension is required to limit false positives
# (sprite icons, spinners); the pattern accepts jpg/jpeg/png/webp/gif. Only absolute
# (`http(s)://`) or protocol-relative (`//`) URLs match; group 1 is the URL, including
# anything after the extension up to the closing quote.
_IMG_SRC_RE = re.compile(
r'<img[^>]+(?:src|data-src|data-original|data-lazy-src|data-lazy)=["\']'
r'((?://|https?://)[^"\']+\.(?:jpg|jpeg|png|webp|gif)[^"\']*)',
re.IGNORECASE,
)
log = logging.getLogger(__name__)
class BaseSearchScraper(BaseDirectTubeScraper):
    """Shared search-page scraping flow; a subclass supplies the URL template and regex.

    The default fetch via `browser_get` is enough for most tubes. Tubes that
    need something special (e.g. Cloudflare-protected ones) override the
    individual hooks below or `search()` as a whole.
    """

    #: Search page URL format with `{query}` (as produced by
    #: `_format_query_for_url`) and `{page}` (int) placeholders.
    _search_url_template: str = ""
    #: Regex matching a scene URL in the search HTML. Needs a `url` group (or
    #: group 1) holding the scene URL; an optional `slug` group feeds title
    #: derivation and token filtering.
    _scene_url_re: re.Pattern[str] = re.compile(r"$^")  # placeholder — subclass override
    #: Minimum length of a query token used for result filtering (shorter
    #: tokens are ignored so they do not match unrelated slugs).
    _query_token_min_len: int = 3
    #: Search HTTP timeout (seconds).
    _timeout: float = 30.0
    #: Slugs to reject: navigation/footer links that match the regex but are
    #: not scenes. Useful for WordPress-like tubes where the scene URL pattern
    #: (`<host>/<slug>/`) also matches `/categories/`, `/actors/`, etc.
    _nav_slug_blacklist: frozenset[str] = frozenset({
        "actors", "actor", "actress", "categories", "category", "tags", "tag",
        "feed", "dmca", "contact-us", "contact", "comments", "wp-content",
        "wp-admin", "wp-includes", "wp-login.php", "page", "?filter", "?s",
        "about", "about-us", "privacy", "privacy-policy", "tos", "terms",
        "2257", "18-u-s-c-2257", "sitemap", "sitemap.xml",
    })
    #: Window (chars) on each side of a scene URL match in which an `<img>` is
    #: looked up as the thumbnail.
    _thumbnail_window: int = 800

    def _scene_url_from_match(self, m: re.Match[str]) -> str:
        """Return the scene URL captured by `m`.

        Prefers the named group `url`; falls back to group(1) when the regex
        defines no such group. Override when the regex uses groups differently.
        """
        try:
            return m.group("url")
        except IndexError:
            return m.group(1)

    def _slug_from_match(self, m: re.Match[str], scene_url: str) -> str:
        """Slug used for query-token filtering and title derivation.

        Uses the regex's named group `slug` when it exists and is non-empty;
        otherwise the last path segment of `scene_url`.
        """
        if "slug" in m.groupdict():
            slug = m.group("slug")
            if slug:
                return slug
        # Fallback: parse the URL.
        path = urllib.parse.urlparse(scene_url).path.rstrip("/")
        return path.split("/")[-1] if path else ""

    def _title_from_slug(self, slug: str) -> str:
        """Turn a slug into a title by replacing `_` and `-` with spaces."""
        return slug.replace("_", " ").replace("-", " ").strip()

    def _format_query_for_url(self, query: str) -> str:
        """Format the query for the URL template.

        Default: URL-encode with `quote_plus` (spaces → `+`). Override when a
        tube needs another format, e.g. a slug (spaces → `-`).
        """
        return urllib.parse.quote_plus(query.strip())

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Optional hook: fetch the scene detail page and extract metadata.

        The default returns None (no detail fetch). It is called once PER SCENE
        in `search()`, so an override adds one HTTP request per match. An
        override may raise freely: `search()` catches the exception and
        continues without metadata.

        Returns: `(studio, performers, tags)` or None.
        """
        return None

    @staticmethod
    def _absolute_url(candidate: str, base: urllib.parse.ParseResult) -> str:
        """Make protocol-relative and root-relative URLs absolute.

        `//host/...` gets an `https:` scheme; `/path` gets the scheme and host
        of `base` (the parsed search URL). Anything else is returned unchanged.
        """
        if candidate.startswith("//"):
            return "https:" + candidate
        if candidate.startswith("/"):
            return f"{base.scheme}://{base.netloc}{candidate}"
        return candidate

    def _nearest_thumbnail_url(
        self, html: str, start: int, end: int, base: urllib.parse.ParseResult
    ) -> str | None:
        """Return the `<img>` URL closest to the scene link at `html[start:end]`.

        Only images within `_thumbnail_window` chars on either side of the
        link are considered. The NEAREST image is chosen, not the first one in
        the window: the window reaches back into the previous result card, so
        "first in window" would often return the previous scene's thumbnail.
        This handles both `<a href=scene><img src=thumb></a>` (image after the
        link) and a flat `<img src=thumb><a href=scene>` (image before it).
        """
        lo = max(0, start - self._thumbnail_window)
        hi = min(len(html), end + self._thumbnail_window)
        best: re.Match[str] | None = None
        best_distance = -1
        for img in _IMG_SRC_RE.finditer(html, lo, hi):
            if img.start() >= end:
                distance = img.start() - end
            elif img.end() <= start:
                distance = start - img.end()
            else:
                distance = 0  # overlaps the link itself
            if best is None or distance < best_distance:
                best, best_distance = img, distance
        if best is None:
            return None
        return self._absolute_url(best.group(1).strip(), base)

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Scrape one search results page and yield a RawScene per matching scene.

        Args:
            query: search text, normally a performer name. It is also attached
                to every yielded scene as a performer.
            page: page number substituted into the URL template.
            limit: maximum number of scenes to yield; None means no limit and
                a value <= 0 yields nothing.

        Raises:
            NotImplementedError: on first iteration, when the subclass did not
                set `_search_url_template`.

        A failed fetch or a non-200 status ends the generator without
        yielding (the failure is only logged).
        """
        if not self._search_url_template:
            raise NotImplementedError(f"{type(self).__name__}._search_url_template not set")
        if limit is not None and limit <= 0:
            # Nothing may be yielded, so skip the HTTP request entirely.
            return
        q = self._format_query_for_url(query)
        url = self._search_url_template.format(query=q, page=page)
        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("%s search fetch failed: %s", self.sitetag, e)
            return
        if r.status_code != 200:
            log.debug("%s search %s status=%d", self.sitetag, url, r.status_code)
            return
        html = r.text
        # Parsed once; used to absolutise relative scene/thumbnail URLs.
        base = urllib.parse.urlparse(url)
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }
        seen: set[str] = set()
        yielded = 0
        for m in self._scene_url_re.finditer(html):
            scene_url = self._absolute_url(self._scene_url_from_match(m).strip(), base)
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug = self._slug_from_match(m, scene_url)
            slug_lower = slug.lower()
            if slug_lower in self._nav_slug_blacklist:
                continue
            # Strict: ALL query tokens must be in the slug. With `any()` a query
            # such as "ava koxxx" let through every "ava-*" slug (Ava Devine,
            # Ava Addams, ...) and labelled those scenes with the queried
            # performer. The slug must contain every token of the name that is
            # at least `_query_token_min_len` chars long.
            if query_tokens and not all(tok in slug_lower for tok in query_tokens):
                continue
            title = self._title_from_slug(slug)
            thumb_url = self._nearest_thumbnail_url(html, m.start(), m.end(), base)
            # Optional metadata fetch (studio / extra performers / tags). The
            # default hook returns None; failures are logged and ignored.
            studio: RawStudio | None = None
            extra_performers: list[RawPerformer] = []
            tags: list[RawTag] = []
            try:
                meta = self._fetch_scene_metadata(scene_url)
            except Exception as e:
                log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e)
                meta = None
            if meta is not None:
                studio, extra_performers, tags = meta
            # The queried performer is always present (it drives the scrape);
            # performers from the detail page are appended after it.
            all_performers = [RawPerformer(name=query.strip()), *extra_performers]
            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb_url,
                    )
                ],
                performers=all_performers,
                studio=studio,
                tags=tags,
                raw={
                    "source": f"direct_scraper:{self.sitetag}",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                    "search_url": url,
                    "thumbnail_url": thumb_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

View file

@ -0,0 +1,27 @@
"""BaseDirectTubeScraper — kontrakt dla bezpośrednich scraperów tube'ów."""
from __future__ import annotations
import abc
from collections.abc import Iterator
from app.connectors.base import RawScene
class BaseDirectTubeScraper(abc.ABC):
    """Contract every direct tube scraper implements.

    All scrapers feed into `Source(name='pornapp')` so they inherit the resolver
    logic and the idempotent merge keyed on external_id.
    """

    # Stable identifier of the tube, used in external_id as `f"{sitetag}:{url}"`.
    # Matches the porn-app sitetag (hqpornercom, sxylandcom, etc.).
    sitetag: str

    @abc.abstractmethod
    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Search the tube for `query` (usually a performer name).

        Yields one RawScene per result.
        """
        raise NotImplementedError

View file

@ -0,0 +1,18 @@
"""eporner.com — direct HTML scrape search results.
Search: `https://www.eporner.com/search/<q>/<page>/` (1-indexed pages).
Scene URL: `https://www.eporner.com/hd-porn/<id>/<slug>/` lub `/video-<id>/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class EpornerScraper(BaseSearchScraper):
    """eporner.com search scraper, configured purely through class attributes."""

    sitetag = "epornercom"
    _search_url_template = "https://www.eporner.com/search/{query}/{page}/"
    # Relative hrefs only: `/hd-porn/` or `/video-<id>/`, an optional extra path
    # segment, then the slug. Group `url` excludes the optional trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>/(?:hd-porn|video-[a-z0-9]+)/(?:[a-zA-Z0-9]+/)?(?P<slug>[a-zA-Z0-9_\-]+))/?"'
    )

View file

@ -0,0 +1,22 @@
"""fpoxxx — direct HTML scrape search results.
UWAGA: dokładna domena fpoxxx (sitetag w bazie) niekoniecznie zawiera "com" ani
"net" porn-app DEFAULT_SITETAGS używa "fpoxxx" jako sitetag. Best-guess: fpo.xxx.
Search: `https://fpo.xxx/page/<n>/?s=<q>` (WordPress).
Scene URL: `https://fpo.xxx/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class FpoxxxScraper(BaseSearchScraper):
    """fpo.xxx search scraper, configured purely through class attributes."""

    sitetag = "fpoxxx"
    _search_url_template = "https://fpo.xxx/page/{page}/?s={query}"
    # Absolute single-segment links `https://fpo.xxx/<slug>/`; group `url` stops
    # before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://fpo\.xxx/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,177 @@
"""freshporno.org — latest-vids browse scraper.
Pilot #2 (po shyfap fail). Hipoteza: freshporno zachowuje oryginalne studio titles
("Straighten Her Out" zamiast custom rebranding jak shyfap) title fuzzy match
do canonical zadziała. Bonus: channel = studio 1:1 (Pure Taboo, Brazzers, etc.).
URL patterns:
- Listing: `/` (page 1), `/2/`, `/3/`, ... (last `/391/` w czasie pisania)
- Scene: `/videos/<slug>/`
- Channels: `/channels/<slug>/` (= studio)
- Models: `/models/<slug>/` (= performer)
- Tags: `/tags/<slug>/` (= category)
"""
from __future__ import annotations
import re
from datetime import date, datetime, timedelta
from urllib.parse import urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
# Site root; listing URLs are built from it and `_BASE + "/"` is passed as the
# referer when the thumbnail phash is computed.
_BASE = "https://freshporno.org"
# Absolute scene link `/videos/<slug>/`; group 1 is the full URL including the trailing slash.
_SCENE_URL_RE = re.compile(r'href="(https://freshporno\.org/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Channel (= studio) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_CHANNEL_LINK_RE = re.compile(
r'href="https://freshporno\.org/channels/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model (= performer) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_MODEL_LINK_RE = re.compile(
r'href="https://freshporno\.org/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag link: group 1 = slug, group 2 = anchor text up to the next `<`.
_TAG_LINK_RE = re.compile(
r'href="https://freshporno\.org/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Duration via <time datetime="PT46M01S"> (ISO 8601 duration). Fallback: meta property
_TIME_DURATION_RE = re.compile(r'<time[^>]+datetime="PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?"', re.IGNORECASE)
def _parse_iso_duration_to_sec(html: str) -> int | None:
m = _TIME_DURATION_RE.search(html)
if not m:
return None
h = int(m.group(1) or 0)
mn = int(m.group(2) or 0)
s = int(m.group(3) or 0)
return h * 3600 + mn * 60 + s
class FreshpornoScraper(BaseBrowseScraper):
    """Browse scraper for freshporno.org listing pages.

    Scene URLs come from the listing; each detail page is parsed into a
    RawScene with title, description, duration, studio (channel), performers
    (models), tags, an optional thumbnail phash and one playback source.
    """

    sitetag = "freshpornoorg"

    def _listing_url(self, page: int) -> str:
        """Return the site root for page <= 1, `/<page>/` otherwise."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return scene URLs from the listing, de-duplicated in first-seen order."""
        # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene; None when no title is found."""
        # Title: og:title first, the document <title> as a fallback.
        title = meta_content(detail_html, property="og:title")
        if not title:
            title_m = re.search(r"<title>([^<]+)</title>", detail_html, re.IGNORECASE)
            if title_m:
                title = title_m.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration"> in seconds, otherwise the
        # ISO 8601 value of <time datetime="PT46M01S">.
        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)
        else:
            duration_sec = _parse_iso_duration_to_sec(detail_html)

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Channel = studio. Take the first `/channels/<slug>/` link whose anchor
        # text is neither empty nor the generic "Channels" navigation label.
        studio: RawStudio | None = None
        for m in _CHANNEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if name.lower() in ("channels", ""):
                continue
            studio = RawStudio(
                external_id=f"freshpornoorg:channel:{slug}",
                name=name,
                slug=slug,
            )
            break

        # Performers: every `/models/<slug>/` link, one entry per slug.
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Reject text-less anchors *before* marking the slug as seen, so a
            # later anchor for the same model that carries the name still counts
            # and no performer with an empty name is emitted.
            if not name or slug in seen_perf:
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"freshpornoorg:model:{slug}",
                    name=name,
                )
            )

        # Tags
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Skip composite multi-tag slugs: freshporno sometimes emits URLs like
            # /tags/face-sitting-fake-tits-freckles-girlfriend-... that combine
            # several tags. Normal tags are under 40 characters; over 60 is
            # certainly such a composite.
            if len(slug) > 60:
                continue
            # Same empty-name rule as for performers.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"freshpornoorg:tag:{slug}", name=name, slug=slug)
            )

        # Phash of the thumbnail. freshporno uses its own screenshots
        # (preview.mp4.jpg), so it may not match canonical phashes.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,129 @@
"""fullmovies.xxx — latest-vids browse scraper.
Identyczny engine co hdporn.gg (KVS sponsor_groups stack): `/videos/<slug>/`,
`/networks/<slug>/`, `/models/<slug>/`, `/tags/<slug>/`. og:image to `img.fullmovies.xxx/...`
**prawdopodobnie auto-screenshot** (jak hdporn.gg 8% match). Probe potwierdzi.
"""
from __future__ import annotations
import re
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
# Site root; listing URLs are built from it and `_BASE + "/"` is passed as the
# referer when the thumbnail phash is computed.
_BASE = "https://www.fullmovies.xxx"
# Absolute scene link `/videos/<slug>/`; group 1 is the full URL including the trailing slash.
_SCENE_URL_RE = re.compile(r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Network (= studio) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_NETWORK_LINK_RE = re.compile(
r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model (= performer) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_MODEL_LINK_RE = re.compile(
r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag link: group 1 = slug, group 2 = anchor text up to the next `<`.
_TAG_LINK_RE = re.compile(
r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
class FullmoviesScraper(BaseBrowseScraper):
    """Browse scraper for the fullmovies.xxx `latest-updates` listing.

    Scene URLs come from the listing; each detail page is parsed into a
    RawScene with title, description, duration, studio (network), performers
    (models), tags, an optional thumbnail phash and one playback source.
    """

    sitetag = "fullmoviesxxx"

    def _listing_url(self, page: int) -> str:
        """Return `/latest-updates/` for page <= 1, `/latest-updates/<page>/` otherwise."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return scene URLs from the listing, de-duplicated in first-seen order."""
        # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene; None when no usable title exists."""
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # Strip the site's boilerplate: ": Free HD Porn" / " Full XXX" suffixes
        # and a leading "Watch ".
        title = re.sub(
            r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$", "", title, flags=re.IGNORECASE
        ).strip()
        if not title:
            # Nothing but boilerplate: treat it like a missing title.
            return None

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)

        # Studio: first `/networks/<slug>/` link whose anchor text is neither
        # empty nor the generic "Networks" navigation label.
        studio: RawStudio | None = None
        for m in _NETWORK_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if name.lower() in ("networks", ""):
                continue
            studio = RawStudio(
                external_id=f"fullmoviesxxx:network:{slug}",
                name=name,
                slug=slug,
            )
            break

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Reject text-less anchors and navigation labels *before* marking the
            # slug as seen, so a later anchor carrying the real name still counts
            # and no performer with an empty name is emitted.
            if not name or slug in seen_perf or name.lower() in ("pornstars", "models"):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(external_id=f"fullmoviesxxx:model:{slug}", name=name)
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Same empty-name rule as for performers.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(RawTag(external_id=f"fullmoviesxxx:tag:{slug}", name=name, slug=slug))

        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,87 @@
"""HDPorn92Scraper — direct HTML scrape hdporn92.com search.
Search: `https://hdporn92.com/page/<n>/?s=<query>`. Scene URL format:
`https://hdporn92.com/<slug>/` (jeden segment ścieżki). Trzeba odsiać
nawigację (`/categories/`, `/actors/`, `/feed/`, `/dmca/`, `/contact-us/`,
external links badoinkvr/etc.).
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Single-segment absolute link on hdporn92.com: group 1 = URL without the trailing
# slash, group 2 = slug (lowercase letters, digits and dashes, at least 2 chars).
_SCENE_URL_RE = re.compile(r'href="(https://hdporn92\.com/([a-z0-9][a-z0-9-]+))/?"')
# First path segments that are site navigation / WordPress plumbing, not scenes.
# NOTE(review): "wp-login.php", "?filter" and "?s" contain characters the slug
# group above cannot capture, so those entries can never equal a captured slug.
_NAV_SLUGS = {
"actors", "categories", "tags", "feed", "dmca", "contact-us",
"comments", "wp-content", "wp-admin", "wp-includes", "wp-login.php",
"page", "?filter", "?s",
}
class HDPorn92Scraper(BaseDirectTubeScraper):
    """Direct search scraper for hdporn92.com."""

    sitetag = "hdporn92com"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield one RawScene per scene link on the search results page.

        Args:
            query: Search phrase, usually a performer name. It is also attached
                to every yielded scene as a performer hint.
            page: Results page number, substituted into `/page/<n>/`.
            limit: Maximum number of scenes to yield; None means no cap.

        Yields nothing when the query is blank, the fetch raises, or the
        response status is not 200.
        """
        performer_name = query.strip()
        if not performer_name:
            # With no tokens the slug filter below is disabled, so every link on
            # the page would be yielded and attributed to a performer named "".
            return

        q = urllib.parse.quote_plus(performer_name)
        url = f"https://hdporn92.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=60)
        except Exception as e:
            log.warning("hdporn92 search fetch failed: %s", e)
            return
        if r.status_code != 200:
            # Logged like the hqporner scraper does, so a blocked or missing
            # page is distinguishable from an empty result set.
            log.debug("hdporn92 search %s status=%d", url, r.status_code)
            return

        # Tokens shorter than 3 characters are ignored; a scene is kept when its
        # slug contains at least one remaining token.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            scene_url = m.group(1) + "/"
            slug = m.group(2)
            if slug in _NAV_SLUGS:
                continue
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue
            # The title is derived from the slug.
            title = slug.replace("-", " ").strip()
            yield RawScene(
                external_id=f"hdporn92com:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(origin="tube:hdporn92com", page_url=scene_url)
                ],
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:hdporn92",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

View file

@ -0,0 +1,142 @@
"""hdporn.gg — latest-vids browse scraper.
Engine podobny do freshporno: `/videos/<slug>/` URL, `/networks/<slug>/` = studio,
`/models/<slug>/` = performer, `/tags/<slug>/` = tag.
Quirk: og:image to internal CDN `img.hdporn.gg/...` przed merging do prod
sprawdzamy phash distance (gate-keeper: jeśli Hamming >5 dla >70% scen orphan
factory, wyłącz; analogia do shyfap).
"""
from __future__ import annotations
import re
from urllib.parse import urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
# Site root; listing URLs are built from it and `_BASE + "/"` is passed as the
# referer when the thumbnail phash is computed.
_BASE = "https://www.hdporn.gg"
# Absolute scene link `/videos/<slug>/`; group 1 is the full URL including the trailing slash.
_SCENE_URL_RE = re.compile(r'href="(https://www\.hdporn\.gg/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Network (= studio) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_NETWORK_LINK_RE = re.compile(
r'href="https://www\.hdporn\.gg/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model (= performer) link: group 1 = slug, group 2 = anchor text up to the next `<`.
_MODEL_LINK_RE = re.compile(
r'href="https://www\.hdporn\.gg/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag link: group 1 = slug, group 2 = anchor text up to the next `<`.
_TAG_LINK_RE = re.compile(
r'href="https://www\.hdporn\.gg/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
class HDPornGGScraper(BaseBrowseScraper):
    """Browse scraper for the hdporn.gg `latest-updates` listing.

    Scene URLs come from the listing; each detail page is parsed into a
    RawScene with title, description, duration, studio (network), performers
    (models), tags, an optional thumbnail phash and one playback source.
    """

    sitetag = "hdporngg"

    def _listing_url(self, page: int) -> str:
        """Return `/latest-updates/` for page <= 1, `/latest-updates/<page>/` otherwise."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return scene URLs from the listing, de-duplicated in first-seen order."""
        # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene; None when no usable title exists."""
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # og:title typically carries a ": Free HD Porn" suffix; drop it. A
        # "Brazzers - " style prefix is kept on purpose: the studio name in the
        # title is a strong signal for fuzzy matching.
        title = re.sub(r":\s*Free HD Porn\s*$", "", title, flags=re.IGNORECASE).strip()
        if not title:
            # Nothing but the boilerplate suffix: treat it like a missing title.
            return None

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)

        # Studio from /networks/. The first link whose anchor text is neither
        # empty nor the generic "Networks" navigation label is taken as the
        # scene's studio.
        studio: RawStudio | None = None
        for m in _NETWORK_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            if name.lower() in ("networks", ""):
                continue
            studio = RawStudio(
                external_id=f"hdporngg:network:{slug}",
                name=name,
                slug=slug,
            )
            break

        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for m in _MODEL_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Reject text-less anchors and navigation labels *before* marking the
            # slug as seen, so a later anchor carrying the real name still counts
            # and no performer with an empty name is emitted.
            if not name or slug in seen_perf or name.lower() in ("pornstars", "models"):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(external_id=f"hdporngg:model:{slug}", name=name)
            )

        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for m in _TAG_LINK_RE.finditer(detail_html):
            slug, name = m.group(1), m.group(2).strip()
            # Same empty-name rule as for performers.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(external_id=f"hdporngg:tag:{slug}", name=name, slug=slug)
            )

        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,94 @@
"""HQPornerScraper — direct HTML scrape hqporner search page.
Search URL: `https://hqporner.com/?q=<query>&p=<page>`. Static HTML zwraca ~50
linków `/hdporn/<id>-<slug>.html` per strona. Tytuł deducimy ze slug'a (porn-app
data API zwraca dokładniejszy ale wymaga round-trip dla MVP slug-derived OK,
resolver i tak je sciagnie z TPDB merge).
Search fuzzy: hqporner zwraca "Lola Noir" gdy szukamy "Noir" itp. Dlatego
filtrujemy wyniki po tym czy slug zawiera query (lub jego token) analogicznie
jak `fetch_scenes_for_search` w pornapp connectorze.
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Scene path `/hdporn/<id>-<slug>.html`: group 1 = numeric id, group 2 = slug
# (any run of characters other than `"` and `.`).
_SCENE_HREF_RE = re.compile(r'/hdporn/(\d+)-([^"\.]+)\.html')
class HQPornerScraper(BaseDirectTubeScraper):
    """Direct search scraper for hqporner.com."""

    sitetag = "hqpornercom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield one RawScene per scene link on the search results page.

        Args:
            query: Search phrase, usually a performer name. It is also attached
                to every yielded scene as a performer hint.
            page: Results page number, sent as the `p` query parameter.
            limit: Maximum number of scenes to yield; None means no cap.

        Yields nothing when the query is blank, the fetch raises, or the
        response status is not 200.
        """
        performer_name = query.strip()
        if not performer_name:
            # With no tokens the slug filter below is disabled, so every link on
            # the page would be yielded and attributed to a performer named "".
            return

        q = urllib.parse.quote_plus(performer_name)
        url = f"https://hqporner.com/?q={q}&p={page}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            log.warning("hqporner search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.debug("hqporner search %s status=%d", url, r.status_code)
            return

        # Filter: the slug must contain at least one query word (case-insensitive,
        # words shorter than 3 characters ignored). This drops unrelated results
        # when the site's fuzzy search is noisy.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
        seen_urls: set[str] = set()
        yielded = 0
        for m in _SCENE_HREF_RE.finditer(r.text):
            scene_id = m.group(1)
            slug_part = m.group(2)
            scene_url = f"https://hqporner.com/hdporn/{scene_id}-{slug_part}.html"
            if scene_url in seen_urls:
                continue
            seen_urls.add(scene_url)

            slug_lower = slug_part.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            # The title is derived from the slug.
            title = slug_part.replace("_", " ").replace("-", " ").strip()
            yield RawScene(
                external_id=f"hqpornercom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:hqpornercom",
                        page_url=scene_url,
                    )
                ],
                # Force the performer hint to the query: a search by performer
                # name means the scene almost certainly features them. The
                # resolver creates the ScenePerformer link.
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:hqporner",
                    "query": query,
                    "page": page,
                    "scene_id": scene_id,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

View file

@ -0,0 +1,19 @@
"""latestleaks.co — direct HTML scrape.
Search: `https://latestleaks.co/page/<n>/?s=<q>`.
Scene URL: `https://latestleaks.co/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class LatestLeaksScraper(BaseSearchScraper):
    """latestleaks.co search scraper, configured purely through class attributes."""

    sitetag = "latestleaksco"
    _search_url_template = "https://latestleaks.co/page/{page}/?s={query}"
    # Absolute single-segment links `https://latestleaks.co/<slug>/`; group `url`
    # stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://latestleaks\.co/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,19 @@
"""latestpornvideo.com — direct HTML scrape.
Search: `https://latestpornvideo.com/page/<n>/?s=<q>`.
Scene URL: `https://latestpornvideo.com/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class LatestPornVideoScraper(BaseSearchScraper):
    """latestpornvideo.com search scraper, configured purely through class attributes."""

    sitetag = "latestpornvideocom"
    _search_url_template = "https://latestpornvideo.com/page/{page}/?s={query}"
    # Absolute single-segment links `https://latestpornvideo.com/<slug>/`; group
    # `url` stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://latestpornvideo\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,19 @@
"""mypornerleak.com — direct HTML scrape.
Search: `https://mypornerleak.com/page/<n>/?s=<q>`.
Scene URL: `https://mypornerleak.com/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class MyPornerLeakScraper(BaseSearchScraper):
    """mypornerleak.com search scraper, configured purely through class attributes."""

    sitetag = "mypornerleakcom"
    _search_url_template = "https://mypornerleak.com/page/{page}/?s={query}"
    # Absolute single-segment links `https://mypornerleak.com/<slug>/`; group
    # `url` stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://mypornerleak\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,21 @@
"""perverzija.com — direct HTML scrape search results.
Search: `https://www.perverzija.com/page/<n>/?s=<q>` (WordPress + Cloudflare).
Scene URL: `https://www.perverzija.com/<slug>/`.
CF-protected: `browser_get` (curl_cffi) bypassuje JA3 fingerprint blocks.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PerverzijaScraper(BaseSearchScraper):
    """perverzija.com search scraper, configured purely through class attributes."""

    sitetag = "perverzijacom"
    _search_url_template = "https://www.perverzija.com/page/{page}/?s={query}"
    # Absolute single-segment links `https://www.perverzija.com/<slug>/`; group
    # `url` stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.perverzija\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,215 @@
"""porn00.org — latest-vids browse scraper.
URL patterns:
- Listing: `/latest-vids/` (page 1), `/latest-vids/2/`, ...
- Scene: `/video/<slug>/`
- Performer: `/star-name/<slug>/` (np. `/star-name/august-skye/`) w sekcji "Pornstars:" na detail
- Categories: `/category-name/<slug>/`
Sygnały dostępne:
- Title (listing card + h1 + og:title)
- Performer(s) (z sekcji "Pornstars:" na detail page pojedynczy slug per link)
- Categories (z sekcji "Categories:" `/category-name/<slug>/`)
- Duration (listing card `<div class="duration">MM:SS</div>`)
- Direct mp4 (KVS engine `video_url: 'https://www.porn00.org/get_file/.../<id>.mp4'`)
- Thumbnail (own CDN `/contents/videos_screenshots/.../1.jpg`)
BRAK:
- Studio
- Release year / data
- Description
Tytuł format: `"PerformerName - Scene Title"` (eg "August Skye - Helping Him...").
Performer name w prefixie tytułu zwykle pokrywa się z pierwszym linkiem `/star-name/<slug>/`.
Expected pilot wynik: niski canonical match rate (~5-10%) bo brak studio/year. Direct
mp4 to bonus playback source dla scen które matchują canonical z innych źródeł.
"""
from __future__ import annotations
import logging
import re
from urllib.parse import urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
log = logging.getLogger(__name__)
# Site root; listing URLs are built from it and `_BASE + "/"` is passed as the
# referer when the thumbnail phash is computed.
_BASE = "https://www.porn00.org"
# Listing card pattern (from a Chrome devtools snapshot, 2026-05-17):
# <div class="item">
#   <a href="https://www.porn00.org/video/<slug>/" title="...">
#     <img class="thumb lazy-load" src="...contents/videos_screenshots/<bucket>/<id>/320x180/1.jpg" data-cnt="5">
#   </a>
#   <strong class="title">Title</strong>
#   <div class="duration">34:34</div>
# </div>
# Named groups: `url`, `title`, `thumb`, `dur`. DOTALL lets `.*?` span the
# line breaks between the card's elements.
_LISTING_CARD_RE = re.compile(
r'<div class="item\s*">'
r'.*?<a href="(?P<url>https://www\.porn00\.org/video/[^"]+/)"\s+title="(?P<title>[^"]+)"'
r'.*?<img class="thumb[^"]*"\s+src="(?P<thumb>[^"]+)"'
r'.*?<div class="duration">(?P<dur>[^<]+)</div>',
re.IGNORECASE | re.DOTALL,
)
# Performer link pattern (porn00 convention): `/star-name/<slug>/`
# (analogous to `/category-name/`, `/tags-name/`). Group 1 = slug, group 2 = anchor text.
_PERFORMER_LINK_RE = re.compile(
r'<a\s+href="https://www\.porn00\.org/star-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
re.IGNORECASE,
)
# Categories: <a href="https://www.porn00.org/category-name/<slug>/">Name</a>
# Group 1 = slug, group 2 = anchor text.
_CATEGORY_LINK_RE = re.compile(
r'<a\s+href="https://www\.porn00\.org/category-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
re.IGNORECASE,
)
# Direct mp4 stream from the KVS flashvars: `video_url: 'https://.../43144.mp4/?v-acctoken=...'`.
# The URL may carry anything after `.mp4` (`/?v-acctoken=...`, `?q=720p`, etc.), so
# everything up to the nearest `'` or `"` is captured.
_VIDEO_URL_RE = re.compile(
r"""video_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
)
# 720p variant (KVS often serves 360p by default and 720p in `video_alt_url`).
_VIDEO_ALT_URL_RE = re.compile(
r"""video_alt_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
)
def _parse_mmss(s: str) -> int | None:
"""`34:34` → 2074, `1:20:37` → 4837."""
parts = s.strip().split(":")
try:
if len(parts) == 2:
return int(parts[0]) * 60 + int(parts[1])
if len(parts) == 3:
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
except ValueError:
return None
return None
class Porn00Scraper(BaseBrowseScraper):
    """Browse scraper for the porn00.org `latest-vids` listing.

    Listing cards supply title, thumbnail and duration, cached per scene URL.
    The detail page adds performers, categories (as tags) and a direct mp4
    stream URL.
    """

    sitetag = "porn00org"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata (duration, thumb, title) keyed by scene URL.
        # Duration is only ever read from this cache.
        # NOTE(review): the cache is replaced on every _extract_scene_urls call,
        # so detail pages must be parsed before the next listing is extracted;
        # confirm against BaseBrowseScraper.
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return `/latest-vids/` for page <= 1, `/latest-vids/<page>/` otherwise."""
        if page <= 1:
            return f"{_BASE}/latest-vids/"
        return f"{_BASE}/latest-vids/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return de-duplicated scene URLs and rebuild the listing-card cache."""
        self._listing_cache = {}
        seen: set[str] = set()
        out: list[str] = []
        for m in _LISTING_CARD_RE.finditer(listing_html):
            url = m.group("url")
            if url in seen:
                continue
            seen.add(url)
            self._listing_cache[url] = {
                "title": m.group("title").strip(),
                "thumb": m.group("thumb"),
                "duration_sec": _parse_mmss(m.group("dur") or ""),
            }
            out.append(url)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene; None when no title is available."""
        meta = self._listing_cache.get(scene_url, {})
        # Title: og:title preferred (cleaner), listing card as fallback.
        title = meta_content(detail_html, property="og:title") or meta.get("title")
        if not title:
            return None
        duration_sec = meta.get("duration_sec")
        # Thumbnail: prefer og:image from the detail page (full-size preview),
        # fall back to the 320x180 listing image.
        thumb = meta_content(detail_html, property="og:image") or meta.get("thumb")

        # Performers: porn00 uses `/star-name/<slug>/` (like `/tags-name/` and
        # `/category-name/`); every link of that shape is a performer.
        performers: list[RawPerformer] = []
        seen_perf: set[str] = set()
        for pm in _PERFORMER_LINK_RE.finditer(detail_html):
            slug = pm.group(1).lower()
            name = pm.group(2).strip()
            # Reject text-less anchors *before* marking the slug as seen, so a
            # later anchor carrying the real name still counts and no performer
            # with an empty name is emitted.
            if not name or slug in seen_perf or not (2 <= len(slug) <= 60):
                continue
            seen_perf.add(slug)
            performers.append(
                RawPerformer(
                    external_id=f"{self.sitetag}:performer:{slug}",
                    name=name,
                )
            )

        # Categories → tags
        tags: list[RawTag] = []
        seen_tag: set[str] = set()
        for cm in _CATEGORY_LINK_RE.finditer(detail_html):
            slug = cm.group(1).lower()
            name = cm.group(2).strip()
            # Same empty-name rule as for performers.
            if not name or slug in seen_tag:
                continue
            seen_tag.add(slug)
            tags.append(
                RawTag(
                    external_id=f"{self.sitetag}:tag:{slug}",
                    name=name,
                    slug=slug,
                )
            )

        # Direct mp4 from the KVS flashvars: prefer 720p (video_alt_url) over
        # 360p (video_url).
        stream_url: str | None = None
        if (vm := _VIDEO_ALT_URL_RE.search(detail_html)):
            stream_url = vm.group(1)
        elif (vm := _VIDEO_URL_RE.search(detail_html)):
            stream_url = vm.group(1)

        # Phash: porn00 produces its own screenshots
        # (`/contents/videos_screenshots/`), so a canonical phash match is
        # unlikely; it is attempted anyway.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            studio=None,  # porn00 exposes no studio signal
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,19 @@
"""porn4days.pw — direct HTML scrape.
Search: `https://porn4days.pw/page/<n>/?s=<q>`.
Scene URL: `https://porn4days.pw/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class Porn4DaysScraper(BaseSearchScraper):
    """porn4days.pw search scraper, configured purely through class attributes."""

    sitetag = "porn4dayspw"
    _search_url_template = "https://porn4days.pw/page/{page}/?s={query}"
    # Absolute single-segment links `https://porn4days.pw/<slug>/`; group `url`
    # stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porn4days\.pw/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,19 @@
"""porndish.com — direct HTML scrape.
Search: `https://porndish.com/page/<n>/?s=<q>`.
Scene URL: `https://porndish.com/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PornDishScraper(BaseSearchScraper):
    """porndish.com search scraper, configured purely through class attributes."""

    sitetag = "porndishcom"
    _search_url_template = "https://porndish.com/page/{page}/?s={query}"
    # Absolute single-segment links `https://porndish.com/<slug>/`; group `url`
    # stops before the trailing slash, group `slug` is the path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

View file

@ -0,0 +1,26 @@
"""pornditt.com — direct HTML scrape.
KVS-style site (kt_player engine). Search URL: `/search/<slug>/?from=<page>` z slug-style
zapytaniem (spacje `-`). Sceny renderują się na subdomenie `v.pornditt.com/videos/<id>/<slug>/`,
więc regex matchuje oba (z i bez `v.` prefix).
Sitetag `porndittcom` (legacy z porn-app DEFAULT_SITETAGS suffix-stripped name).
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PornDittScraper(BaseSearchScraper):
    """pornditt.com search scraper; the query is sent as a dash-separated slug."""

    sitetag = "porndittcom"
    _search_url_template = "https://pornditt.com/search/{query}/?from={page}"
    # Scene links on `pornditt.com` or `v.pornditt.com`: `/videos/<sid>/<slug>/`.
    # Group `url` stops before the trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://(?:v\.)?pornditt\.com/videos/(?P<sid>\d+)/(?P<slug>[a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Turn the query into a KVS slug: lowercase, other characters → `-`.

        URL-encoding spaces as `+` does not work for this site.
        """
        lowered = query.lower()
        dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
        return dashed.strip("-")

View file

@ -0,0 +1,99 @@
"""pornhat.com — search-mode scraper (performer-driven backfill).
KVS engine. Search URL: `/search/<query>/` z `+` jako space separator. Scene URLs
to `/video/<slug>/` (slug bez ID prefix, w przeciwieństwie do 3Movs/OK.xxx). Slug
zawiera tokens query gdy match jest relevant, więc filtruje się automatycznie.
Auto-screenshot thumbnaile (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
do canonical match przez phash NIE nadają się (sprawdzone w probe 2026-05-12, 8%).
Ale wartość scrapera: discovering nowych scen performera których inne tube'y/canonical
nie mają. Mostly orphan ingest, ale dla popular performers może łapać studio scenes
których nie mamy w TPDB jeszcze.
Metadata enrich: scene page ma `class="info-video js-ajax-{dvd,model,tag}"` div'y
z `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. Parsujemy w
`_fetch_scene_metadata()` żeby insertować studio (dvd), dodatkowych performerów
(models), i tagi do każdej sceny.
"""
from __future__ import annotations
import json
import logging
import re
from app.connectors.base import RawPerformer, RawStudio, RawTag
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# `class="info-video js-ajax-<kind>"` ... `data-setup='<json>'`. The JSON sits in
# a single-quoted HTML attribute, with double quotes inside for string values.
# Named group `kind` (dvd, model or tag) tells which entity was matched; named
# group `json` is the raw attribute value.
_AJAX_DATA_RE = re.compile(
r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
re.IGNORECASE,
)
class PornHatScraper(BaseSearchScraper):
    """Search-mode scraper for pornhat.com.

    Declares the search URL template and scene-link regex used by the base
    class, and adds `_fetch_scene_metadata`, which reads studio / performer /
    tag information from the `js-ajax-*` blocks of a scene detail page.
    """

    sitetag = "pornhatcom"
    # KVS-style pagination: /search/<query>/<page>/ (per the original note,
    # page 1 also works with an explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
    # Accepts both relative (`/video/<slug>/`) and absolute hrefs. The original
    # author notes that BaseSearchScraper turns relative URLs into absolute
    # ones using the host of the search URL — that code is not visible here.
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Lowercase the trimmed query and replace spaces with `-` (slug style)."""
        return query.strip().lower().replace(" ", "-")

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch a scene detail page and parse its `js-ajax-{dvd,model,tag}` blocks.

        Returns:
            `(studio, performers, tags)` on success, or `None` when the page
            could not be fetched or did not answer with HTTP 200.

        Entries whose `data-setup` payload is not valid JSON, is not a JSON
        object, or has no usable string `title` are skipped instead of
        aborting the whole parse.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
            if r.status_code != 200:
                return None
        except Exception as e:
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None

        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        for m in _AJAX_DATA_RE.finditer(r.text):
            kind = m.group("kind").lower()
            try:
                data = json.loads(m.group("json"))
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object (could be a list, string,
            # number); only a dict supports the key lookups below.
            if not isinstance(data, dict):
                continue
            raw_title = data.get("title")
            raw_dir = data.get("dir")
            # Non-string values (null, numbers, nested objects) are treated as
            # missing rather than crashing on `.strip()`.
            name = raw_title.strip() if isinstance(raw_title, str) else ""
            slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or None
            if not name:
                continue
            if kind == "dvd":
                # `dvd` is the studio/series wrapper (e.g. "Adult Time"). The
                # first occurrence becomes the scene's studio; later ones are
                # ignored.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{slug or name.lower()}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                performers.append(RawPerformer(name=name))
            elif kind == "tag":
                tags.append(RawTag(
                    external_id=f"pornhatcom:tag:{slug or name.lower()}",
                    name=name,
                    slug=slug,
                ))
        return studio, performers, tags

View file

@ -0,0 +1,24 @@
"""PornHub.com — direct HTML scrape search results.
Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PornHubScraper(BaseSearchScraper):
    """Search scraper configuration for pornhub.com result pages."""

    sitetag = "pornhubcom"
    _search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
    )

    def _slug_from_match(self, m, scene_url):
        """Use the viewkey as the slug, since Pornhub URLs carry no readable slug.

        The viewkey is whatever follows the last `=` of the matched href. Per
        the original author's note, a title derived from it is only a short id
        and the real title is backfilled later at resolve time.
        """
        matched_href = m.group("url")
        _before, _sep, viewkey = matched_href.rpartition("=")
        return viewkey

View file

@ -0,0 +1,33 @@
"""PornTrex.com — direct HTML scrape search results.
Search: `https://www.porntrex.com/search/<q>/` (single page, brak ?page=).
Scene URL: `https://www.porntrex.com/video/<id>/<slug>/`
Porntrex pagination is inconsistent between views; for page > 1 this scraper appends `?from_videos=<page>` to the search URL.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class PornTrexScraper(BaseSearchScraper):
    """Search scraper configuration for porntrex.com."""

    sitetag = "porntrexcom"
    _search_url_template = "https://www.porntrex.com/search/{query}/"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.porntrex\.com/video/\d+/(?P<slug>[a-z0-9_\-]+))/?"',
        re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Delegate to the base search, adding `?from_videos=<page>` for page > 1.

        The first page uses the plain template. For later pages the instance
        template is swapped for the duration of the delegated search and
        restored afterwards, even if the search raises or the generator is
        closed early.
        """
        if page <= 1:
            yield from super().search(query, page=page, limit=limit)
            return

        saved_template = self._search_url_template
        self._search_url_template = f"{saved_template.rstrip('/')}/?from_videos={page}"
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = saved_template

View file

@ -0,0 +1,304 @@
"""pornxp.ph — latest-vids browse scraper.
URL patterns:
- Listing: `https://pornxp.ph/` (page 1, 72 cards) lub `?p=N` (pagination).
URL-e w listing mają randomized suffix per request (`/videos/94528971225` vs
`/videos/94528971837`) **`data-id` (np. `94528971`) jest stable** i tego
używamy dla external_id zamiast całego URL.
- Detail: `/videos/<id_with_suffix>`.
- Tags: `/tags/<URL-encoded-name>`. Trzy kategorie wnioskowane heurystyką
z `_classify_tag` (studio vs performer vs tag).
Rich signals (perfekt dla canonical match scoring):
- Title (`<div class="item_title">` w listing card + `<h1>` na detail)
- Studio (z `<div class="tags">` pierwszy tag z `.com`/`.co` LUB CamelCase concat)
- Performers (z tags w `<div class="tags">`, Capital + space + Capital)
- Release year (regex `Released:` na detail page bodyText)
- Duration (`<div class="item_dur">MM:SS</div>` listing card)
- Direct mp4 streams (`<source src="https://sv.porn-xp.com/.../720.mp4">`) no hoster
- Animated preview (`data-preview="//t.porn-xp.com/.../<id>.mp4"`)
Thumbnail: `<img class="item_img" src="/<id>.jpg">` relatywny, pornxp's own CDN.
Phash hit-rate niskie ale studio+performer+title fuzzy match wystarczy do canonical.
"""
from __future__ import annotations
import logging
import re
from datetime import date
from urllib.parse import unquote, urljoin
from app.connectors.base import (
RawFingerprint,
RawPerformer,
RawPlaybackSource,
RawScene,
RawStudio,
RawTag,
)
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
log = logging.getLogger(__name__)
_BASE = "https://pornxp.ph"
# Listing card. DOTALL because the card HTML spans multiple lines.
# Variant 1 (eager): `<img class="item_img" src="/<id>.jpg">`
# Variant 2 (lazy):  `<img class="item_img lazy" src="/images/fluid_spinner.svg" data-src="/<id>.jpg">`
# Both variants are captured through the `img_attrs` group; `_extract_scene_urls`
# then prefers `data-src` over `src`.
_LISTING_CARD_RE = re.compile(
    r'<div class="item preview"\s+data-id="(?P<id>\d+)"'
    r'(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>'
    r'\s*<a href="(?P<url>/videos/\d+)"[^>]*>'
    r'.*?<img class="item_img(?:\s+[\w\-]+)*"\s+(?P<img_attrs>[^>]+)>'
    r'.*?<div class="item_dur">(?P<dur>[^<]+)</div>'
    r'.*?<div class="item_title">(?P<title>[^<]+)</div>',
    re.IGNORECASE | re.DOTALL,
)
# Attribute extractors applied to the captured `img_attrs` text.
_IMG_SRC_RE = re.compile(r'\bsrc="([^"]+)"', re.IGNORECASE)
_IMG_DATASRC_RE = re.compile(r'\bdata-src="([^"]+)"', re.IGNORECASE)
# Detail page: the tags wrapper `<div class="tags">`. The match is non-greedy,
# i.e. it stops at the nearest `</div>`, on the assumption that the scene's
# tags all live in that single div.
_DETAIL_TAGS_BLOCK_RE = re.compile(
    r'<div class="tags">(?P<inner>.*?)</div>', re.IGNORECASE | re.DOTALL,
)
# One tag link: group 1 is the (URL-encoded) tag path segment, group 2 the
# anchor text.
_TAG_LINK_RE = re.compile(
    r'<a\s+href="/tags/([^"]+)"[^>]*>([^<]+)</a>', re.IGNORECASE,
)
# `Released: YYYY` anywhere in the detail page; captures the four-digit year.
_RELEASED_RE = re.compile(r'Released:\s*(\d{4})', re.IGNORECASE)
# First `<h1>` whose content is plain text (no nested tags).
_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.IGNORECASE)
# Direct mp4/m3u8 sources. The URL is often protocol-relative, e.g.
# `<source src="//sv.porn-xp.com/.../720.mp4">`; the consumer normalizes it to
# `https://...` and prefers 720 over 360.
_SOURCE_RE = re.compile(
    r'<source\s+src="(?P<url>(?:https?:)?//[^"]+\.(?:mp4|m3u8))"',
    re.IGNORECASE,
)
def _parse_mmss(s: str) -> int | None:
"""`16:12` → 972, `1:20:37` → 4837. None gdy format niepoprawny."""
parts = s.strip().split(":")
try:
if len(parts) == 2:
return int(parts[0]) * 60 + int(parts[1])
if len(parts) == 3:
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
except ValueError:
return None
return None
def _classify_tag(name: str) -> str:
"""Zwraca 'studio' | 'performer' | 'tag'.
Heurystyka oparta na sample analysis pornxp.ph tagów:
- Studio: zawiera `.` (`TheTeenBay.co`, `Clips4sale.tv`) LUB CamelCase concat
bez spacji (`LegalPorno`, `DirtyWivesClub`, `AnalMom`, `Clips4sale`)
- Performer: dokładnie 2 słowa Capital + Capital (`Alix Lynx`, `Reagan Foxx`)
- Tag/category: pozostałe lowercase single word LUB Cap single word
(`oral`, `Lesbians`, `Incest`, `BBC`)
Edge case: single-word studio jak "Brazzers", "Vixen" klasyfikowane jako tag.
To akceptowalne composite score scoring tags ma niższą wagę niż studio match,
więc fallback z 1+ performer match wystarczy.
"""
name = name.strip()
if not name:
return "tag"
if "." in name:
return "studio"
if " " in name:
parts = name.split()
if len(parts) == 2 and all(p[:1].isupper() for p in parts if p):
return "performer"
return "tag"
# No spaces:
# ALL-uppercase (BBC, POV, BDSM, MILF) → tag (skróty/akronimy)
if name.isupper():
return "tag"
# CamelCase mix (LegalPorno, AnalMom, DirtyWivesClub) → studio
if any(c.isupper() for c in name[1:]):
return "studio"
return "tag"
def _slugify(name: str) -> str:
"""`Alix Lynx` → `alix-lynx`. Lowercase, spaces→hyphens, alphanum only."""
return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
class PornXPScraper(BaseBrowseScraper):
    """Browse-mode scraper for the pornxp.ph latest-videos listing.

    `_extract_scene_urls` parses the listing cards and caches per-card
    metadata; `_parse_detail` combines that cache with the detail page (title,
    release year, tags, stream sources) into a `RawScene`.
    """

    sitetag = "pornxpph"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata keyed by absolute scene URL: written by
        # `_extract_scene_urls`, read by `_parse_detail`. Duration and
        # thumbnail come from here (the original author notes the detail page
        # only offers h1 + tags + sources). Every `_extract_scene_urls` call
        # replaces the cache, so it only describes the most recently parsed
        # listing page.
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL: the homepage for page <= 1, else `?p=<page>`."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/?p={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs found in a listing page, in document order.

        As a side effect, resets `self._listing_cache` and stores each card's
        `data_id`, `preview_mp4`, `thumb`, `duration_sec` and `title` under
        the scene's absolute URL.
        """
        self._listing_cache = {}
        seen: set[str] = set()
        out: list[str] = []
        for m in _LISTING_CARD_RE.finditer(listing_html):
            url = urljoin(_BASE, m.group("url"))
            if url in seen:
                continue
            seen.add(url)

            # Prefer `data-src` (the real image of a lazy-loaded card) over
            # `src`, which for lazy cards is only the spinner placeholder.
            # `data-src` must be tested first because `_IMG_SRC_RE` also
            # matches the `src="..."` tail of a `data-src="..."` attribute.
            img_attrs = m.group("img_attrs") or ""
            thumb = None
            if (dm := _IMG_DATASRC_RE.search(img_attrs)):
                thumb = dm.group(1)
            elif (sm := _IMG_SRC_RE.search(img_attrs)):
                src = sm.group(1)
                # Drop the spinner placeholder when no data-src is present.
                if "spinner" not in src.lower():
                    thumb = src
            if thumb and not thumb.startswith("http"):
                thumb = urljoin(_BASE, thumb)

            # Protocol-relative preview URLs are upgraded to https.
            preview = m.group("preview")
            if preview and preview.startswith("//"):
                preview = "https:" + preview

            self._listing_cache[url] = {
                "data_id": m.group("id"),
                "preview_mp4": preview,
                "thumb": thumb,
                "duration_sec": _parse_mmss(m.group("dur") or ""),
                "title": m.group("title").strip(),
            }
            out.append(url)
        return out

    @staticmethod
    def _pick_stream_url(sources: list[str]) -> str | None:
        """Choose the preferred stream URL, favouring the 720 rendition.

        Order of preference:
          1. first URL whose file name (last path segment) contains `720`;
          2. first URL containing `720` anywhere (the previous behaviour, kept
             as a fallback for layouts that put the quality in a directory);
          3. the first URL.
        Checking the file name first avoids picking a lower rendition merely
        because `720` happens to appear in a host or id segment of its URL.
        """
        if not sources:
            return None
        for candidate in sources:
            if "720" in candidate.rsplit("/", 1)[-1]:
                return candidate
        for candidate in sources:
            if "720" in candidate:
                return candidate
        return sources[0]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a detail page plus cached listing metadata.

        Returns None when no title can be determined from either the detail
        page `<h1>` or the listing card.
        """
        # Listing-card metadata (source of duration / thumbnail).
        meta = self._listing_cache.get(scene_url, {})
        data_id = meta.get("data_id")
        if not data_id:
            # The URL is not in the current listing cache; fall back to the
            # digits in the URL itself.
            # NOTE(review): the module docstring says listing URLs are
            # `<data-id><random suffix>`; this regex captures up to 12 digits,
            # so the suffix appears to end up in the id here — confirm whether
            # the suffix should be stripped.
            id_match = re.search(r"/videos/(\d{6,12})", scene_url)
            data_id = id_match.group(1) if id_match else None

        # Title: the detail-page h1 wins over the listing-card title.
        title = meta.get("title") or ""
        if (m := _H1_RE.search(detail_html)):
            title = m.group(1).strip() or title
        if not title:
            return None

        duration_sec = meta.get("duration_sec")
        thumb = meta.get("thumb")

        # Release year (`Released: 2016`). `RawScene.release_date` is a full
        # date, so January 1st of that year is stored as a placeholder; the
        # original author notes that downstream scoring only looks at the year.
        release_date: date | None = None
        if (m := _RELEASED_RE.search(detail_html)):
            try:
                year = int(m.group(1))
                if 1970 <= year <= 2100:
                    release_date = date(year, 1, 1)
            except ValueError:
                pass

        # Tags: only the first `<div class="tags">...</div>` block is read.
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf_slugs: set[str] = set()
        seen_tag_slugs: set[str] = set()
        if (block := _DETAIL_TAGS_BLOCK_RE.search(detail_html)):
            for tag_m in _TAG_LINK_RE.finditer(block.group("inner")):
                name = tag_m.group(2).strip()
                # The href segment may be URL-encoded (e.g. `%20` for space).
                decoded_name = unquote(tag_m.group(1)).strip()
                # The anchor text is preferred; the decoded href is a fallback.
                display = name or decoded_name
                kind = _classify_tag(display)
                slug = _slugify(display)
                if not slug:
                    continue
                ext_id = f"{self.sitetag}:{kind}:{slug}"
                if kind == "studio":
                    if studio is None:  # first studio-like tag wins
                        studio = RawStudio(external_id=ext_id, name=display, slug=slug)
                elif kind == "performer":
                    if slug not in seen_perf_slugs:
                        seen_perf_slugs.add(slug)
                        performers.append(RawPerformer(external_id=ext_id, name=display))
                else:
                    if slug not in seen_tag_slugs:
                        seen_tag_slugs.add(slug)
                        tags.append(RawTag(external_id=ext_id, name=display, slug=slug))

        # Playback: direct `<source src="...">` streams, with protocol-relative
        # URLs normalized to https.
        all_sources = [
            "https:" + src if src.startswith("//") else src
            for src in (sm.group("url") for sm in _SOURCE_RE.finditer(detail_html))
        ]
        stream_url = self._pick_stream_url(all_sources)

        # Perceptual hash of the thumbnail, when one is known and hashing
        # succeeds.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # Normalize page_url: the site serves a different random URL suffix on
        # every request for the same scene, and (per the original author) the
        # playback source is unique on `(origin, page_url)`, so the raw URL
        # would create duplicates on each run. Canonical form:
        # `/videos/<data_id>`.
        canonical_url = f"{_BASE}/videos/{data_id}" if data_id else scene_url
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=canonical_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]
        return RawScene(
            external_id=f"{self.sitetag}:{data_id}" if data_id else f"{self.sitetag}:{scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,22 @@
"""RedTube.com — direct HTML scrape search results.
Search: `https://www.redtube.com/?search=<q>&page=<n>`
Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL viewkey-only).
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class RedTubeScraper(BaseSearchScraper):
    """Search scraper configuration for redtube.com result pages."""

    sitetag = "redtubecom"
    _search_url_template = "https://www.redtube.com/?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
    )

    def _title_from_slug(self, slug):
        """Return a `redtube:<id>` placeholder title.

        The slug is only a numeric id, which makes no sense as a title; per
        the original author's note the real title is backfilled at resolve
        time.
        """
        placeholder_title = f"redtube:{slug}"
        return placeholder_title

View file

@ -0,0 +1,183 @@
"""shyfap.net — latest-vids browse scraper.
Browse-only (nie search-driven). Sitetag `shyfapnet`. Bogata metadata na detail
page'u (meta tags + body links): title, studio, performers, tags, duration,
description, upload_date, embed_url.
Pierwszy pilot scrapera browse-mode (2026-05-12) weryfikacja czy detail-page
metadata wystarcza do canonical match >5%. Jeśli tak rozszerzamy o porn00,
fullmovies, pornxp, freshporno, 4k69, hdporn.gg.
URL patterns:
- Listing: `/videos_1/` (page 1), `/videos_1/<n>/` (page 2+)
- Scene: `/video/<slug>_v<id>/`
- Embed: `/embed/<id>` (z og:video meta)
"""
from __future__ import annotations
import re
from datetime import date, datetime
from urllib.parse import urljoin
from app.connectors.base import RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
meta_content,
)
_BASE = "https://www.shyfap.net"
# Relative scene links on a listing page: `/video/<slug>_v<id>/`.
_SCENE_URL_RE = re.compile(r'href="(/video/[a-z0-9\-]+_v\d+/)"', re.IGNORECASE)
# Detail-page links. In each of the three patterns below group 1 is the slug,
# group 2 the numeric id and group 3 the link text up to the next `<`.
_STUDIO_LINK_RE = re.compile(
    r'href="/studio/([a-z0-9\-]+)_s(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_PORNSTAR_LINK_RE = re.compile(
    r'href="/pornstar/([a-z0-9\-]+)_p(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_TAG_LINK_RE = re.compile(
    r'href="/tag/([a-z0-9\-]+)_t(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Numeric id at the end of `/video/<slug>_v<id>/`. It is taken from the URL
# rather than from the `ya:ovs:id` meta tag so the two cannot drift apart.
# NOTE(review): in this module the id only feeds the `/embed/<id>` fallback;
# the scene `external_id` is built from the full scene URL — confirm whether
# that is intended.
_INTERNAL_ID_RE = re.compile(r"_v(\d+)/?$", re.IGNORECASE)
class ShyfapScraper(BaseBrowseScraper):
    """Browse-mode scraper for the shyfap.net latest-videos listing."""

    sitetag = "shyfapnet"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for a page.

        Page 1 (or lower) is `/videos_1/`; later pages append `/<page>/` after
        the constant `_1` suffix, e.g. `/videos_1/2/`.
        """
        first_page = f"{_BASE}/videos_1/"
        return first_page if page <= 1 else f"{_BASE}/videos_1/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs from a listing page, de-duplicated in order."""
        # dict keys keep first-seen order and drop repeats.
        unique_paths = dict.fromkeys(
            hit.group(1) for hit in _SCENE_URL_RE.finditer(listing_html)
        )
        return [urljoin(_BASE, path) for path in unique_paths]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a scene detail page.

        Returns None when neither `og:title` nor the `<title>` fallback yields
        a title.
        """
        # Title: og:title, falling back to the part of <title> that precedes a
        # `-` or `|` separator.
        title = meta_content(detail_html, property="og:title")
        if not title:
            title_tag = re.search(r"<title>([^<|]+)(?:\s*[-|])", detail_html, re.IGNORECASE)
            if title_tag:
                title = title_tag.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description")
        if not description:
            description = meta_content(detail_html, name="description")

        # Duration in seconds: <meta property="video:duration" content="2436">.
        raw_duration = meta_content(detail_html, property="video:duration")
        duration_sec: int | None = (
            int(raw_duration) if raw_duration and raw_duration.isdigit() else None
        )

        # <meta property="ya:ovs:upload_date" content="2021-12-07T09:07:11+03:00">.
        # This is the upload date on shyfap, not the studio release date; the
        # original author keeps it as an approximation for date-proximity
        # matching. Unparseable values leave the date unset.
        release_date: date | None = None
        uploaded_at = meta_content(detail_html, property="ya:ovs:upload_date")
        if uploaded_at:
            try:
                release_date = datetime.fromisoformat(uploaded_at).date()
            except ValueError:
                release_date = None

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Numeric id from the URL, used for the embed-URL fallback below.
        id_hit = _INTERNAL_ID_RE.search(scene_url)
        internal_id: str | None = id_hit.group(1) if id_hit else None

        # Embed URL: og:video, else `/embed/<id>` when the id is known.
        embed_url = meta_content(detail_html, property="og:video")
        if not embed_url and internal_id:
            embed_url = f"{_BASE}/embed/{internal_id}"

        # Studio: the first `/studio/<slug>_s<id>/` link on the page.
        studio: RawStudio | None = None
        studio_hit = _STUDIO_LINK_RE.search(detail_html)
        if studio_hit:
            slug, sid, name = studio_hit.groups()
            studio = RawStudio(
                external_id=f"shyfapnet:studio:{sid}",
                name=name.strip(),
                slug=slug,
            )

        # Performers: every `/pornstar/<slug>_p<id>/` link, first occurrence
        # per id, in document order.
        performers_by_id: dict[str, RawPerformer] = {}
        for hit in _PORNSTAR_LINK_RE.finditer(detail_html):
            pid = hit.group(2)
            if pid in performers_by_id:
                continue
            performers_by_id[pid] = RawPerformer(
                external_id=f"shyfapnet:performer:{pid}",
                name=hit.group(3).strip(),
            )
        performers = list(performers_by_id.values())

        # Tags: every `/tag/<slug>_t<id>/` link, first occurrence per id.
        tags_by_id: dict[str, RawTag] = {}
        for hit in _TAG_LINK_RE.finditer(detail_html):
            slug, tid = hit.group(1), hit.group(2)
            if tid in tags_by_id:
                continue
            tags_by_id[tid] = RawTag(
                external_id=f"shyfapnet:tag:{tid}", name=hit.group(3).strip(), slug=slug
            )
        tags = list(tags_by_id.values())

        # Playback source carries the embed URL only; no direct stream URL is
        # extracted here.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                embed_url=embed_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        # Perceptual hash of the thumbnail, when available and computable.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )

View file

@ -0,0 +1,19 @@
"""siska.video — direct HTML scrape.
Search: `https://siska.video/page/<n>/?s=<q>`.
Scene URL: `https://siska.video/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class SiskaScraper(BaseSearchScraper):
    """Search scraper configuration for siska.video.

    Only declarative attributes are set; all behaviour comes from
    BaseSearchScraper.
    """

    sitetag = "siskavideo"
    # Search URL with `{page}` and `{query}` placeholders.
    _search_url_template = "https://siska.video/page/{page}/?s={query}"
    # Absolute single-segment links `https://siska.video/<slug>/`. The `url`
    # group excludes the trailing slash; the slug needs at least two characters.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://siska\.video/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

View file

@ -0,0 +1,78 @@
"""SxyLandScraper — direct HTML scrape sxyland.com search.
Search: `https://sxyland.com/page/<n>/?s=<query>` returns results in the form
`https://sxyland.com/<numeric_id>/<slug>/`. Links without a numeric ID
(legal pages such as /18-u-s-c-2257/) are filtered out.
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Scene links `https://sxyland.com/<numeric_id>/<slug>/`: group 1 is the URL
# without the trailing slash, group 2 the numeric id, group 3 the slug.
# Requiring the numeric segment excludes links that lack one.
_SCENE_URL_RE = re.compile(r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"')
class SxyLandScraper(BaseDirectTubeScraper):
    """Search scraper for sxyland.com (direct HTML scrape of the search page)."""

    sitetag = "sxylandcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield one `RawScene` per matching scene link on a search result page.

        Args:
            query: Search text. It is also recorded as the single performer of
                every yielded scene.
            page: Result page number placed in the `/page/<n>/` URL segment.
            limit: Maximum number of scenes to yield; None means no limit. A
                limit of zero or less yields nothing and performs no request.

        Nothing is yielded when the fetch raises or the response status is not
        200. A result is kept only if its slug contains at least one query
        token of three or more characters; when the query has no such token,
        no slug filtering is applied.
        """
        # Previously the limit was only checked after a yield, so limit=0
        # still fetched the page and produced one scene.
        if limit is not None and limit <= 0:
            return

        performer_name = query.strip()
        q = urllib.parse.quote_plus(performer_name)
        url = f"https://sxyland.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            log.warning("sxyland search fetch failed: %s", e)
            return
        if r.status_code != 200:
            return

        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            scene_url = m.group(1) + "/"
            slug = m.group(3)
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue
            # The only title available on the result page is the slug itself.
            title = slug.replace("-", " ").strip()
            yield RawScene(
                external_id=f"sxylandcom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(origin="tube:sxylandcom", page_url=scene_url)
                ],
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:sxyland",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

View file

@ -0,0 +1,24 @@
"""sxyprn.com — direct HTML scrape search results.
Sxyprn search jest oparte na `?type=videos&query=<q>` GET endpoint który zwraca
HTML strony z linkami. Scene URL format: `https://sxyprn.com/post/<post_id>.html`.
Page'owanie sxyprn niespójne — często single-page results dla query (~24 wyników).
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class SxyPrnScraper(BaseSearchScraper):
    """Search scraper configuration for sxyprn.com result pages."""

    sitetag = "sxyprncom"
    _search_url_template = "https://sxyprn.com/?type=videos&query={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/post/(?P<slug>[a-z0-9]+))\.html"',
    )

    def _title_from_slug(self, slug: str) -> str:
        """Return a `sxyprn:<post id>` placeholder title.

        The post id is an unreadable hash; per the original author's note the
        real title is backfilled at resolve time.
        """
        placeholder_title = f"sxyprn:{slug}"
        return placeholder_title

View file

@ -0,0 +1,19 @@
"""watchporn.to — direct HTML scrape.
Search: `https://watchporn.to/page/<n>/?s=<q>` (WordPress).
Scene URL: `https://watchporn.to/videos/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class WatchPornScraper(BaseSearchScraper):
    """Search scraper configuration for watchporn.to.

    Only declarative attributes are set; all behaviour comes from
    BaseSearchScraper.
    """

    sitetag = "watchporn"
    # Search URL with `{page}` and `{query}` placeholders.
    _search_url_template = "https://watchporn.to/page/{page}/?s={query}"
    # Absolute links `https://watchporn.to/videos/<slug>/`. The `url` group
    # excludes the trailing slash; the slug needs at least two characters.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://watchporn\.to/videos/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

View file

@ -0,0 +1,19 @@
"""XHamster.com — direct HTML scrape search results.
Search: `https://xhamster.com/search/<q>?page=<n>`
Scene URL: `https://xhamster.com/videos/<slug>-<id>`
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class XHamsterScraper(BaseSearchScraper):
    """Search scraper configuration for xhamster.com.

    Only declarative attributes are set; all behaviour comes from
    BaseSearchScraper.
    """

    sitetag = "xhamstercom"
    # Search URL with `{query}` in the path and `{page}` as a query parameter.
    _search_url_template = "https://xhamster.com/search/{query}?page={page}"
    # Absolute links `https://xhamster.com/videos/<slug>`; the `slug` group is
    # the whole last path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xhamster\.com/videos/(?P<slug>[a-z0-9_\-]+))"',
        re.IGNORECASE,
    )

View file

@ -0,0 +1,19 @@
"""xmoviesforyou.com — direct HTML scrape.
Search: WordPress `?s=<q>` (lub `/page/<n>/?s=<q>` dla pagination).
Scene URL: `https://xmoviesforyou.com/<slug>/` (single segment).
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class XMoviesForYouScraper(BaseSearchScraper):
    """Search scraper configuration for xmoviesforyou.com.

    Only declarative attributes are set; all behaviour comes from
    BaseSearchScraper.
    """

    sitetag = "xmoviesforyoucom"
    # Search URL with `{page}` and `{query}` placeholders.
    _search_url_template = "https://xmoviesforyou.com/page/{page}/?s={query}"
    # Absolute single-segment links `https://xmoviesforyou.com/<slug>/`. The
    # `url` group excludes the trailing slash; the slug needs at least two
    # characters.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xmoviesforyou\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

View file

@ -0,0 +1,28 @@
"""XNXX.com — direct HTML scrape search results.
Search: `https://www.xnxx.com/search/<q>/<page-1>` (xnxx 0-indexed)
Scene URL: `https://www.xnxx.com/video-<id>/<slug>`
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class XnxxScraper(BaseSearchScraper):
    """Search scraper configuration for xnxx.com, whose result pages are 0-indexed."""

    sitetag = "xnxxcom"
    # The site expects `/<page-1>`; `search()` substitutes `{page}` itself
    # before delegating to the base class.
    _search_url_template = "https://www.xnxx.com/search/{query}/{page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/video-[a-z0-9]+/(?P<slug>[a-z0-9_\-]+))"',
        re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Delegate to the base search with the 1-based page mapped to 0-based.

        `page=1` becomes `/0` in the URL. Pages below 1 are treated as the
        first page instead of producing a negative index. The instance
        template is swapped only for the duration of the delegated search and
        is always restored.
        """
        zero_based_page = max(page - 1, 0)
        original = self._search_url_template
        self._search_url_template = original.replace("{page}", str(zero_based_page))
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = original

View file

@ -0,0 +1,33 @@
"""XVideos.com — direct HTML scrape search results.
Search: `https://www.xvideos.com/?k=<q>&p=<page-1>` (xvideos używa 0-indexed pages)
Scene URL: `https://www.xvideos.com/video<digits>/<slug>`
"""
from __future__ import annotations
import re
import urllib.parse
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class XVideosScraper(BaseSearchScraper):
    """Search scraper configuration for xvideos.com, whose result pages are 0-indexed."""

    sitetag = "xvideoscom"
    # The site's `p` parameter is 0-indexed; `search()` substitutes `{page}`
    # itself before delegating to the base class.
    _search_url_template = "https://www.xvideos.com/?k={query}&p={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/video[a-z0-9.\-]+/(?P<slug>[a-z0-9_\-]+))"',
        re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Delegate to the base search with the 1-based page mapped to 0-based.

        `page=1` becomes `&p=0` in the URL. Pages below 1 are treated as the
        first page instead of producing a negative index. The base class reads
        `self._search_url_template`, so the template is temporarily replaced
        by a copy with the page already filled in and always restored
        afterwards.
        """
        zero_based_page = max(page - 1, 0)
        original = self._search_url_template
        self._search_url_template = original.replace("{page}", str(zero_based_page))
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = original

View file

@ -0,0 +1,21 @@
"""xxxfree.watch — direct HTML scrape.
Domain: `xxxfree.watch` (sitetag `xxxfreewatch` is legacy from porn-app DEFAULT_SITETAGS).
Search: `https://xxxfree.watch/page/<n>/?s=<q>`.
Scene URL: `https://xxxfree.watch/<slug>/`.
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class XxxFreeWatchScraper(BaseSearchScraper):
    """Search scraper configuration for xxxfree.watch.

    Only declarative attributes are set; all behaviour comes from
    BaseSearchScraper.
    """

    sitetag = "xxxfreewatch"
    # Search URL with `{page}` and `{query}` placeholders.
    _search_url_template = "https://xxxfree.watch/page/{page}/?s={query}"
    # Absolute single-segment links `https://xxxfree.watch/<slug>/`. The `url`
    # group excludes the trailing slash; the slug needs at least two characters.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xxxfree\.watch/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

View file

@ -0,0 +1,22 @@
"""YouPorn.com — direct HTML scrape search results.
Search: `https://www.youporn.com/search/?query=<q>&page=<n>`
Scene URL: `https://www.youporn.com/watch/<id>/<slug>/`
"""
from __future__ import annotations
import re
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
class YouPornScraper(BaseSearchScraper):
    """Search scraper configuration for youporn.com result pages."""

    sitetag = "youporncom"
    _search_url_template = "https://www.youporn.com/search/?query={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/watch/(?P<id>\d+)/(?P<slug>[a-z0-9_\-]+))/?"',
        re.IGNORECASE,
    )

    def _slug_from_match(self, m, scene_url):
        """Return the `slug` named group of the scene-link match."""
        named_groups = m.groupdict()
        return named_groups["slug"]

View file

@ -0,0 +1,119 @@
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
`https://0dayxx.com/0day-porn-video/<slug>/` (lub czasem `/<category>/<slug>/`).
"""
from __future__ import annotations
import logging
import re
import urllib.parse
from collections.abc import Iterator
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
# Scene links on a search result page. Group 1 is the URL without the trailing
# slash, group 2 the slug. Only the listed first path segments are accepted
# (`0day-porn-video`, `latest-porn-videos`, `porn-bf`, `porn-videos`).
_SCENE_URL_RE = re.compile(
    r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
)
# `<meta property="og:title" content="...">`; matches only when `property`
# comes before `content`.
_OG_TITLE_RE = re.compile(
    r'<meta\s+property="og:title"\s+content="([^"]+)"', re.IGNORECASE
)
# `<meta property="og:image" content="...">`, same attribute-order constraint.
_OG_IMAGE_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
    """Fetch a 0dayxx detail page and extract `(real_title, thumbnail_url)`.

    Background (original author's notes): 0dayxx is a wrapper that embeds
    other hosts, so duration and tags are not available here, but `og:image`
    is and gives a small thumbnail. Without this fetch, scenes reached dedup
    with the slug as title and no thumbnail — the two weakest signals at
    once — which led to missed matches or false-positive merges (reported
    2026-05-09).

    Returns:
        `(title, thumbnail_url)`. Either element is None when the matching
        meta tag is absent or empty; both are None when the fetch raises or
        the response status is not 200.

    Both values are HTML-entity decoded, because they are read from HTML
    attribute values (`&amp;`, `&#8211;`, ... would otherwise leak into the
    title and thumbnail URL).
    """
    # Function-scope import keeps this block self-contained.
    import html

    try:
        r = browser_get(scene_url, timeout=20)
    except Exception as e:
        log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
        return None, None
    if r.status_code != 200:
        return None, None

    page_html = r.text
    title: str | None = None
    thumb: str | None = None
    if (m := _OG_TITLE_RE.search(page_html)):
        # Drop a ` | 0dayxx.com Daily...` style suffix that og:title sometimes
        # carries.
        title = html.unescape(m.group(1)).split("|")[0].strip() or None
    if (m := _OG_IMAGE_RE.search(page_html)):
        thumb = html.unescape(m.group(1)).strip() or None
    return title, thumb
class ZeroDayXXScraper(BaseDirectTubeScraper):
    """Search scraper for 0dayxx.com (direct HTML scrape of the search page)."""

    sitetag = "0dayxxcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield one `RawScene` per matching scene link on a search result page.

        Args:
            query: Search text. It is also recorded as the single performer of
                every yielded scene.
            page: Result page number placed in the `/page/<n>/` URL segment.
            limit: Maximum number of scenes to yield; None means no limit. A
                limit of zero or less yields nothing and performs no request.

        Nothing is yielded when the search fetch raises or the response status
        is not 200. A result is kept only if its slug contains at least one
        query token of three or more characters; when the query has no such
        token, no slug filtering is applied. Each kept result triggers one
        extra detail-page fetch for its real title and thumbnail.
        """
        # Previously the limit was only checked after a yield, so limit=0
        # still cost a search fetch plus one detail fetch and produced one
        # scene.
        if limit is not None and limit <= 0:
            return

        performer_name = query.strip()
        q = urllib.parse.quote_plus(performer_name)
        url = f"https://0dayxx.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:
            log.warning("0dayxx search fetch failed: %s", e)
            return
        if r.status_code != 200:
            return

        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}
        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            scene_url = m.group(1) + "/"
            slug = m.group(2)
            if scene_url in seen:
                continue
            seen.add(scene_url)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue
            # Prefer the detail page's title; fall back to the de-hyphenated
            # slug.
            real_title, thumb = _fetch_detail(scene_url)
            title = real_title or slug.replace("-", " ").strip()
            yield RawScene(
                external_id=f"0dayxxcom:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:0dayxxcom",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:0dayxx",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return

466
app/connectors/dooplay.py Normal file
View file

@ -0,0 +1,466 @@
"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic dla mangoporn/streamporn/pandamovies.
Te 3 strony to dokładnie ten sam template (theme=dooplay + PsyPlay player plugin),
więc parametryzujemy connector po `(base_url, source_name)` i odpalamy 3 instancje.
Listing: `/movies/page/N/` zwraca <a href="/movies/<slug>/"> per item.
Detail: `/movies/<slug>/` ma rich meta:
- <h1> tytuł (w class="data" wrapper)
- <a href="/year/YYYY/" rel="tag"> rok produkcji
- <a href="/studios/<slug>/" rel="tag"> studio
- <span class='duration'>NN mins.</span> długość
- <a href="/pornstar/<slug>/"> cast (multi)
- <a href="/genre/<slug>/"> tagi (multi)
- <div itemprop="description"><p>...</p></div> opis
- <span class="dt_rating_vgs" itemprop="ratingValue">N</span> rating 0-10
- <li ... data-fl-source="<embed_url>"><a href="<embed_link>">Host</a></li> player options
Player ma multi-host options (DoodStream, LuluStream, RPMShare etc.) każdy embed
URL idzie jako osobny `playback_source` z origin=`{site}:{host}` żeby później mobile
mógł wybrać czyim embedem chce odpalić scenę.
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.extractors import browser_get
from app.models.source import SourceKind
log = logging.getLogger(__name__)
# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
# ---- selectors (theme-agnostic — meant to work for any dooplay site) -------
# Listing item — two patterns depending on the site:
# 1. mangoporn: a plain <a href="https://site/movies/<slug>/"> without a class
#    (the theme renders an SEO-friendly URL directly in the grid)
# 2. streamporn/pandamovies: <a class="ml-mask jt" href="<base>/<slug>/">
#    (slug without the /movies/ prefix, e.g. /watch-xxx-...-adult-movie-online-free/)
# Both are captured through an alternation: group `url` for the ml-mask form
# (href must precede class in the tag), group `url2` for the /movies/ form.
_LIST_ITEM_RE = re.compile(
    r'<a\s+href="(?P<url>https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"'
    r"|"
    r'<a\s+href="(?P<url2>https?://[^"]+/movies/[a-z0-9-]+/)"',
    re.IGNORECASE,
)
# Tolerant title — mangoporn (dooplay) uses an <h1> inside class="data",
# streamporn/pandamovies (raw PsyPlay theme) use <h3 itemprop="name">.
# Group 1 is the itemprop="name" heading, group 2 the class="data" heading.
_TITLE_RE = re.compile(
    r'<h[1-6][^>]*\sitemprop="name"[^>]*>([^<]+)</h[1-6]>'
    r'|class="data"[^>]*>\s*<h[1-6][^>]*>([^<]+)</h[1-6]>',
    re.IGNORECASE | re.DOTALL,
)
# dooplay uses /year/, raw PsyPlay uses /release-year/. The same goes for the
# other slugs — themes inherit the base markup but customize URL vocabularies.
_YEAR_RE = re.compile(
    r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"', re.IGNORECASE
)
# Studio link: group 1 = slug, group 2 = display name.
_STUDIO_RE = re.compile(
    r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Duration: span class='duration' (dooplay) or
# <p><strong>Duration:</strong> X hrs. Y mins.</p> (PsyPlay)
_DURATION_SPAN_RE = re.compile(
    r"<span\s+class=['\"]duration['\"][^>]*>([^<]+)</span>", re.IGNORECASE
)
_DURATION_TEXT_RE = re.compile(
    r"<strong>\s*Duration:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Release date: span class='release_date' (dooplay) or
# <p><strong>Released Date:</strong> X</p> (PsyPlay).
# The span pattern also tolerates one stray `'` after the closing quote.
_RELEASE_DATE_SPAN_RE = re.compile(
    r"<span\s+class=['\"]release_date['\"]'?[^>]*>([^<]+)</span>", re.IGNORECASE
)
_RELEASE_DATE_TEXT_RE = re.compile(
    r"<strong>\s*Released?\s*Date:\s*</strong>\s*([^<]+)<", re.IGNORECASE
)
# Description body: everything up to the first closing </div>.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description"[^>]*>(.*?)</div>', re.IGNORECASE | re.DOTALL
)
# Numeric rating text (digits and dots).
_RATING_RE = re.compile(
    r'itemprop="ratingValue"[^>]*>([\d.]+)</span>', re.IGNORECASE
)
# Cast: dooplay /pornstar/, PsyPlay /actor/. Group 1 = slug, group 2 = name.
_PORNSTAR_RE = re.compile(
    r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Genre: the same /genre(s)/ path in both themes. Group 1 = slug, group 2 = name.
_GENRE_TAG_RE = re.compile(
    r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# Player options: data-fl-source is the hoster's original embed URL, data-fl-url
# is the page URL at the hoster. Old theme (mangoporn): `<li class="hosts-buttons-wpx">`.
# New theme (pandamovies since ~2026-04): `<div class="Rtable1-cell" data-fl-url=...
# data-fl-source=...>`. Attribute order must also be tolerated — the new theme
# emits url BEFORE source, the old one the other way round. Both patterns are
# captured by two separate regexes and consolidated by the consumer.
# NOTE(review): the greedy `[^>]*` right after the class attribute consumes the
# rest of the opening tag, so the optional `source`/`page` groups appear unable
# to capture anything — confirm and consider extracting them from the tag text.
_PLAYER_OPTION_RE = re.compile(
    r'<li[^>]*\bclass="hosts-buttons-wpx"[^>]*'
    r'(?:data-fl-source="(?P<source>[^"]*)"[^>]*)?'
    r'(?:data-fl-url="(?P<page>[^"]*)"[^>]*)?'
    r'>\s*<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
# New pandamovies markup: `<div class="Rtable1-cell" data-fl-* ...><a href=...>HostName</a></div>`.
# Attributes come in url→source order, and source is often empty
# (`data-fl-source=""` for doodstream/mixdrop/easyvidplayer). The WHOLE opening
# tag is captured in group(1) so that data-fl-source is guaranteed to belong to
# THIS specific div (an earlier 600-char window lookback could pick up the
# previous cell — cross-attribution doodstream→mixdrop entry, code-review #14).
_PLAYER_OPTION_DIV_RE = re.compile(
    r'(<div[^>]*\bclass="Rtable1-cell"[^>]*>)\s*'
    r'<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
# Pulls the data-fl-source value out of a single opening tag.
_DATA_FL_SOURCE_RE = re.compile(r'data-fl-source="([^"]*)"', re.IGNORECASE)
# Poster — JSON-LD `thumbnailUrl` is the most stable (per the author, every
# dooplay/PsyPlay theme with SEO has a JSON-LD VideoObject schema). Fallback to
# the class="poster" img for old installs without the schema. Third fallback:
# the og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE
)
_POSTER_RE = re.compile(
    r'class="poster"[^>]*>\s*<img\s+[^>]*src="([^"]+)"', re.IGNORECASE
)
_POSTER_OG_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"', re.IGNORECASE
)
# Minutes extractor. NOTE(review): not referenced in the part of the file
# visible here (the parser uses inline patterns) — possibly dead code.
_DURATION_MINS_RE = re.compile(r"(\d+)\s*min", re.IGNORECASE)
class DooplayConnector(BaseMovieConnector):
    """Generic dooplay scraper. Instantiated per-site via subclasses below.

    Subclasses must define the class-level attributes ``base_url`` (site root,
    e.g. ``https://example.com``) and ``name`` (source name used in logs and
    in playback-source origins).
    """

    kind = SourceKind.scraper
    base_url: str
    name: str

    def __init__(self, *, timeout: float = 30.0):
        """Validate the subclass configuration and store the request timeout.

        Raises:
            RuntimeError: when the subclass lacks a non-empty ``base_url`` or
                ``name``.
        """
        if not getattr(self, "base_url", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `base_url`")
        if not getattr(self, "name", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `name`")
        self._timeout = timeout

    def close(self) -> None:
        """No-op: this connector holds no persistent client."""
        pass

    def _fetch(self, url: str) -> str:
        """Download ``url`` (absolute, or a path relative to ``base_url``).

        Per the original author's note, ``browser_get`` is used with browser
        impersonation because PsyPlay sites sometimes block plain httpx
        (Python TLS fingerprint) with 500/403.

        Raises:
            httpx.HTTPStatusError: when the response status is >= 400.
        """
        # Normalize once so a trailing slash on base_url cannot produce `//`.
        base = self.base_url.rstrip("/")
        if not url.startswith("http"):
            url = base + url
        headers = {
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            "Referer": base + "/",
        }
        r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True)
        if r.status_code >= 400:
            # Re-expressed as an httpx error so callers need a single except clause.
            raise httpx.HTTPStatusError(
                f"{r.status_code} for {url}",
                request=None,  # type: ignore[arg-type]
                response=httpx.Response(r.status_code, text=r.text[:200]),
            )
        return r.text

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl listing pages in order and yield one ``RawMovie`` per detail page.

        Args:
            since: Accepted for interface compatibility; this implementation
                does not use it.
            limit: Stop after this many movies have been yielded.

        The crawl ends when a listing request fails, a listing page is empty,
        a listing page contains no URL that has not been seen before, or
        ``limit`` is reached. Detail pages that fail to download or parse are
        skipped.
        """
        seen = 0
        page = 1
        seen_urls: set[str] = set()
        while True:
            try:
                urls = list(self._fetch_listing(page))
            except httpx.HTTPError as e:
                log.warning("%s listing page=%d failed: %s", self.name, page, e)
                return
            if not urls:
                log.info("%s: empty page=%d, stop", self.name, page)
                return
            # Order-preserving de-duplication against everything crawled so far.
            fresh = [u for u in dict.fromkeys(urls) if u not in seen_urls]
            if not fresh:
                # A site that answers out-of-range pages with an already-seen
                # page would otherwise keep this loop running forever.
                log.info("%s: page=%d has no new urls, stop", self.name, page)
                return
            for url in fresh:
                seen_urls.add(url)
                try:
                    movie = self._fetch_detail(url)
                except httpx.HTTPError as e:
                    log.warning("%s detail %s failed: %s", self.name, url, e)
                    continue
                if movie is None:
                    continue
                yield movie
                seen += 1
                if limit is not None and seen >= limit:
                    return
            page += 1

    def _fetch_listing(self, page: int) -> Iterator[str]:
        """Yield detail-page URLs found on listing page ``page``.

        Only URLs whose hostname equals the hostname of ``base_url`` are
        yielded; unparsable URLs are skipped.
        """
        from urllib.parse import urlparse

        path = self._listing_path(page)
        text = self._fetch(path)
        site_host = urlparse(self.base_url).hostname
        for m in _LIST_ITEM_RE.finditer(text):
            url = m.group("url") or m.group("url2")
            if not url:
                continue
            try:
                if urlparse(url).hostname != site_host:
                    continue
            except Exception:
                # Malformed href: ignore the item rather than abort the page.
                continue
            yield url

    def _listing_path(self, page: int) -> str:
        """Return the site-relative path of listing page ``page`` (1-based)."""
        return "/movies/" if page == 1 else f"/movies/page/{page}/"

    def _fetch_detail(self, url: str) -> RawMovie | None:
        """Download a detail page and parse it; the slug is the last path segment."""
        from urllib.parse import urlparse

        path = urlparse(url).path.rstrip("/")
        slug = path.split("/")[-1] or "root"
        text = self._fetch(url)
        return _parse_dooplay_detail(
            slug=slug, page_url=url, html=text,
            source_name=self.name, base_url=self.base_url,
        )
def _parse_dooplay_detail(
    *, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
) -> RawMovie | None:
    """Parse a dooplay/PsyPlay detail page into a ``RawMovie``.

    Args:
        slug: Last path segment of the detail URL; used as ``external_id``.
        html: Full HTML of the detail page.
        source_name: Connector name; prefixes studio/performer/tag external ids
            and playback-source origins.
        base_url: Site root, used only to build ``page_url`` when it is not given.
        page_url: Canonical URL of the detail page; defaults to
            ``<base_url>/movies/<slug>/``.

    Returns:
        The parsed movie, or ``None`` when no title could be found.
    """
    from urllib.parse import urlparse

    m_title = _TITLE_RE.search(html)
    if not m_title:
        log.warning("%s: no title in %s", source_name, slug)
        return None
    title = _decode_html((m_title.group(1) or m_title.group(2)).strip())

    m_year = _YEAR_RE.search(html)
    release_year = int(m_year.group(1)) if m_year else None

    studio: RawStudio | None = None
    m_studio = _STUDIO_RE.search(html)
    if m_studio:
        studio_slug = m_studio.group(1)
        studio_name = _decode_html(m_studio.group(2).strip())
        studio = RawStudio(
            external_id=f"{source_name}:{studio_slug}",
            name=studio_name,
            slug=studio_slug,
        )

    duration_sec: int | None = None
    m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
    if m_dur:
        text = m_dur.group(1)
        # May be "32 mins." (dooplay) or "1 hrs. 12 mins." (PsyPlay).
        m_h = re.search(r"(\d+)\s*hr", text, re.IGNORECASE)
        m_m = re.search(r"(\d+)\s*min", text, re.IGNORECASE)
        if m_h or m_m:
            duration_sec = (int(m_h.group(1)) * 3600 if m_h else 0) + (int(m_m.group(1)) * 60 if m_m else 0)

    release_date: date | None = None
    m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
    if m_rd:
        text = m_rd.group(1).strip()
        # Try the known formats in order; an unparsable date stays None.
        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"):
            try:
                release_date = datetime.strptime(text, fmt).date()
                break
            except ValueError:
                continue

    description: str | None = None
    m_desc = _DESCRIPTION_RE.search(html)
    if m_desc:
        description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None

    rating: float | None = None
    m_rating = _RATING_RE.search(html)
    if m_rating:
        try:
            rating = float(m_rating.group(1))
        except ValueError:
            # The pattern admits strings like "1.2.3"; treat them as "no rating".
            pass

    poster_url: str | None = None
    # JSON-LD first, then the poster <img>, then og:image; placeholders are skipped.
    for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
        m = rgx.search(html)
        if m:
            candidate = m.group(1).strip()
            if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
                poster_url = candidate
                break

    # Performers — per the original note, only the cast section carries
    # /pornstar/<slug>/ links. Duplicates may slip through here; dedup is done
    # in the resolver (by performer_id).
    performers = [
        RawPerformer(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
        )
        for m in _PORNSTAR_RE.finditer(html)
    ]
    tags = [
        RawTag(
            external_id=f"{source_name}:{m.group(1)}",
            name=_decode_html(m.group(2).strip()),
            slug=m.group(1),
        )
        for m in _GENRE_TAG_RE.finditer(html)
    ]

    if page_url is None:
        page_url = f"{base_url.rstrip('/')}/movies/{slug}/"

    # Playback sources: each host (Doodstream/Lulu/RPM/...) is a separate entry,
    # de-duplicated by href so the same host is not added twice. The raw landing
    # page (origin=source_name, without :host) is appended ONLY when no
    # sub-hoster was found — otherwise it misleads the user (opens a WebView with
    # ads instead of video; bug report 2026-05-16: "mangoporn redirects to the
    # site, full-screen ad").
    playback_sources: list[RawPlaybackSource] = []
    seen_hrefs: set[str] = set()
    # File-download (non-streamable) hosters + malware. The mobile player cannot
    # play them — rapidgator/nitroflare/frdl serve .zip/.rar/.mp4 downloads
    # (premium login required), streamtape has a drive-by .reg malware. They are
    # skipped at ingest so the UI is not cluttered with dead content
    # (bug report 2026-05-18).
    SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"}

    def _emit_host_entry(href: str, source: str | None) -> None:
        """Append one playback source for ``href`` unless duplicate or blocked."""
        href = href.strip()
        if not href or href in seen_hrefs:
            return
        seen_hrefs.add(href)
        try:
            host = urlparse(href).hostname or "unknown"
            # Second-to-last label of the hostname, e.g. "dood" for "www.dood.to".
            host_short = host.split(".")[-2] if host.count(".") >= 1 else host
        except Exception:
            host_short = "unknown"
        if host_short.lower() in SKIP_HOSTERS:
            return
        playback_sources.append(
            RawPlaybackSource(
                origin=f"{source_name}:{host_short}",
                page_url=href,
                embed_url=source or href,
                thumbnail_url=poster_url,
                duration_sec=duration_sec,
            )
        )

    # Old `<li class="hosts-buttons-wpx">` markup (mangoporn).
    for m in _PLAYER_OPTION_RE.finditer(html):
        source = (m.group("source") or "").strip()
        if not source:
            # The greedy `[^>]*` in _PLAYER_OPTION_RE swallows the attributes
            # before the optional `source` group gets a chance to match, so the
            # group is effectively always empty. Read data-fl-source from THIS
            # opening tag instead (the match starts at `<li`, so the first `>`
            # after it closes that tag).
            tag_end = html.find(">", m.start())
            opening_tag = html[m.start():tag_end + 1] if tag_end != -1 else ""
            src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
            source = src_match.group(1).strip() if src_match else ""
        _emit_host_entry(m.group("href") or "", source or None)

    # New `<div class="Rtable1-cell">` markup (pandamovies since ~2026-04 + new
    # streamporn instances). data-fl-source is optional — the WHOLE opening tag
    # is captured in group(1) and data-fl-source is extracted from THAT tag (not
    # from a lookback window over the HTML, which could pick the previous cell).
    for m in _PLAYER_OPTION_DIV_RE.finditer(html):
        href = m.group("href") or ""
        opening_tag = m.group(1)
        src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
        source = (src_match.group(1).strip() if src_match else "") or None
        _emit_host_entry(href, source)

    if not playback_sources:
        # No sub-hosters found — fall back to the landing page (mobile opens it
        # in a WebView). Done ONLY when there is no alternative, otherwise the
        # landing page is just an unnecessary ad page.
        playback_sources.append(
            RawPlaybackSource(
                origin=source_name,
                page_url=page_url,
                thumbnail_url=poster_url,
            )
        )

    return RawMovie(
        external_id=slug,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        duration_sec=duration_sec,
        rating=rating,
        poster_url=poster_url,
        url=page_url,
        studio=studio,
        performers=performers,
        tags=tags,
        playback_sources=playback_sources,
        raw={"slug": slug, "html_len": len(html)},
    )
# ---- per-site instances ----------------------------------------------------
class StreampornConnector(DooplayConnector):
    """Dooplay connector bound to streamporn.nl."""

    base_url = "https://streamporn.nl"
    name = "streamporn"
class PandamoviesConnector(DooplayConnector):
    """Dooplay connector bound to pandamovies.pw."""

    base_url = "https://pandamovies.pw"
    name = "pandamovies"
class MangopornConnector(DooplayConnector):
    """Dooplay connector bound to mangoporn.net."""

    base_url = "https://mangoporn.net"
    name = "mangoporn"
# ---------------------------------------------------------------------------
# Helpers (zduplikowane z paradisehill.py — celowo, żeby connectory były niezależne)
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'",
"&apos;": "'", "&nbsp;": " ", "&rsquo;": "'", "&lsquo;": "'",
"&rdquo;": '"', "&ldquo;": '"', "&hellip;": "...", "&mdash;": "", "&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s

View file

@ -0,0 +1,325 @@
"""Paradisehill connector — primary source dla movies (full-length adult films).
Site notes:
- Age-gate: wymagany cookie `is18=1` (POST /is18/ zwraca 400 z curla, ale samo dorzucenie
cookie do GET-a działa site jest tolerancyjny).
- Listing: `/all/?sort=created_at&page=N` paginacja po 28 filmów, mikro-data Schema.org Movie.
- Detail: `/<hex_id>/` pełne meta + Video.js playlist (chaptery jako "Part 1/2/3").
Co ekstraktujemy:
- Schema.org microdata: name, description, director, datePublished (upload), image, thumbnailUrl
- Studio: link `/studio/<id>/{name}` (tylko link dostarcza nazwę i external_id)
- Genres: ze Schema.org `itemprop="genre"` (pierwszy = movie's main genre)
- Year: parsowany z description gdy obecny ("This 1999 film..."), bo `datePublished` to upload_date
- Chapters: liczba `<li>...Part N</li>` w playliście Video.js
- Playback: na MVP `page_url` only Video.js playlist URL jest dynamicznie ładowany przez JS
i wymaga login session. Mobile może otworzyć page w WebView (degradacja lepsza niż brak).
External_id: hex slug z URL-a (np. `259448f6b75ee` z `/259448f6b75ee/`).
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime
from typing import Any
import httpx
from app.connectors.base import (
BaseMovieConnector,
RawMovie,
RawMovieChapter,
RawPerformer,
RawPlaybackSource,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
log = logging.getLogger(__name__)
# Site root; also used to absolutize image paths and build page URLs.
BASE_URL = "https://paradisehill.cc"
# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
LISTING_PATH = "/all/"  # queried as ?sort=created_at&page=N
SOURCE_NAME = "paradisehill"
# Microdata extraction — per the original author, the Schema.org tags are
# stable and survive light theme changes (the yii2 widget renders them
# invariantly).
_TITLE_RE = re.compile(
    r'<h1\s+class="title-inside"\s+itemprop="name">([^<]+)</h1>', re.IGNORECASE
)
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)</', re.IGNORECASE)
# Description text, allowing inline tags inside the span.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">([^<]+(?:<[^>]+>[^<]+)*)</span>', re.IGNORECASE | re.DOTALL
)
_DATE_PUBLISHED_RE = re.compile(
    r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# Poster / thumbnail: site-relative /images/... paths.
_POSTER_RE = re.compile(
    r'<img\s+itemprop="image"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
_THUMBNAIL_RE = re.compile(
    r'<img\s+itemprop="thumbnailUrl"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Studio link: group 1 = numeric id, group 2 = display name.
# NOTE(review): the href must end right after `/studio/<id>/`, while the module
# docstring describes links as `/studio/<id>/{name}` — confirm against live markup.
_STUDIO_LINK_RE = re.compile(r'<a\s+href="/studio/(\d+)/"[^>]*>([^<]+)</a>', re.IGNORECASE)
# Playlist entry: group 1 = data-index, group 2 = entry label.
_CHAPTER_RE = re.compile(
    r'<a\s+href="#"\s+class="js-list-item"\s+data-index="(\d+)">([^<]+)</a>',
    re.IGNORECASE,
)
# Listing page item: group 1 = hex id of the movie.
_LIST_ITEM_RE = re.compile(
    r'<div\s+class="item\s+list-film-item"[^>]*>\s*'
    r'<a\s+href="/([0-9a-f]+)/"[^>]*>',
    re.IGNORECASE,
)
# Year in the description: a 4-digit year in a sensible range (1950-2039).
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")
# Year in the title (e.g. "Title (1999)"); any 4 digits in parentheses match.
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")
class ParadisehillConnector(BaseMovieConnector):
    """Scraper connector for paradisehill listing and detail pages."""

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the HTTP client with the age-gate cookie pre-set."""
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers={
                "User-Agent": USER_AGENT,
                # Every request needs the is18 cookie; pre-set to skip the age gate.
                "Cookie": "is18=1",
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml",
            },
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing (newest first, per the author).

        Args:
            since: Stop once a movie's ``release_date`` (the site's
                datePublished) falls on a day strictly before ``since``.
            limit: Stop after this many movies have been yielded.

        Per the original note: ~28 movies/page and the site grows ~5/day, so a
        full crawl is thousands of pages — production uses ``since`` to fetch
        only the delta since the previous run.

        The boundary is evaluated at DATE granularity. ``release_date`` carries
        no time of day, so comparing its midnight against a timestamp would
        classify movies published later on the same day as ``since`` as "old",
        stop the crawl, and skip them for good. Movies dated on the boundary
        day are therefore still yielded.
        """
        since_date: date | None = None
        if since is not None:
            # Aware values are normalized to UTC; naive values are taken as is.
            since_date = (since.astimezone(UTC) if since.tzinfo is not None else since).date()

        seen = 0
        page = 1
        while True:
            try:
                ids = list(self._fetch_listing_page(page))
            except httpx.HTTPError as e:
                log.warning("paradisehill listing page=%d failed: %s", page, e)
                return
            if not ids:
                log.info("paradisehill: empty listing page=%d, stop", page)
                return
            for mid in ids:
                try:
                    movie = self._fetch_detail(mid)
                except httpx.HTTPError as e:
                    log.warning("paradisehill detail %s failed: %s", mid, e)
                    continue
                if movie is None:
                    continue
                # The listing is chronological, so the first movie older than
                # the boundary day ends the whole crawl.
                if (
                    since_date is not None
                    and movie.release_date is not None
                    and movie.release_date < since_date
                ):
                    log.info(
                        "paradisehill: hit since boundary at %s (%s), stop",
                        mid, movie.release_date,
                    )
                    return
                yield movie
                seen += 1
                if limit is not None and seen >= limit:
                    return
            page += 1

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies on listing page ``page``.

        Raises:
            httpx.HTTPError: on transport errors or a non-success status.
        """
        url = f"{LISTING_PATH}?sort=created_at&page={page}"
        r = self._client.get(url)
        r.raise_for_status()
        for m in _LIST_ITEM_RE.finditer(r.text):
            yield m.group(1)

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Download and parse the detail page of movie ``hex_id``.

        Raises:
            httpx.HTTPError: on transport errors or a non-success status.
        """
        url = f"/{hex_id}/"
        r = self._client.get(url)
        r.raise_for_status()
        return _parse_detail(hex_id, r.text)
def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Turn a paradisehill detail page into a ``RawMovie``.

    Args:
        hex_id: Hex slug of the movie; becomes ``external_id`` and the page URL.
        html: Full HTML of the detail page.

    Returns:
        The parsed movie, or ``None`` when the title is missing (broken template).
    """
    title_hit = _TITLE_RE.search(html)
    if not title_hit:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(title_hit.group(1).strip())

    # Director, with placeholder values treated as "unknown".
    director = None
    director_hit = _DIRECTOR_RE.search(html)
    if director_hit:
        director = _decode_html(director_hit.group(1).strip())
        if director and director.lower() in ("unknown", "n/a", "-"):
            director = None

    description = None
    desc_hit = _DESCRIPTION_RE.search(html)
    if desc_hit:
        description = _decode_html(_strip_tags(desc_hit.group(1)).strip())

    # datePublished is the upload date; an unparsable value leaves it unset.
    release_date: date | None = None
    date_hit = _DATE_PUBLISHED_RE.search(html)
    if date_hit:
        try:
            release_date = datetime.fromisoformat(date_hit.group(1)).date()
        except ValueError:
            pass

    # Year — first from the title, then from the description. Per the original
    # note, datePublished is paradisehill's upload date (e.g. 2026-05), not the
    # production year (e.g. 1999), so it is useless for year filtering.
    year_hit = _YEAR_IN_TITLE_RE.search(title)
    if not year_hit and description:
        year_hit = _YEAR_IN_DESC_RE.search(description)
    release_year: int | None = int(year_hit.group(1)) if year_hit else None

    poster_hit = _POSTER_RE.search(html)
    poster_url: str | None = BASE_URL + poster_hit.group(1) if poster_hit else None

    thumb_hit = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = BASE_URL + thumb_hit.group(1) if thumb_hit else None

    studio: RawStudio | None = None
    studio_hit = _STUDIO_LINK_RE.search(html)
    if studio_hit:
        studio = RawStudio(
            external_id=f"paradisehill:{studio_hit.group(1)}",
            name=_decode_html(studio_hit.group(2).strip()),
        )

    # Genres — taken from the movie's own block-inside section only, because
    # recommended films also carry itemprop="genre". When the section cannot be
    # located, the first 8000 characters of the page are scanned instead.
    section = re.search(
        r'<div\s+class="block-inside"[^>]*itemtype="http://schema\.org/Movie"[^>]*>'
        r'(.*?)</div>\s*</div>\s*<div\s+class="similar',
        html,
        re.DOTALL,
    )
    scope = section.group(1) if section else html[:8000]
    tags: list[RawTag] = []
    for genre_hit in re.finditer(r'itemprop="genre"[^>]*>([^<]+)</', scope, re.IGNORECASE):
        if len(tags) >= 10:
            # Cap reached; nothing further can be added.
            break
        genre = _decode_html(genre_hit.group(1).strip())
        if not genre:
            continue
        tags.append(RawTag(name=genre, slug=_slugify(genre)))

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(entry.group(1)),
            title=_decode_html(entry.group(2).strip()),
        )
        for entry in _CHAPTER_RE.finditer(html)
    ]

    page_url = f"{BASE_URL}/{hex_id}/"
    landing = RawPlaybackSource(
        origin=SOURCE_NAME,
        page_url=page_url,
        thumbnail_url=poster_url,
    )
    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        # Per the original note: paradisehill rarely has cast links; cast is
        # filled in from mirrors.
        performers=[],
        tags=tags,
        chapters=chapters,
        playback_sources=[landing],
        raw={"hex_id": hex_id, "html_len": len(html)},
    )
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_TAG_RE = re.compile(r"<[^>]+>")
def _strip_tags(s: str) -> str:
return _TAG_RE.sub("", s)
_HTML_ENTITIES = {
"&amp;": "&",
"&lt;": "<",
"&gt;": ">",
"&quot;": '"',
"&#39;": "'",
"&apos;": "'",
"&nbsp;": " ",
"&rsquo;": "'",
"&lsquo;": "'",
"&rdquo;": '"',
"&ldquo;": '"',
"&hellip;": "...",
"&mdash;": "",
"&ndash;": "",
}
def _decode_html(s: str) -> str:
for k, v in _HTML_ENTITIES.items():
s = s.replace(k, v)
# Numeric entities
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
return s
_SLUG_RE = re.compile(r"[^a-z0-9]+")
def _slugify(s: str) -> str:
return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"

405
app/connectors/stashdb.py Normal file
View file

@ -0,0 +1,405 @@
"""StashDB GraphQL connector.
Endpoint: https://stashdb.org/graphql (auth: header `ApiKey: <key>`)
Query używamy `queryScenes(input: {sort, direction, page, per_page})`. StashDB nie udostępnia
typowego date-since filtra w SceneQueryInput, więc deltę robimy klient-side: sortujemy po
UPDATED_AT DESC i przerywamy gdy `updated < since`.
Schema fields kluczowe (wg https://github.com/stashapp/stash-box/blob/master/graphql/schema/schema.graphql):
Scene { id title details date duration director code urls{url site{name}}
studio{id name parent{id name}}
performers{ as performer{ id name aliases gender birthdate{date} country } }
tags{ id name }
fingerprints{ hash algorithm duration } }
Cross-reference do TPDB: `urls[].site.name` zwykle zawiera "ThePornDB" + URL z UUID
(format: https://theporndb.net/scenes/<uuid>). Wyciągamy ten UUID jako tpdb cross-ref;
ingest_orchestrator może go potem użyć do path 2 (cross-source UUID).
"""
from __future__ import annotations
import logging
import re
from collections.abc import Iterator
from datetime import UTC, date, datetime
from typing import Any
import httpx
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from app.config import get_settings
from app.connectors.base import (
BaseConnector,
RawFingerprint,
RawPerformer,
RawScene,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
log = logging.getLogger(__name__)
# GraphQL document used for every scene listing; the caller supplies `$input`
# (a SceneQueryInput carrying paging, sorting and optional filters).
# NOTE(review): both `release_date` and `date` are requested while the module
# docstring's schema summary lists only `date` — confirm both fields exist in
# the current stash-box schema.
SCENES_QUERY = """
query QScenes($input: SceneQueryInput!) {
queryScenes(input: $input) {
count
scenes {
id
title
details
release_date
date
duration
director
code
updated
urls { url site { name } }
studio {
id name
parent { id name }
}
performers {
as
performer {
id
name
aliases
gender
birthdate { date }
country
}
}
tags { id name }
fingerprints { hash algorithm duration }
}
}
}
"""
# UUID v4-ish pattern (relaxed): 8-4-4-4-12 hex digits, case-insensitive,
# without checking the version/variant nibbles.
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)
class StashDBConnector(BaseConnector):
    """Connector that reads scenes and performers from the StashDB GraphQL API."""

    kind = SourceKind.stashdb
    name = "stashdb"

    def __init__(
        self,
        *,
        api_key: str | None = None,
        url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Resolve credentials and endpoint, falling back to application settings.

        Raises:
            RuntimeError: when no API key is given and none is configured.
        """
        cfg = get_settings()
        self.api_key = api_key or cfg.stashdb_api_key
        if not self.api_key:
            raise RuntimeError("STASHDB_API_KEY is not set")
        self.url = url or cfg.stashdb_graphql_url
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a fresh HTTP client carrying the auth and JSON headers."""
        request_headers = {
            "ApiKey": self.api_key,
            "Accept": "application/json",
            "Content-Type": "application/json",
            "User-Agent": "goon/0.1",
        }
        return httpx.Client(headers=request_headers, timeout=self.timeout)

    @retry(
        retry=retry_if_exception_type((httpx.TransportError, httpx.HTTPStatusError)),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _post(self, client: httpx.Client, payload: dict[str, Any]) -> dict[str, Any]:
        """POST one GraphQL request and return the ``data`` member of the reply.

        Transport errors and HTTP status errors are retried by the decorator
        (up to 5 attempts, exponential wait); the last error is re-raised.

        Raises:
            httpx.HTTPStatusError: on 429 or any other error status.
            RuntimeError: when the reply carries GraphQL ``errors``.
        """
        response = client.post(self.url, json=payload)
        if response.status_code == 429:
            raise httpx.HTTPStatusError("rate limited", request=response.request, response=response)
        response.raise_for_status()
        body = response.json()
        errors = body.get("errors")
        if errors:
            raise RuntimeError(f"stashdb graphql errors: {errors}")
        return body["data"]

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes ordered by last update, newest first.

        Args:
            since: Stop at the first scene whose ``updated`` is older than this.
            limit: Stop after this many scenes have been yielded.
        """
        ordering = {"sort": "UPDATED_AT", "direction": "DESC"}
        yield from self._paginate(extra_input=ordering, since=since, limit=limit)

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look up a performer id via ``queryPerformers(input: {name: <name>})``.

        Per the original note, the GraphQL ``name`` filter is a case-insensitive
        substring match. The id of the first case-insensitive exact match is
        returned if there is one, otherwise the id of the first result.
        Returns ``None`` when the request fails or yields no performers.
        """
        query = (
            "query QPerformers($input: PerformerQueryInput!) {"
            "  queryPerformers(input: $input) { performers { id name } }"
            "}"
        )
        variables = {"input": {"name": name, "per_page": 5}}
        with self._client() as client:
            try:
                data = self._post(client, {"query": query, "variables": variables})
            except Exception as e:
                # Best effort: a failed lookup is reported as "not found".
                log.warning("stashdb queryPerformers name=%s failed: %s", name, e)
                return None

        candidates = (data.get("queryPerformers") or {}).get("performers") or []
        if not candidates:
            return None
        wanted = name.strip().lower()
        exact = [c for c in candidates if (c.get("name") or "").strip().lower() == wanted]
        chosen = exact[0] if exact else candidates[0]
        return chosen.get("id")

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes of the performer with the given canonical UUID.

        Per the original note: ``SceneQueryInput.performers`` is a
        ``MultiIDCriterionInput { value, modifier }``; with modifier INCLUDES a
        scene must have ALL listed UUIDs, which for a single UUID means simply
        that performer's scenes.
        """
        criteria = {
            "performers": {
                "value": [performer_external_id],
                "modifier": "INCLUDES",
            },
            "sort": "DATE",
            "direction": "DESC",
        }
        # A performer-scoped pull takes the whole history, hence no `since`.
        yield from self._paginate(extra_input=criteria, since=None, limit=limit)

    def fetch_scenes_for_studio(
        self,
        studio_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes of the studio with the given canonical UUID.

        Mirrors ``fetch_scenes_for_performer`` with the ``studios`` criterion
        (also a ``MultiIDCriterionInput { value, modifier }`` per the original
        note).
        """
        criteria = {
            "studios": {
                "value": [studio_external_id],
                "modifier": "INCLUDES",
            },
            "sort": "DATE",
            "direction": "DESC",
        }
        yield from self._paginate(extra_input=criteria, since=None, limit=limit)

    def _paginate(
        self,
        *,
        extra_input: dict[str, Any],
        since: datetime | None,
        limit: int | None,
    ) -> Iterator[RawScene]:
        """Walk result pages and yield parsed scenes.

        Iteration ends on an empty page, on a short page (fewer than
        ``per_page`` items), on the first scene updated before ``since``, or
        once ``limit`` scenes were yielded. Scenes that fail to parse are
        skipped.
        """
        produced = 0
        page_no = 1
        with self._client() as client:
            while True:
                # extra_input is applied last, so it may override paging keys.
                query_input: dict[str, Any] = {"page": page_no, "per_page": self.per_page}
                query_input.update(extra_input)
                data = self._post(
                    client, {"query": SCENES_QUERY, "variables": {"input": query_input}}
                )
                batch = (data.get("queryScenes") or {}).get("scenes") or []
                if not batch:
                    return
                for item in batch:
                    if since is not None and _updated_before(item, since):
                        return
                    scene = _parse_scene(item)
                    if scene is None:
                        continue
                    yield scene
                    produced += 1
                    if limit is not None and produced >= limit:
                        return
                if len(batch) < self.per_page:
                    return
                page_no += 1
def _updated_before(raw: dict[str, Any], since: datetime) -> bool:
upd = raw.get("updated")
if not upd:
return False
try:
ts = datetime.fromisoformat(upd.replace("Z", "+00:00"))
except ValueError:
return False
if ts.tzinfo is None:
ts = ts.replace(tzinfo=UTC)
return ts < since
def _parse_date(value: Any) -> date | None:
if not value:
return None
if isinstance(value, date):
return value
text = str(value).strip()
if not text:
return None
try:
return date.fromisoformat(text[:10])
except ValueError:
return None
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
if not raw:
return None
parent = raw.get("parent") or {}
return RawStudio(
external_id=raw.get("id"),
name=raw.get("name") or "Unknown",
slug=None,
parent_external_id=parent.get("id"),
parent_name=parent.get("name"),
)
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
perf = raw.get("performer") or {}
name = perf.get("name")
if not name:
return None
aliases = perf.get("aliases") or []
if isinstance(aliases, str):
aliases = [a.strip() for a in aliases.split(",") if a.strip()]
bd_obj = perf.get("birthdate") or {}
bd = bd_obj.get("date") if isinstance(bd_obj, dict) else None
return RawPerformer(
external_id=perf.get("id"),
name=name,
aliases=[a for a in aliases if isinstance(a, str)],
gender=(perf.get("gender") or "").lower() or None,
birth_date=_parse_date(bd),
country=perf.get("country"),
as_alias_in_scene=raw.get("as") if raw.get("as") and raw.get("as") != name else None,
)
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
name = raw.get("name")
if not name:
return None
return RawTag(external_id=raw.get("id"), name=name, slug=None)
def _parse_fingerprint(raw: dict[str, Any]) -> RawFingerprint | None:
h = raw.get("hash")
algo = (raw.get("algorithm") or "").lower()
if not h or algo not in {"phash", "oshash", "md5"}:
return None
return RawFingerprint(kind=algo, value=h)
def _extract_cross_refs(urls: list[dict[str, Any]] | None) -> dict[str, str]:
"""Z `scene.urls` wyciąga znane cross-source ID-ki, np. tpdb_id.
Returns: dict[source_name, external_id]. Source name ma być stabilne
(lower, np. 'tpdb' / 'theporndb').
"""
out: dict[str, str] = {}
for u in urls or []:
url = u.get("url") or ""
site_name = ((u.get("site") or {}).get("name") or "").strip().lower()
if not url:
continue
# ThePornDB: .../scenes/<uuid>
if "theporndb" in site_name or "porndb" in url.lower():
m = _UUID_RE.search(url)
if m:
out["tpdb"] = m.group(0)
return out
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert a StashDB scene object into a ``RawScene``.

    Returns None, after logging a warning, when the scene has no id or no
    title. Performers, tags and fingerprints that fail to parse are dropped.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not scene_id or not scene_title:
        log.warning("stashdb scene without id/title — skipping")
        return None

    performers = [
        performer
        for performer in map(_parse_performer, raw.get("performers") or [])
        if performer is not None
    ]
    tags = [
        tag
        for tag in map(_parse_tag, raw.get("tags") or [])
        if tag is not None
    ]
    fingerprints = [
        fingerprint
        for fingerprint in map(_parse_fingerprint, raw.get("fingerprints") or [])
        if fingerprint is not None
    ]
    cross_refs = _extract_cross_refs(raw.get("urls"))
    released = _parse_date(raw.get("release_date") or raw.get("date"))
    duration = raw.get("duration")

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("details"),
        release_date=released,
        duration_sec=int(duration) if duration else None,
        code=raw.get("code"),
        director=raw.get("director"),
        url=None,
        studio=_parse_studio(raw.get("studio")),
        performers=performers,
        tags=tags,
        fingerprints=fingerprints,
        cross_source_refs=cross_refs,
        raw=raw,
    )

329
app/connectors/tpdb.py Normal file
View file

@ -0,0 +1,329 @@
"""ThePornDB REST connector.
API: https://api.theporndb.net (auth: Bearer token)
Scene list: GET /scenes?per_page=200&page=N&date={YYYY-MM-DD} (delta filter)
Format: {data: [...], meta: {current_page, last_page, per_page, total}}
TPDB scenes already come with expanded performers (`performers[]`), studio (`site`) and tags (`tags[]`).
A single GET /scenes is therefore enough for the MVP — no separate per-performer requests are needed.
Performer format inside a scene:
- performer.id — ID of the performer↔scene assignment (do NOT use for dedup)
- performer.name — name used in this particular scene (may be an alias, e.g. "Mia M.")
- performer.parent.id — canonical performer UUID in TPDB → external_id
- performer.parent.name / .extra.gender / .extra.birthday — canonical metadata
Studio format: scene.site = {id, name, slug, parent: {...}, network: {...}}
"""
from __future__ import annotations
import logging
from collections.abc import Iterator
from datetime import date, datetime
from typing import Any
import httpx
from tenacity import (
retry,
retry_if_exception,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
def _is_retryable_http_error(exc: BaseException) -> bool:
    """Tell tenacity whether ``exc`` is worth another attempt.

    Retried: transport errors, HTTP 5xx and HTTP 429. Other 4xx responses
    (404/422 are permanent) are not retried.

    401/403 are deliberately not retried here: a TPDB token expiry would have
    to be handled as an auth refresh (TODO if it ever starts happening). The
    token currently expires about once a year, so it is not worth the effort.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        status = exc.response.status_code
        return status >= 500 or status == 429
    return isinstance(exc, httpx.TransportError)
from app.config import get_settings
from app.connectors.base import (
BaseConnector,
RawPerformer,
RawScene,
RawStudio,
RawTag,
)
from app.models.source import SourceKind
log = logging.getLogger(__name__)
class TPDBConnector(BaseConnector):
    """Connector for the ThePornDB REST API (scene listings and performer lookup)."""

    kind = SourceKind.tpdb
    name = "tpdb"

    def __init__(
        self,
        *,
        token: str | None = None,
        base_url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Configure the connector; unset values fall back to application settings.

        Args:
            token: API bearer token; defaults to ``settings.tpdb_api_token``.
            base_url: API root; defaults to ``settings.tpdb_base_url``. A
                trailing slash is stripped.
            per_page: page size requested from list endpoints.
            timeout: per-request timeout handed to the HTTP client.

        Raises:
            RuntimeError: when no token is passed and none is configured.
        """
        settings = get_settings()
        self.token = token or settings.tpdb_api_token
        if not self.token:
            raise RuntimeError("TPDB_API_TOKEN is not set")
        self.base_url = (base_url or settings.tpdb_base_url).rstrip("/")
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a new HTTP client preconfigured with auth and JSON headers."""
        return httpx.Client(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.token}",
                "Accept": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        retry=retry_if_exception(_is_retryable_http_error),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _get(self, client: httpx.Client, path: str, params: dict[str, Any]) -> dict[str, Any]:
        """GET ``path`` and return the decoded JSON body.

        Retried by tenacity (up to 5 attempts, exponential wait) for errors
        that ``_is_retryable_http_error`` accepts; the last error is re-raised.
        """
        resp = client.get(path, params=params)
        if resp.status_code == 429:
            # let tenacity retry — but raise something it knows
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        return resp.json()

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes from ``GET /scenes``.

        Args:
            since: when given, its calendar date is sent as the ``date`` query
                parameter.
            limit: maximum number of scenes to yield; ``None`` means no cap.
        """
        params: dict[str, Any] = {"per_page": self.per_page}
        if since is not None:
            params["date"] = since.date().isoformat()
        yield from self._paginate_scenes(params, limit=limit)

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all TPDB scenes for the performer with the given canonical ID.

        TPDB API: ``GET /performers/<id>/scenes`` — a dedicated endpoint.
        (Other variants are broken: ``/scenes?performers[]=<uuid>`` always
        returns total=0 and ``/scenes?performer_id=<uuid>`` answers 422.)

        404 means the performer was removed from TPDB (e.g. b959ccbb on
        2026-05-16, Sentry GOON-N). The exception used to bubble up to
        scheduler.performer_driven and fail the whole run; now it is logged as
        a warning and the generator simply ends, so the caller sees 0 scenes
        and moves on to the next performer. Other HTTP errors are re-raised.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/performers/{performer_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb performer %s removed (404) — skipping",
                    performer_external_id,
                )
                return
            raise

    def fetch_scenes_for_site(
        self,
        site_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all TPDB scenes for the site/studio with the given ID.

        TPDB API: ``GET /sites/<id>/scenes`` — a dedicated endpoint analogous
        to ``/performers/<id>/scenes``. Without a ``limit`` the full set
        reported in ``meta.total`` is walked (Brazzers=272, Naughty
        America=631 at the time of writing).

        404 means the site was removed from TPDB; it is handled the same way
        as in ``fetch_scenes_for_performer`` (warning, empty result).
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/sites/{site_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb site %s removed (404) — skipping",
                    site_external_id,
                )
                return
            raise

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look a performer up via ``GET /performers?q=<name>``.

        Returns the ID of the exact (case-insensitive) name match when there
        is one, otherwise the ID of the first result. Returns None when the
        request fails with an HTTP status error, when there are no results, or
        when the chosen result has no ID.
        """
        with self._client() as client:
            try:
                payload = self._get(client, "/performers", {"q": name, "per_page": 5})
            except httpx.HTTPStatusError as e:
                log.warning("tpdb /performers q=%s failed: %s", name, e)
                return None
        data = payload.get("data") or []
        if not data:
            return None
        # Normalize the needle once instead of on every loop iteration.
        wanted = name.strip().lower()
        for item in data:
            # an exact (case-insensitive) match is preferred; fall back to the first hit
            if (item.get("name") or "").strip().lower() == wanted:
                return str(item.get("id")) if item.get("id") else None
        first = data[0]
        return str(first.get("id")) if first.get("id") else None

    def _paginate_scenes(
        self,
        params: dict[str, Any],
        *,
        limit: int | None,
        path: str = "/scenes",
    ) -> Iterator[RawScene]:
        """Walk a paginated scene endpoint and yield parsed scenes.

        Args:
            params: base query parameters. The dict is copied, so the caller's
                object is never modified.
            limit: maximum number of scenes to yield; ``None`` means no cap. A
                value of zero or less yields nothing and performs no request.
            path: endpoint path relative to the base URL.

        Iteration ends on an empty ``data`` page or once ``meta.last_page`` is
        reached; a missing ``last_page`` is treated as "this is the last page".
        """
        # Previously limit=0 still fetched a page and emitted one scene.
        if limit is not None and limit <= 0:
            return
        # Copy so that adding "page" does not leak into the caller's dict.
        query = dict(params)
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                query["page"] = page
                payload = self._get(client, path, query)
                data = payload.get("data") or []
                if not data:
                    return
                for raw in data:
                    scene = _parse_scene(raw)
                    if scene is None:
                        continue
                    yield scene
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return
                meta = payload.get("meta") or {}
                last_page = meta.get("last_page") or page
                if page >= last_page:
                    return
                page += 1
def _parse_date(value: Any) -> date | None:
if not value:
return None
if isinstance(value, date):
return value
text = str(value).strip()
if not text:
return None
# TPDB dates: "YYYY-MM-DD" lub ISO datetime
try:
return date.fromisoformat(text[:10])
except ValueError:
return None
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
if not raw:
return None
parent = raw.get("parent") or {}
network = raw.get("network") or {}
return RawStudio(
external_id=str(raw["id"]) if raw.get("id") is not None else None,
name=raw.get("name") or "Unknown",
slug=raw.get("short_name") or raw.get("slug"),
parent_external_id=str(parent["id"]) if parent.get("id") is not None else None,
parent_name=parent.get("name"),
network=network.get("name") if isinstance(network, dict) else None,
homepage_url=raw.get("url") or raw.get("home"),
)
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
parent = raw.get("parent") or {}
extra = parent.get("extras") or parent.get("extra") or {}
canonical_id = parent.get("id") or raw.get("id")
canonical_name = parent.get("name") or raw.get("name")
if not canonical_name:
return None
aliases_field = parent.get("aliases") or extra.get("aliases") or []
if isinstance(aliases_field, str):
aliases = [a.strip() for a in aliases_field.split(",") if a.strip()]
else:
aliases = [a for a in aliases_field if isinstance(a, str)]
return RawPerformer(
external_id=str(canonical_id) if canonical_id is not None else None,
name=canonical_name,
aliases=aliases,
gender=(extra.get("gender") or parent.get("gender") or "").lower() or None,
birth_date=_parse_date(extra.get("birthday")),
country=extra.get("birthplace") or extra.get("country"),
as_alias_in_scene=raw.get("name") if raw.get("name") != canonical_name else None,
)
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
name = raw.get("name")
if not name:
return None
return RawTag(
external_id=str(raw["id"]) if raw.get("id") is not None else None,
name=name,
slug=raw.get("slug"),
)
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert a TPDB scene object into a ``RawScene``.

    Returns None, after logging a warning that lists the first few keys, when
    the scene has no id or no title. Performers and tags that fail to parse
    are dropped. No fingerprints are produced for this source.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not scene_id or not scene_title:
        log.warning("tpdb scene without id/title — skipping (keys=%s)", list(raw)[:8])
        return None

    performers: list[RawPerformer] = [
        performer
        for performer in map(_parse_performer, raw.get("performers") or [])
        if performer is not None
    ]
    tags: list[RawTag] = [
        tag
        for tag in map(_parse_tag, raw.get("tags") or [])
        if tag is not None
    ]
    duration = raw.get("duration")

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("description"),
        release_date=_parse_date(raw.get("date")),
        duration_sec=int(duration) if duration else None,
        code=raw.get("external_id"),
        director=raw.get("director"),
        url=raw.get("url"),
        studio=_parse_studio(raw.get("site")),
        performers=performers,
        tags=tags,
        fingerprints=[],  # TPDB does not publish pHashes on the main endpoint
        raw=raw,
    )

35
app/db.py Normal file
View file

@ -0,0 +1,35 @@
from collections.abc import Iterator
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker
from app.config import get_settings
# Settings are resolved once, when this module is first imported.
_settings = get_settings()
# Process-wide SQLAlchemy engine built from the configured database URL.
# pool_pre_ping asks the pool to test a connection before handing it out.
engine = create_engine(
    _settings.database_url,
    pool_pre_ping=True,
    future=True,
)
# Session factory bound to the engine above. Sessions do not autoflush, and
# loaded objects are not expired on commit (expire_on_commit=False).
SessionLocal = sessionmaker(bind=engine, autoflush=False, expire_on_commit=False, future=True)
@contextmanager
def session_scope() -> Iterator[Session]:
    """Provide a transactional session around a ``with`` block.

    The session is committed when the block finishes normally. If the block
    or the commit raises an ``Exception``, the session is rolled back and the
    error is re-raised. The session is closed in every case.
    """
    db = SessionLocal()
    try:
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
def get_session() -> Iterator[Session]:
    """Generator form of ``session_scope``: yield one session, then finalize it.

    Commit, rollback and close are all delegated to ``session_scope``.
    """
    with session_scope() as db:
        yield db

157
app/extractors/__init__.py Normal file
View file

@ -0,0 +1,157 @@
"""Stream URL extractors per-tube.
Public API:
- `try_extract(sitetag, page_url) -> list[StreamSource] | None`
- `StreamSource` (dataclass)
- `HosterDead` (exception)
- `extract_stream_from_hoster(iframe_url, *, referer)` — generic packer-based hoster extract
- `fetch_tube_html(url)` — Chrome TLS fingerprint fetch (curl_cffi)
- `browser_get(url)` — low-level
Architecture: every tube has its own module `app.extractors.tubes.<tube>` that exports
`extract(page_url) -> list[StreamSource] | None`. The registry below maps sitetag →
module-level extractor. `try_extract()` is a thin wrapper that adds exception handling.
Since the porn-app dependency was removed, this module is the only mechanism for resolving
streams — playback.py no longer falls back to the porn-app /stream API.
"""
from __future__ import annotations
import logging
from collections.abc import Callable
from app.extractors._fetch import browser_get, fetch_tube_html
from app.extractors._models import HosterDead, StreamSource, TubePageError
from app.extractors.hoster import extract_stream_from_hoster, unpack_packer
from app.extractors.tubes import (
_embed_iframe,
_vps_blocked_fallback,
_ytdlp,
eporner,
freshporno,
hqporner,
latestpornvideo,
paradisehill,
porn00,
pornhat,
pornxp,
sxyprn,
)
log = logging.getLogger(__name__)
# Sitetag → extractor function. The sitetag matches the format used in origin:
# `pornapp:<sitetag>` (or, after the Phase 2 migration, `tube:<sitetag>`).
#
# Mainstream tubes (pornhub/xvideos/xnxx/xhamster/redtube/youporn/porntrex) use
# yt-dlp as the extractor — battle-tested and updated upstream whenever the HTML
# changes. Aggregator tubes (xmoviesforyou/watchporn/siska/...) use the generic
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom code
# exists only where a tube has a non-standard scheme (eporner XHR, sxyprn URL transform).
# NOTE(review): xhamster and porntrex are listed above as yt-dlp tubes, but the
# mapping below routes them to _vps_blocked_fallback — the paragraph looks stale.
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
    # Custom (optimized / non-standard player)
    # hqporner — the CDN URL (bigcdn.cc, video.flyflv.com with an `ip=` parameter) is
    # IP-bound to the requester. Resolving on the VPS gives 200, but mobile direct gets
    # 404/403. Switched to the WebView fallback: mobile fetches the embed iframe
    # (mydaddy.cc/hqwo.cc) from the phone IP and FluidPlayer JS decodes the mp4 URL in
    # the mobile session. INJECTED_JS additionally scans `<source>.src`.
    # ~32k scenes (second-largest single saving after porntrex). Verified 2026-05-18.
    "hqpornercom": _vps_blocked_fallback.extract,
    "epornercom": eporner.extract,
    "sxyprncom": sxyprn.extract,
    # Mainstream tubes — yt-dlp
    # NB: a cross-IP test on 2026-05-18 confirmed that xvideos/xnxx/pornhub/youporn/redtube
    # CDN URLs are **time-bound** (not IP-bound) — the mobile_direct_ok auto-detect in
    # playback.py results in a mobile direct fetch, zero VPS bandwidth.
    "pornhubcom": _ytdlp.extract,
    "redtubecom": _ytdlp.extract,
    "xvideoscom": _ytdlp.extract,
    "xnxxcom": _ytdlp.extract,
    "youporncom": _ytdlp.extract,
    # porntrex KVS get_file — `kt_ips=<vps_ip>` cookie + single-use token (410 on reuse).
    # The CDN is IP-bound to the VPS, mobile direct = 403. Switched to _vps_blocked_fallback:
    # mobile WebView from the phone IP → KVS player JS decodes video.src → INJECTED_JS scrape.
    # 137k scenes kept off the VPS bandwidth (largest single saving).
    "porntrexcom": _vps_blocked_fallback.extract,
    # VPS-blocked tubes — KVS / Cloudflare blocks the Hetzner IP, but they work from a
    # residential IP (confirmed with Chrome DevTools MCP on 2026-05-15). Mobile WebView +
    # INJECTED_JS (PlayerScreen.tsx:805) scans <video>.src + XHR — it catches the URL once
    # the player JS has decoded it.
    "xhamstercom": _vps_blocked_fallback.extract,
    "porndittcom": _vps_blocked_fallback.extract,
    "fpoxxx": _vps_blocked_fallback.extract,
    "sxylandcom": _vps_blocked_fallback.extract,
    # Aggregator tubes — generic embed-iframe → hoster unpacker
    "latestpornvideocom": latestpornvideo.extract,
    "xmoviesforyoucom": _embed_iframe.extract,
    "watchporn": _embed_iframe.extract,
    "siskavideo": _embed_iframe.extract,
    "porn4dayspw": _embed_iframe.extract,
    "porndishcom": _embed_iframe.extract,
    # xxxfreewatch — DELISTED 2026-05-18. 790 solo-orphan scenes, 0% match, CF-walled from the VPS.
    "latestleaksco": _embed_iframe.extract,
    "mypornerleakcom": _embed_iframe.extract,
    # PornHat — dedicated extractor: only `<source>` from the player area (skips sidebar
    # trailer URLs `_preview*.mp4`), deduplicated by filename. get_file 302 → CDN; the proxy
    # needs follow_redirects=True (fixed in stream_proxy.py).
    "pornhatcom": pornhat.extract,
    # Freshporno KVS — the `cv=` HMAC-signed token is IP-bound. Server-side resolve returned
    # 200 from the VPS, but a laptop got 302 + SSL error → the token validates the requester IP.
    # Switched to the WebView fallback: mobile fetches the embed page, the KVS player decodes
    # video_url in-page, and ExoPlayer gets the URL from the phone session. ~15k scenes.
    "freshpornoorg": _vps_blocked_fallback.extract,
    # porn00 / pornxp — previously force_proxy=True outright (IP-bound CDN). Switched to the
    # WebView fallback. Low volume (84 scenes): trivial saving, but keeps the flow consistent.
    "porn00org": _vps_blocked_fallback.extract,
    "pornxpph": _vps_blocked_fallback.extract,
    # Direct-scraping tubes (they also have a search scraper in connectors/direct_scrapers/)
    # — they use the same embed-iframe pattern for streaming.
    # hdporn92com — DELISTED 2026-05-18. Scene pages are an SEO shell without a player iframe,
    # and JS hijacks clicks into a popunder. All playback_sources were mass-marked dead.
    # 0dayxx wraps the watchporn.to embed. The watchporn.to/get_file/ token is IP-bound
    # (302→410 cross-IP). Switched to the WebView fallback. ~5k scenes.
    "0dayxxcom": _vps_blocked_fallback.extract,
    # CF-protected tube — curl_cffi in fetch_tube_html bypasses JA3; embed-iframe pattern.
    "perverzijacom": _embed_iframe.extract,
    # Special: WebView-only (Yii2 session-bound player).
    "paradisehillcc": paradisehill.extract,
}
def try_extract(sitetag: str, page_url: str) -> list[StreamSource] | None:
    """Try to resolve stream URLs for the given tube and page URL.

    Returns:
        A list of ``StreamSource`` (different qualities/containers), or None
        when no extractor is registered for ``sitetag``, when the extractor
        returned None, or when it failed with an unexpected exception (logged
        as a warning).

    Raises:
        HosterDead: the embed page explicitly says the video is deleted or
            not found — the caller (playback.py) catches it and marks
            playback_source.dead_at.
        TubePageError: propagated unchanged from the extractor.
    """
    if (handler := _REGISTRY.get(sitetag)) is None:
        return None
    try:
        sources = handler(page_url)
    except (HosterDead, TubePageError):
        # Callers act on these; let them through untouched.
        raise
    except Exception as e:
        log.warning("extractor for %s failed on %s: %s", sitetag, page_url, e)
        return None
    return sources
def supported_sitetags() -> tuple[str, ...]:
    """Return the sitetags that have a registered extractor, in registry order."""
    return tuple(_REGISTRY)
# Names exported by `from app.extractors import *`.
__all__ = [
    "try_extract",
    "supported_sitetags",
    "StreamSource",
    "HosterDead",
    "TubePageError",
    "extract_stream_from_hoster",
    "unpack_packer",
    "fetch_tube_html",
    "browser_get",
]

120
app/extractors/_fetch.py Normal file
View file

@ -0,0 +1,120 @@
"""Browser-impersonation HTTP fetcher dla tube'ów blokujących Pythonowy TLS fingerprint.
Niektóre Cloudflare-fronted tube'y (np. perverzija) blokują httpx na podstawie JA3
TLS hash (charakterystycznego dla Pythonowego stacka), zwracając 403 nawet z dobrym
UA + Referer. `curl_cffi` używa libcurl + skompilowanej wersji TLS lib z prawdziwego
Chrome'a, dzięki czemu ja3 hash jest identyczny jak browser → CF wpuszcza.
Fallback na httpx tylko gdy curl_cffi nie zainstalowany (zachowujemy backwards-compat
w razie problemów z buildem libcurl-impersonate).
"""
from __future__ import annotations
import logging
from collections.abc import Mapping
from dataclasses import dataclass
from urllib.parse import urlparse
import httpx
from app.extractors._models import TubePageError
log = logging.getLogger(__name__)
# curl_cffi is optional: when the import fails, browser_get() falls back to httpx
# and a warning is logged once at import time.
try:
    from curl_cffi import requests as _cf_requests  # type: ignore[import-not-found]
    _HAS_CURL_CFFI = True
except ImportError:  # pragma: no cover
    _HAS_CURL_CFFI = False
    log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")
# Default curl_cffi impersonation target used by browser_get().
# NOTE(review): this targets chrome120 while _DEFAULT_UA below advertises
# Chrome/140 — confirm the version mismatch is intentional.
_DEFAULT_IMPERSONATE = "chrome120"
# Desktop Chrome User-Agent sent by fetch_tube_html().
_DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
@dataclass
class FetchResult:
    """Minimal response-like object — a stand-in for httpx.Response in our use cases."""

    status_code: int  # HTTP status of the final response
    text: str  # decoded response body
    url: str  # final URL as reported by the HTTP client

    def raise_for_status(self) -> None:
        """Raise ``TubePageError`` for a 4xx/5xx status; do nothing otherwise."""
        if not 400 <= self.status_code < 600:
            return
        raise TubePageError(self.status_code, self.url)
def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
) -> FetchResult:
    """GET ``url`` with a Chrome TLS fingerprint (curl_cffi).

    Falls back to httpx when curl_cffi is not installed; ``impersonate`` is
    ignored in that case.

    Returns:
        A ``FetchResult`` with the status code, body text and final URL. No
        exception is raised for HTTP error statuses.
    """
    request_headers = dict(headers or {})
    if _HAS_CURL_CFFI:
        reply = _cf_requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            impersonate=impersonate,
            allow_redirects=follow_redirects,
        )
        return FetchResult(status_code=reply.status_code, text=reply.text, url=str(reply.url))

    with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as http:
        reply = http.get(url, headers=request_headers)
        return FetchResult(status_code=reply.status_code, text=reply.text, url=str(reply.url))
def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome UA, retrying transient failures.

    Standalone replacement for ``PornAppClient.fetch_tube_html``. Uses
    ``browser_get`` (curl_cffi) to get past JA3 fingerprint blocks on
    CF-fronted tubes.

    Retry policy: exceptions from ``browser_get``, 5xx responses and 200
    responses with a body shorter than 500 characters are retried up to
    ``max_retries`` times. The sleep before each retry is 0.5 s times the
    attempt number (0.5 s, then 1 s with the default). Without this, sites
    such as freshporno that occasionally answer 503/empty surfaced a
    transient hiccup to the user as "extractor None".

    Returns:
        The response body. A short 200 body is returned as-is once the
        retries are exhausted.

    Raises:
        TubePageError: the final response has a status >= 400.
        Exception: whatever ``browser_get`` raised on the last attempt.
    """
    import time as _time

    request_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "x-site": urlparse(url).hostname or "",
    }
    total_attempts = max_retries + 1
    last_err: Exception | None = None
    for attempt in range(total_attempts):
        can_retry = attempt < max_retries
        pause = 0.5 * (attempt + 1)
        try:
            resp = browser_get(url, headers=request_headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            last_err = e
            log.info("fetch_tube_html attempt %d/%d for %s: %s", attempt + 1, max_retries + 1, url, e)
            if not can_retry:
                raise
            _time.sleep(pause)
            continue

        status = resp.status_code
        body = resp.text
        # Transient: a 5xx server error, or a near-empty 200 (CDN cache miss).
        transient = 500 <= status < 600 or (status == 200 and len(body) < 500)
        if transient and can_retry:
            log.info("fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                     url, attempt + 1, max_retries + 1, status, len(body))
            _time.sleep(pause)
            continue
        if status >= 400:
            raise TubePageError(status, url)
        return body

    # Only reached when the loop body never ran (negative max_retries).
    if last_err:
        raise last_err
    raise TubePageError(0, url)
# Exported names; _DEFAULT_UA is listed too even though it is underscore-prefixed.
__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]

48
app/extractors/_models.py Normal file
View file

@ -0,0 +1,48 @@
"""Stream source DTO + wspólne wyjątki extractorów."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
@dataclass
class StreamSource:
    """A single resolved stream URL.

    Maps onto `StreamLink` in the playback API (api/playback.py): `link` → `stream_url`,
    `quality` → `quality`, `type` → `type`.

    `referer` is an optional override for the Referer used by stream_proxy. Some
    CDNs (KVS-style watchporn.to, fpo.xxx, etc.) return 410/403 when the Referer does
    not match the *embed page* (e.g. the proxy sends `Referer: 0dayxx.com` but the CDN
    expects `Referer: watchporn.to`). When None, the caller (playback.py) uses `page_url`.
    """
    link: str
    quality: str | None = None
    type: str | None = None  # 'mp4' | 'm3u8' | 'mpd' | 'hoster'
    raw: dict[str, Any] | None = None
    referer: str | None = None
class HosterDead(Exception):
    """The hoster embed page says the video is deleted or does not exist.

    The caller in playback.py catches this and sets `playback_source.dead_at`.
    """
class TubePageError(Exception):
    """Fetching a tube page returned an HTTP error (404/410/5xx).

    The caller (playback.py) may set dead_at on 404/410. ``status_code`` and
    ``url`` are kept as attributes so the caller does not have to parse the
    message string.
    """

    def __init__(self, status_code: int, url: str):
        self.status_code = status_code
        self.url = url
        message = f"HTTP {status_code} for {url}"
        super().__init__(message)
# Public names of this module: the stream DTO and the two extractor exceptions.
__all__ = ["StreamSource", "HosterDead", "TubePageError"]

View file

@ -0,0 +1,91 @@
"""Universal duration extractor for tube pages.
Direct scrapery (xvideos, xnxx, youporn, porntrex, ) search-only pobierają
listing i wycioskują tylko URL + slug-as-title. Duration pojawia się dopiero na
detail page i jest dostępne w jednym z patternów:
1. **OpenGraph numeric** (youporn, redtube, eporner):
`<meta property="og:video:duration" content="992">` sekundy.
2. **OpenGraph ISO 8601** (rzadkie):
`<meta property="og:video:duration" content="PT16M32S">`.
3. **Schema.org VideoObject LD-JSON** (xvideos, xnxx, KVS-based):
`"duration": "PT00H07M10S"` w JSON-LD `<script type="application/ld+json">`.
4. **itemprop microdata** (sxyland, 0dayxx, niektóre WordPress):
`<meta itemprop="duration" content="P0DT0H21M13S">` ISO 8601 z opcjonalnym
`P<days>D` prefix + opcjonalnym `T` blokiem HMS.
Funkcja zwraca pierwszy znaleziony match jako int seconds, lub None.
"""
from __future__ import annotations
import re
_OG_DURATION_RE = re.compile(
r'<meta\s+property="(?:og:(?:video:)?|video:)duration"\s+content="([^"]+)"',
re.IGNORECASE,
)
_LD_DURATION_RE = re.compile(r'"duration"\s*:\s*"(P[0-9DTHMS]+)"', re.IGNORECASE)
_ITEMPROP_DURATION_RE = re.compile(
r'itemprop="duration"[^>]*content="([^"]+)"', re.IGNORECASE
)
# Hqporner-style meta description: "Video duration is 6min 55sec" lub "1h 23min 5sec".
# Generic — pasuje też do innych tube'ów które dorzucają w meta opis duration prozą.
_META_DESC_DURATION_RE = re.compile(
r'(?:duration\s+is\s+|<meta\s+name="description"\s+content="[^"]*duration\s+is\s+)'
r'(?:(\d+)\s*h(?:our)?s?)?\s*(?:(\d+)\s*min)?\s*(?:(\d+)\s*sec)?',
re.IGNORECASE,
)
# Generalized ISO 8601: P[<n>D][T[<n>H][<n>M][<n>S]]. Pokrywa `PT16M32S`,
# `PT00H07M10S`, `P0DT0H21M13S` jednocześnie. Dni są rzadko sensowne (>24h scena),
# ale zachowujemy bo niektóre tube'y wpisują P0D dla porządku.
_ISO_DURATION_RE = re.compile(
r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$", re.IGNORECASE
)
def _parse_iso8601(value: str) -> int | None:
"""`P0DT0H21M13S` → 1273, `PT00H07M10S` → 430. None gdy format niepasujący
LUB total == 0 (sygnał placeholder bez duration)."""
m = _ISO_DURATION_RE.match(value.strip())
if not m:
return None
d, h, mi, s = (int(g) if g else 0 for g in m.groups())
total = d * 86400 + h * 3600 + mi * 60 + s
return total if total > 0 else None
def extract_duration_sec(html: str) -> int | None:
"""Zwraca duration w sekundach lub None gdy żaden wzorzec nie pasuje.
Kolejność: OG numeric OG ISO LD-JSON ISO itemprop ISO. Pierwsze pasujące
z `total > 0` wygrywa.
"""
if not html:
return None
if (m := _OG_DURATION_RE.search(html)):
v = m.group(1).strip()
if v.isdigit():
n = int(v)
if n > 0:
return n
if v.upper().startswith("P") and (parsed := _parse_iso8601(v)) is not None:
return parsed
if (m := _LD_DURATION_RE.search(html)):
if (parsed := _parse_iso8601(m.group(1))) is not None:
return parsed
if (m := _ITEMPROP_DURATION_RE.search(html)):
v = m.group(1).strip()
if v.upper().startswith("P") and (parsed := _parse_iso8601(v)) is not None:
return parsed
# Hqporner: "Video duration is 6min 55sec" w meta description.
if (m := _META_DESC_DURATION_RE.search(html)):
h, mi, s = (int(g) if g else 0 for g in m.groups())
total = h * 3600 + mi * 60 + s
if total > 0:
return total
return None

343
app/extractors/hoster.py Normal file
View file

@ -0,0 +1,343 @@
"""Generic hoster (StreamWish/doodporn/mixdrop/filemoon/luluvdo) stream URL extractor.
Hostery embed-page'y stosują JWPlayer + P.A.C.K.E.R. obfuskację:
eval(function(p,a,c,k,e,d){...}('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'),...))
i chowają `sources: [{file: "https://...m3u8"}]` w packed JS.
Tu jest:
- `unpack_packer(js)` dekoder P.A.C.K.E.R.
- `extract_stream_from_hoster(iframe_url, *, referer)` fetch embed unpack m3u8/mp4
Te funkcje używane przez:
1. Per-tube extractors (latestpornvideo, hqporner fallback) page embed iframe tu
2. Movies playback (api/playback.py movies_router) direct hoster URL tu
Nie ma już zależności od PornAppClient / porn-app API.
"""
from __future__ import annotations
import logging
import re
from app.extractors._fetch import _DEFAULT_UA, browser_get
from app.extractors._models import HosterDead
log = logging.getLogger(__name__)
# P.A.C.K.E.R. JavaScript unpacker — reverses obfuscation of the form:
#   eval(function(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(...);return p}
#   ('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'), 0, {}))
# StreamWish, doodporn, mixdrop and filemoon all use this packer to hide
# `sources: [{file: "https://...m3u8"}]` inside the JWPlayer config.
# Capture groups: 1 = payload, 2 = base, 3 = keyword count, 4 = '|'-joined keywords.
_PACKER_ARGS_RE = re.compile(
    r"\}\s*\(\s*'((?:\\'|[^'])+)'\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*'((?:\\'|[^'])*)'\s*\.split\('\|'\)",
    re.DOTALL,
)
def _base_n(token: str, base: int) -> int | None:
"""Parsuje token jako liczbę w bazie 'base' (max 62 dla a-zA-Z0-9)."""
try:
result = 0
for ch in token:
if ch.isdigit():
d = ord(ch) - ord("0")
elif "a" <= ch <= "z":
d = ord(ch) - ord("a") + 10
elif "A" <= ch <= "Z":
d = ord(ch) - ord("A") + 36
else:
return None
if d >= base:
return None
result = result * base + d
return result
except Exception:
return None
def unpack_packer(js: str) -> str | None:
    """Unpack P.A.C.K.E.R.-obfuscated JS; return None when the pattern is absent.

    Every word in the payload is read as a base-N index into the keyword
    table and replaced by the keyword at that index. Words that are not valid
    indices, or whose keyword is empty, are left unchanged.
    """
    found = _PACKER_ARGS_RE.search(js)
    if found is None:
        return None

    packed, radix_text, count_text, keyword_blob = found.groups()
    radix = int(radix_text)
    total = int(count_text)
    words = keyword_blob.split("|")
    # Undo the JS string-literal escaping of the payload.
    packed = packed.replace("\\'", "'").replace('\\"', '"').replace("\\\\", "\\")

    def _substitute(hit: re.Match[str]) -> str:
        word = hit.group(0)
        position = _base_n(word, radix)
        if position is not None and position < total and position < len(words):
            # An empty keyword means "keep the original token".
            return words[position] or word
        return word

    return re.sub(r"\b\w+\b", _substitute, packed)
# Matches a `file` / `source(s)` key followed by `:` or `=` and a quoted http(s) URL
# that contains an .m3u8, .mp4 or .mpd extension. Group 1 is the URL.
_HOSTER_FILE_RE = re.compile(
    r'(?:["\']?file["\']?|sources?)\s*[:=]\s*["\'](https?://[^"\']+\.(?:m3u8|mp4|mpd)[^"\']*)["\']',
    re.IGNORECASE,
)
# Ad-rolls embedded w player config (xtremestream.xyz, niektóre KVS forki).
# Bez filtra extractor wracał preroll.mp4 jako "scena" → user widział 20s reklamy
# zamiast filmu (zgłoszone 2026-05-10, bug-report #30c4d3cf perverzija).
# Pattern obejmuje typowe nazwy ad-rolli + CDN-y które serwują reklamy
# (opencdn.b-cdn.net to bunnycdn alias dla reklam).
_AD_VIDEO_RE = re.compile(
r"/(?:preroll|midroll|postroll|preplay|ads?|advert|promo)\d*\.(?:mp4|m3u8|webm)"
r"|opencdn\.b-cdn\.net/video/(?:pre|mid|post|ad)",
re.IGNORECASE,
)
def _looks_like_ad(url: str) -> bool:
return bool(_AD_VIDEO_RE.search(url))
# Some hosters (doodporn) hide the mp4/m3u8 in a dictionary of variables and refer
# to it as `sources: [{file: links.hls2}]`, which `_HOSTER_FILE_RE` cannot catch.
# The second pass takes the first `.m3u8|.mp4|.mpd` URL anywhere in the unpacked
# HTML — a heuristic, but the first such URL is usually the video master playlist.
_HOSTER_FALLBACK_URL_RE = re.compile(
    r'https?://[^\s"\'<>]+\.(?:m3u8|mp4|mpd)(?:\?[^\s"\'<>]*)?',
    re.IGNORECASE,
)
# "Video not found" / "deleted" signatures that hosters put into the embed page
# HTML. When one of these markers is present the link is known to be dead: raise
# HosterDead, and the caller in playback.py marks playback_source.dead_at.
# Matching is a case-sensitive substring test, hence the spelled-out case variants.
_HOSTER_DEAD_PATTERNS = (
    "Video not found",
    "video not found",
    "Video Not Found",
    "File was deleted",
    "video is deleted",
    "Video is deleted",
    "This video is no longer available",
)
# KVS (Kernel Video Sharing) player markers — kt_player.js + license_code in the
# HTML. It is used by fpo.xxx, 0day.kim, hdporn92, sxyland and many other
# WordPress-based tubes. KVS encrypts the `function/0/<encrypted>` URL with the
# license_code, so the regex fallback (`_HOSTER_FALLBACK_URL_RE`) would instead
# catch the `event_reporting2` URL (a tracking pixel that returns a 1×1 GIF rather
# than video). When the KVS markers are present we go straight to yt-dlp, whose
# generic extractor decrypts the URL correctly.
_KVS_MARKERS = ("kt_player(", "license_code")
# File hosters / known dead — rapidgator/nitroflare/frdl require a premium account
# (they return an HTML login form instead of video). Return None without fetching:
# the caller in movies playback adds an embed-only fallback and mobile opens a
# WebView anyway (where the user can log in with premium if they want).
# Streamtape was REMOVED from the blacklist on 2026-05-15 — it has a dedicated
# extractor (innerHTML substring decode → /get_video → 302 → tapecontent.net mp4).
# Most of the 12k URLs in our DB are DMCA-dead, but ~5% are still alive.
_FILE_HOSTER_RE = re.compile(
    r"(?:rapidgator|nitroflare|filer\.net|frdl\.[a-z]+|"
    r"streamcrypt\.net|"
    r"openload\.co|openload\.io|oload\.[a-z]+)",  # openload has been offline since 2019
    re.IGNORECASE,
)
def extract_stream_from_hoster(
    iframe_url: str,
    *,
    referer: str,
    timeout: float = 60.0,
) -> str | None:
    """Fetch a hoster embed page and dig the video URL out of it.

    Works for most popular hosters (StreamWish, doodporn, mixdrop, filemoon)
    because they all host JWPlayer with `sources` inside packed JS. Hosters with
    a dedicated extractor are dispatched to it before the generic logic runs.

    Args:
        iframe_url: URL of the hoster embed page.
        referer: Value sent as the `Referer` header on the generic fetch.
        timeout: Timeout handed to every fetch and to the yt-dlp fallback.

    Returns:
        The first .m3u8 / .mp4 / .mpd URL that was found, or None when nothing
        could be extracted.

    Raises:
        HosterDead: when the embed page states outright that the video is
            deleted / not found.
    """
    if _FILE_HOSTER_RE.search(iframe_url) is not None:
        log.debug("hoster %s: file-hoster blacklist (premium-walled), skipping", iframe_url)
        return None

    # --- Per-hoster dedicated extractors (specific URL shapes / decode patterns) ---

    # Mixdrop: P.A.C.K.E.R. → MDCore.wurl, a protocol-relative
    # `//host/v2/<id>.mp4?s=...` URL that the generic `https?://...\.mp4` fallback
    # regex misses because it has no scheme.
    if re.search(r"(?:mixdrop|m1xdrop|mxdrop)\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import mixdrop

        mixdrop_sources = mixdrop.extract(iframe_url, timeout=timeout)
        if mixdrop_sources:
            return mixdrop_sources[0].link
        # Nothing from the dedicated extractor: fall through to the generic logic.

    # Streamtape: four `document.getElementById(...).innerHTML = prefix +
    # (...).substring(N)` assignments, two of which are DECOYS with a broken
    # hostname. The dedicated decoder picks the right one and builds the
    # `/get_video?id=...&token=...` URL.
    if re.search(r"streamtape\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import streamtape

        tape_sources = streamtape.extract(iframe_url, timeout=timeout)
        # Streamtape does its own HosterDead handling; the generic fallback would
        # only fall over here, so there is no fall-through.
        return tape_sources[0].link if tape_sources else None

    # Shared SPA + AES-CBC engine: embedseek/seekplayer/rpmplay/upns/player4me/
    # easyvidplayer all run the same engine (`/api/v1/video` with an AES-CBC
    # encrypted m3u8 source). Together ~159k playback sources in the DB.
    from app.extractors.hosters import seekplayer_engine

    if seekplayer_engine.matches(iframe_url):
        engine_sources = seekplayer_engine.extract(iframe_url, timeout=timeout)
        return engine_sources[0].link if engine_sources else None

    # voe.sx: JS redirect to a random mirror → custom 7-step decoder
    # (ROT13 → strip 7 magic seps → atob → -3 shift → reverse → atob → JSON.parse)
    # → HLS m3u8 + mp4 fallback. ~21k movies.
    is_voe = re.search(
        r"//(?:voe\.sx|"
        r"rebeccasciencestreet\.[a-z]+|"
        r"darnobedienceupscale\.[a-z]+|"
        r"[a-z]+upscale\.com|[a-z]+street\.com)/",
        iframe_url,
        re.IGNORECASE,
    )
    if is_voe:
        from app.extractors.hosters import voe

        voe_sources = voe.extract(iframe_url, timeout=timeout)
        return voe_sources[0].link if voe_sources else None

    # --- Generic path: fetch the embed page ourselves ---
    request_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": referer,
    }
    try:
        response = browser_get(
            iframe_url,
            headers=request_headers,
            timeout=timeout,
            follow_redirects=True,
        )
        response.raise_for_status()
    except Exception as exc:
        log.warning("hoster fetch %s failed: %s", iframe_url, exc)
        return None

    page = response.text
    for dead_marker in _HOSTER_DEAD_PATTERNS:
        if dead_marker in page:
            raise HosterDead(f"hoster {iframe_url} reports video deleted/not found")

    def _pick_clean(regex: re.Pattern[str], haystack: str, group: int = 1) -> str | None:
        """Walk the matches, skipping preroll/ad URLs; first clean one or None."""
        candidates = (hit.group(group) for hit in regex.finditer(haystack))
        return next((c for c in candidates if not _looks_like_ad(c)), None)

    # 1) Direct match in the raw HTML (the hoster did not obfuscate).
    direct = _pick_clean(_HOSTER_FILE_RE, page, 1)
    if direct:
        return direct

    # KVS player → go straight to yt-dlp to avoid the regex fallback, which catches
    # the `event_reporting2` gif-trap URL. yt-dlp's generic extractor decrypts
    # `function/0/<enc>` with the license_code and returns the real
    # `get_file/<N>/...mp4` URL.
    if all(marker in page for marker in _KVS_MARKERS):
        kvs_url = _try_ytdlp_hoster(iframe_url, timeout=timeout)
        if kvs_url and not _looks_like_ad(kvs_url):
            return kvs_url
        log.warning("hoster %s: KVS markers but yt-dlp failed", iframe_url)
        return None

    # 2) Unpack P.A.C.K.E.R. → match on the unpacked text, structurally first, then
    #    fall back to the first m3u8/mp4 in the string.
    # 3) Fall back to the raw HTML (the URL may live outside the packer).
    unpacked = unpack_packer(page)
    attempts: list[tuple[re.Pattern[str], str, int]] = []
    if unpacked:
        attempts.append((_HOSTER_FILE_RE, unpacked, 1))
        attempts.append((_HOSTER_FALLBACK_URL_RE, unpacked, 0))
    attempts.append((_HOSTER_FALLBACK_URL_RE, page, 0))
    for regex, haystack, group in attempts:
        candidate = _pick_clean(regex, haystack, group)
        if candidate:
            return candidate

    # 4) yt-dlp as a last resort — battle-tested extractors for streamtape, dood,
    #    mixdrop, filemoon, voe, vidoza, etc. It is not used by default (slow, lots
    #    of HTTP), only once our own methods have failed.
    last_resort = _try_ytdlp_hoster(iframe_url, timeout=timeout)
    if last_resort:
        return last_resort

    log.warning(
        "hoster %s: no video URL in embed (packer unpack=%s, yt-dlp fail)",
        iframe_url,
        unpacked is not None,
    )
    return None
def _try_ytdlp_hoster(iframe_url: str, *, timeout: float) -> str | None:
    """yt-dlp wrapper for hosters that our P.A.C.K.E.R. unpacker could not handle.

    yt-dlp ships extractors for the popular hosters (streamtape, dood, mixdrop,
    filemoon, voe, vidoza, streamwish, ...) that do the multi-step AJAX / token
    rotation / regex unpacking each of them needs.

    Exception handling is catch-all: if yt-dlp has no extractor for the hoster or
    something breaks (timeout, anti-bot block, format change) this returns None
    and the caller drops to the hoster fallback (mobile WebView).

    Args:
        iframe_url: URL of the hoster embed page.
        timeout: Socket timeout in seconds (truncated to an int for yt-dlp).

    Returns:
        A URL that looks like a video stream and is not an ad, or None.
    """
    try:
        from yt_dlp import YoutubeDL
    except ImportError:
        return None
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": int(timeout),
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(iframe_url, download=False)
    except Exception as e:
        log.debug("yt-dlp hoster fallback failed for %s: %s", iframe_url, type(e).__name__)
        return None
    if info is None:
        return None

    def _looks_like_video_url(u: str | None) -> bool:
        if not u:
            return False
        if _looks_like_ad(u):
            return False
        low = u.lower()
        # Standard video formats. yt-dlp's generic extractor sometimes returns the
        # page URL as info["url"] when it did not recognise a stream (e.g. the
        # xtremestream.xyz player without KVS markers). Without this check the
        # extractor returned the iframe URL as the "stream"; mobile then tried to
        # play it through ExoPlayer and got a "fake video" or an error
        # (reported 2026-05-10, #30c4d3cf perverzija).
        return any(ext in low for ext in (".m3u8", ".mp4", ".mpd", ".webm", ".ts"))

    # yt-dlp orders `formats` from WORST to BEST, so walk the list backwards.
    # Taking the first entry would pick the lowest quality, often an audio-only
    # track. Single-format extractors have no `formats`; fall back to `info` itself.
    formats = info.get("formats") or [info]
    video_only_url: str | None = None
    for fmt in reversed(formats):
        if not isinstance(fmt, dict):
            continue
        url = fmt.get("url")
        if not _looks_like_video_url(url):
            continue
        if fmt.get("vcodec") == "none":
            # Audio-only track: never a playable scene on its own.
            continue
        if fmt.get("acodec") == "none":
            # Video without audio: remember the best one, but keep looking for a
            # muxed (audio + video) format first.
            if video_only_url is None:
                video_only_url = url
            continue
        return url
    if video_only_url is not None:
        return video_only_url

    # Fallback: the top-level URL, but only when it really looks like video.
    top = info.get("url")
    if _looks_like_video_url(top):
        return top
    return None
# Names exported by `from <this module> import *`.
__all__ = ["extract_stream_from_hoster", "unpack_packer", "HosterDead"]

View file

@ -0,0 +1,6 @@
"""Per-hoster dedicated extractors (mixdrop, voe, luluvid, etc.).
Dispatched from `app.extractors.hoster.extract_stream_from_hoster` based on the
hoster URL. Each module exports `extract(page_url, *, timeout)`, which returns
`list[StreamSource]` or None.
"""

View file

@ -0,0 +1,82 @@
"""Mixdrop embed hoster — P.A.C.K.E.R. eval → MDCore.wurl direct mp4.
Pattern (verified 2026-05-15 via curl_cffi impersonate=chrome120):
1. Fetch `https://mixdrop.my/e/<id>` 200 z 95KB body, redirect 301 do
`https://m1xdrop.bz/e/<id>` (current TLD).
2. Body zawiera P.A.C.K.E.R. obfuscated JS block:
`eval(function(p,a,c,k,e,d){...}('...packed...',N,N,'...|...'.split('|'),0,{}))`
3. yt-dlp's `decode_packed_codes()` rozkrywa do ~390 chars JavaScript:
`MDCore.wurl="//a-delivery22.mxcontent.net/v2/<id>.mp4?s=<sig>&e=<exp>&_t=<ts>"`
4. URL na `mxcontent.net` zwraca **direct mp4** (Content-Type: video/mp4,
Content-Length: ~485MB) działa z Hetzner VPS IP, brak token IP-bind.
`s` to signed token (HMAC?), `e` to expiry timestamp (unix sec), `_t` to
issued timestamp. Token jest valid ~24h od `_t`. Refetching embed page po
expiry zwraca nowy URL.
Active mango movies: 203 playbacks origin='mangoporn:mixdrop' w DB.
"""
from __future__ import annotations
import logging
import re
from app.extractors._fetch import browser_get
from app.extractors._models import StreamSource
log = logging.getLogger(__name__)
# Locates a whole `eval(function(p,a,c,k,e,d){...}(...))` P.A.C.K.E.R. block.
# Both quantifiers are lazy: the body ends at the first `}(` and the match ends at
# the first `))` after it. DOTALL lets the block span several lines.
_PACKER_RE = re.compile(
    r"eval\(function\(p,a,c,k,e,d\)\{.+?\}\(.+?\)\)",
    re.DOTALL,
)
# Pulls the double-quoted value assigned to `MDCore.wurl` out of the decoded
# script; the value must contain `.mp4`. Group 1 is the URL as written in the
# script (it may be protocol-relative).
_MP4_URL_RE = re.compile(r'MDCore\.wurl\s*=\s*"([^"]+\.mp4[^"]*)"')
def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Mixdrop embed page to its direct mp4 URL.

    Fetches the embed page, decodes its P.A.C.K.E.R. block with yt-dlp's
    `decode_packed_codes` and reads the `MDCore.wurl` assignment from the result.

    Args:
        page_url: Mixdrop embed page URL.
        timeout: Timeout passed to the page fetch.

    Returns:
        A one-element list with the mp4 source, or None when the fetch fails, the
        page has no packer block, decoding fails, or `MDCore.wurl` is missing.
    """
    response = browser_get(page_url, timeout=timeout)
    if not (response.status_code == 200 and response.text):
        log.info("mixdrop: fetch fail status=%s url=%s", response.status_code, page_url)
        return None

    packed = _PACKER_RE.search(response.text)
    if packed is None:
        log.info("mixdrop: no P.A.C.K.E.R. block in %s (page changed?)", page_url)
        return None

    try:
        from yt_dlp.utils import decode_packed_codes

        script = decode_packed_codes(packed.group(0))
    except Exception as exc:
        log.warning("mixdrop: decode_packed_codes failed: %s", exc)
        return None

    wurl = _MP4_URL_RE.search(script)
    if wurl is None:
        log.info("mixdrop: no MDCore.wurl in decoded payload (len=%d)", len(script))
        return None

    stream_url = wurl.group(1)
    # Mixdrop URLs are often protocol-relative (`//a-delivery22...`).
    if stream_url.startswith("//"):
        stream_url = f"https:{stream_url}"

    # The mxcontent CDN requires **same-session cookies** from the embed page plus
    # a Chrome JA3. The backend `extract` closes its session after the fetch, so
    # mobile tries the mp4 without cookies and gets a 403. The proxy MUST re-fetch
    # the embed page in a fresh curl_cffi session, extract a new mp4 URL and stream
    # it. `refetch_url` in raw → token field `rf` → proxy refresh logic.
    proxy_hints = {
        "proxy_impersonate": True,
        "refetch_url": page_url,  # embed page to re-extract from
        "refetch_hoster": "mixdrop",
    }
    mp4_source = StreamSource(
        link=stream_url,
        quality=None,  # mixdrop lists no quality variants in MDCore
        type="mp4",
        referer="https://mixdrop.my/",
        raw=proxy_hints,
    )
    return [mp4_source]

View file

@ -0,0 +1,153 @@
"""Common engine extractor for: embedseek, seekplayer, rpmplay, upns, player4me, easyvidplayer.
Wszyscy używają tego samego silnika (Vite-built React SPA + AES-CBC encrypted API
+ HLS-based streaming). Hostname domains different ale shared backend.
Pattern (verified 2026-05-15 z residential PL + VPS Hetzner FI):
1. Embed URL = `https://<sub>.<host>.<tld>/#<hash_id>` — hash fragment to video ID.
SPA shell `Loading...` body load'uje `/assets/index-<n>.js` bundle.
2. JS fetcha `/api/v1/video?id=<hash_id>&w=<W>&h=<H>&r=` (W,H z window.screen).
Response: hex-encoded AES-CBC(key=`kiemtienmua911ca`, iv=`1234567890oiuytr`)
ciphertext, ~5KB. PKCS7 padded.
3. Plaintext JSON zawiera:
- `source`: signed m3u8 URL na CDN edge IP (np. `185.237.107.146/v4/<sig>/<exp>/ty/<hash>/master.m3u8?v=...`)
- `cf`: Cloudflare-fronted fallback URL (.txt z listą m3u8 paths)
- `metric.ipAddress`: IP visitora (signed token IP-bound do tego IP)
- `metric.cfDomain`: CF domain dla fallback
- `title`, `poster`, `thumbnail`, ...
4. `source` URL jest signed z visitor IP. Z VPS fetch zwraca master.m3u8 z signed
token tied to VPS IP proxy fetcha segments z tym samym tokenem, działa.
CDN port 443 z `verify=False` (self-signed IP cert).
5. Wszystkie hostery share te same wartości KEY/IV. Wewnętrzna obfuskacja JS
maskuje to lookupem `ue(773)`, `ue(686)` itp. derived bytes zawsze
identyczne dla każdej domeny.
Hostery covered (origin counts w DB, 2026-05-15):
- embedseek (20271), seekplayer (20271) mirror sites, dzielą hash_id
- rpmplay (15317)
- upns (14287)
- player4me (41040)
- easyvidplayer (47588)
Razem ~159k playback sources.
"""
from __future__ import annotations
import json
import logging
import re
from urllib.parse import urlparse
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from app.extractors._fetch import _DEFAULT_UA, browser_get
from app.extractors._models import HosterDead, StreamSource
log = logging.getLogger(__name__)
# Fixed AES-CBC key and IV (16 bytes each) that `_decrypt` uses for the
# hex-encoded `/api/v1/video` response.
_KEY = b"kiemtienmua911ca"
_IV = b"1234567890oiuytr"
# Hostname matching: 6 base hosts × an optional single subdomain × TLD variants.
# Examples:
#   my.embedseek.online, vip.seekplayer.vip, my.rpmplay.online,
#   my.upns.online, vip.player4me.vip, p.easyvidplayer.com
_HOST_RE = re.compile(
    r"^(?:[a-z0-9]+\.)?(?:embedseek|seekplayer|rpmplay|upns|player4me|easyvidplayer)\."
    r"(?:online|vip|com|net|io|me|tv)$",
    flags=re.IGNORECASE,
)


def matches(url: str) -> bool:
    """Tell whether ``url`` points at one of the hosts served by this engine.

    Returns False for URLs that cannot be parsed or that have no hostname.
    """
    try:
        hostname = urlparse(url).hostname
    except Exception:
        return False
    return _HOST_RE.match(hostname or "") is not None
def _decrypt(hex_str: str) -> str:
    """Decrypt a hex-encoded AES-CBC ciphertext with the module key and IV.

    The PKCS7 padding is removed and the plaintext is decoded as UTF-8, with
    undecodable bytes replaced rather than raising.
    """
    ciphertext = bytes.fromhex(hex_str)
    decryptor = Cipher(algorithms.AES(_KEY), modes.CBC(_IV)).decryptor()
    padded = decryptor.update(ciphertext) + decryptor.finalize()
    pkcs7 = padding.PKCS7(128).unpadder()
    plaintext = pkcs7.update(padded) + pkcs7.finalize()
    return plaintext.decode("utf-8", errors="replace")
def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a seekplayer-engine embed URL to its HLS stream.

    Reads the video id from the URL fragment (or the `id` query parameter), calls
    the hoster's `/api/v1/video` endpoint, decrypts the hex AES-CBC response and
    takes the `source` / `cf` fields from the resulting JSON object.

    Args:
        page_url: Embed URL on one of the hosts accepted by `_HOST_RE`.
        timeout: Timeout passed to the API fetch.

    Returns:
        A one-element list with the m3u8 source, or None when the host is not
        supported, no video id is present, the API call fails, the response cannot
        be decrypted or parsed, or it carries no usable `source`.

    Raises:
        HosterDead: when the API answers 404/410 or the JSON has an `error` field.
    """
    parsed = urlparse(page_url)
    if not parsed.hostname or not _HOST_RE.match(parsed.hostname):
        return None
    # The hash_id lives in the `#<id>` fragment; when the client passed the URL
    # without `#` (e.g. after nav.replace) also try the `?id=` query parameter.
    hash_id = parsed.fragment.strip()
    if not hash_id and parsed.query:
        from urllib.parse import parse_qs

        qs = parse_qs(parsed.query)
        hash_id = (qs.get("id") or [""])[0]
    if not hash_id:
        log.info("seekplayer-engine: no hash_id w %s", page_url)
        return None
    from urllib.parse import quote

    host = f"{parsed.scheme}://{parsed.hostname}"
    # Percent-encode the id: it comes from a scraped URL and must not be able to
    # inject extra query parameters (`&`, `#`, spaces, ...).
    api_url = f"{host}/api/v1/video?id={quote(hash_id, safe='')}&w=1920&h=1080&r="
    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "*/*",
        "Referer": f"{host}/",
    }
    r = browser_get(api_url, headers=headers, timeout=timeout)
    if r.status_code in (404, 410):
        raise HosterDead(f"seekplayer-engine {page_url}: HTTP {r.status_code}")
    if r.status_code != 200 or not r.text:
        log.info("seekplayer-engine: api fail %s status=%s", api_url, r.status_code)
        return None
    try:
        plaintext = _decrypt(r.text)
    except Exception as e:
        log.warning("seekplayer-engine: decrypt fail dla %s: %s", api_url, e)
        return None
    try:
        data = json.loads(plaintext)
    except Exception as e:
        log.warning("seekplayer-engine: JSON parse fail dla %s: %s", api_url, e)
        return None
    # Valid JSON that is not an object (null, list, string) has none of the fields
    # read below; bail out instead of failing with AttributeError on `.get`.
    if not isinstance(data, dict):
        log.warning(
            "seekplayer-engine: unexpected JSON payload type %s for %s",
            type(data).__name__,
            api_url,
        )
        return None
    # Same-engine hosters answer `{"error": "..."}` when the video does not exist.
    if data.get("error"):
        raise HosterDead(f"seekplayer-engine {page_url}: {data['error']}")
    raw_source = data.get("source")
    raw_cf = data.get("cf")
    # Only string values can be URLs; anything else is treated as absent.
    source = raw_source.strip() if isinstance(raw_source, str) else ""
    cf = raw_cf.strip() if isinstance(raw_cf, str) else ""
    # Source: IP-bound m3u8 URL on a CDN edge
    # (e.g. `185.237.107.146/v4/<sig>/<exp>/ty/<hash>/master.m3u8`).
    # The token is signed for the VPS IP — the proxy serves the segments from the
    # same IP, so that works. The CDN serves its certificate on a bare IP, so the
    # fetch needs verify=False (stream_proxy.py already has such a branch for
    # IP-host m3u8).
    sources: list[StreamSource] = []
    if source:
        sources.append(
            StreamSource(
                link=source,
                quality=None,
                type="m3u8",
                referer=f"{host}/",
                raw={
                    "proxy_no_verify": True,
                    "cf_fallback": cf or None,
                    "engine": "seekplayer",
                },
            )
        )
    return sources or None

View file

@ -0,0 +1,117 @@
"""Streamtape embed → direct mp4 extractor.
Pattern (verified 2026-05-15 z residential, live URL `/e/PZqBZp4OomF0Q61`):
1. Embed `/e/<id>` zwraca 89KB body z 4 `document.getElementById(...).innerHTML`
assignmentami konstruującymi pełen URL do `/get_video`. Każdy uses ten sam
pattern:
document.getElementById('robotlink').innerHTML =
'//streamtape.com/get_video' +
('<junk>?id=<id>&expires=...&ip=...&token=...').substring(N).substring(M);
Junk to 3-4 znaki przed `?` substring(N).substring(M) je odcina.
2. Po sklejeniu fetch `https://streamtape.com/get_video?id=...&token=...`
302 `https://<cluster>.tapecontent.net/radosgw/<id>/<signed_path>/<title>.mp4`
(direct mp4, video/mp4 ~500MB, brak IP-bind).
3. Body czasem zwraca `Video not found! Maybe it got deleted by the creator!`
większość URLów w naszej DB (12k mass-DMCA'd 2026-05-15). Wtedy raise
HosterDead, caller w playback.py oznaczy dead_at.
Live URL coverage probed 2026-05-15: ~5% URLów żyje, reszta `Video not found`.
"""
from __future__ import annotations
import logging
import re
from app.extractors._fetch import _DEFAULT_UA, browser_get
from app.extractors._models import HosterDead, StreamSource
log = logging.getLogger(__name__)
# Match: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);`
# Streamtape generates 4 assignments (ideoolink x2 + botlink + robotlink) — 2 are
# DECOYS with a broken hostname (`.comb`, `.cob`) and only botlink/robotlink yield
# the real URL. The prefix may be a fragment: `/streamtape.com`, `//streamtape.co`,
# `//streamtape.com/g` — `get_video` is often split between prefix and suffix after
# the slices. The decision is therefore made on the COMBINED output containing the
# exact `streamtape.com/get_video?`.
# Named groups: elem (element id), prefix, suffix, slices (the `.substring(N)` chain).
_ASSIGN_RE = re.compile(
    r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*"
    r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]"
    r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?"
    r"\(['\"](?P<suffix>[^'\"]+)['\"]\)"
    r"(?P<slices>(?:\.substring\(\d+\))+)",
    re.IGNORECASE,
)
_SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)")
_NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE)
def _apply_slices(suffix: str, slices_str: str) -> str:
out = suffix
for m in _SUBSTRING_RE.finditer(slices_str):
n = int(m.group(1))
out = out[n:]
return out
def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Streamtape embed page to its `/get_video` URL.

    Args:
        page_url: Streamtape embed page URL.
        timeout: Timeout passed to the page fetch.

    Returns:
        A one-element list with the mp4 source, or None when the fetch fails or
        no innerHTML assignment yields a valid `get_video` URL.

    Raises:
        HosterDead: when the page answers 404/410 or its body says
            "Video not found".
    """
    response = browser_get(
        page_url,
        headers={
            "User-Agent": _DEFAULT_UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "en-US,en;q=0.9",
        },
        timeout=timeout,
    )
    if response.status_code in (404, 410):
        raise HosterDead(f"streamtape {page_url}: HTTP {response.status_code}")
    if response.status_code != 200 or not response.text:
        log.info("streamtape: fetch fail %s status=%s", page_url, response.status_code)
        return None
    body = response.text
    if _NOT_FOUND_RE.search(body):
        raise HosterDead(f"streamtape {page_url}: Video not found")

    def _candidate_urls():
        """Yield the URL built by each innerHTML assignment, in page order."""
        for hit in _ASSIGN_RE.finditer(body):
            joined = hit.group("prefix").strip() + _apply_slices(
                hit.group("suffix"), hit.group("slices")
            )
            # Normalise: add `https:` when the URL starts with `//`.
            if joined.startswith("//"):
                yield "https:" + joined
            elif joined.startswith("/"):
                # `/streamtape.com/...` → `https://streamtape.com/...`
                yield "https:/" + joined
            else:
                yield joined

    # Try all 4 assignments — the first valid URL wins. `get_video` may sit in the
    # prefix (residential variant) or be split across prefix + suffix (VPS variant,
    # where the decoy assignments produce `.comb/get_video`).
    # The validation weeds out the decoys (`streamtape.comb`, `streamtape.cob`).
    required_parts = ("streamtape.com/get_video?", "id=", "token=")
    final_url = next(
        (
            candidate
            for candidate in _candidate_urls()
            if all(part in candidate for part in required_parts)
        ),
        None,
    )
    if not final_url:
        log.info("streamtape: no valid innerHTML assignment found in %s", page_url)
        return None

    # /get_video answers with a 302 to the tapecontent.net direct mp4. The proxy
    # has to follow the redirect (stream_proxy uses follow_redirects=True by default).
    mp4_source = StreamSource(
        link=final_url,
        quality=None,
        type="mp4",
        referer=page_url,
        raw={"redirect_via": "streamtape_get_video"},
    )
    return [mp4_source]

Some files were not shown because too many files have changed in this diff Show more