Initial commit
Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
This commit is contained in:
commit
ad0284585b
329 changed files with 51795 additions and 0 deletions
29
.env.example
Normal file
29
.env.example
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
POSTGRES_USER=goon
|
||||||
|
POSTGRES_PASSWORD=goon
|
||||||
|
POSTGRES_DB=goon
|
||||||
|
POSTGRES_PORT=5432
|
||||||
|
|
||||||
|
API_PORT=8000
|
||||||
|
|
||||||
|
DATABASE_URL=postgresql+psycopg://goon:goon@localhost:5432/goon
|
||||||
|
|
||||||
|
# TPDB (theporndb.net) — required for canonical scene metadata + performer canonicalization.
|
||||||
|
# Get token from your TPDB account settings.
|
||||||
|
TPDB_API_TOKEN=
|
||||||
|
TPDB_BASE_URL=https://api.theporndb.net
|
||||||
|
|
||||||
|
# StashDB — second canonical source. Required for full performer/scene cross-source dedup.
|
||||||
|
STASHDB_API_KEY=
|
||||||
|
STASHDB_GRAPHQL_URL=https://stashdb.org/graphql
|
||||||
|
|
||||||
|
LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# Comma-separated list of API keys. Empty = auth disabled (only safe for localhost).
|
||||||
|
# Generate with: python -c "import secrets; print(secrets.token_urlsafe(32))"
|
||||||
|
API_KEYS=
|
||||||
|
|
||||||
|
# Sentry observability — empty = init no-op (no telemetry sent).
|
||||||
|
# Set your own DSN if you self-host Sentry or use cloud free tier.
|
||||||
|
SENTRY_DSN=
|
||||||
|
SENTRY_ENVIRONMENT=dev
|
||||||
|
SENTRY_TRACES_SAMPLE_RATE=0.1
|
||||||
32
.github/workflows/backend-tests.yml
vendored
Normal file
32
.github/workflows/backend-tests.yml
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
name: Backend tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
pull_request:
|
||||||
|
branches: [main]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 10
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
cache: 'pip'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -e .[dev]
|
||||||
|
|
||||||
|
- name: Lint (ruff)
|
||||||
|
run: ruff check app/ tests/
|
||||||
|
|
||||||
|
- name: Run pytest
|
||||||
|
run: pytest --tb=short
|
||||||
85
.github/workflows/build-apk.yml
vendored
Normal file
85
.github/workflows/build-apk.yml
vendored
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
name: Build Android APK
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 30
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Setup Node
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: '20'
|
||||||
|
cache: 'npm'
|
||||||
|
cache-dependency-path: mobile/package-lock.json
|
||||||
|
|
||||||
|
- name: Setup Java
|
||||||
|
uses: actions/setup-java@v4
|
||||||
|
with:
|
||||||
|
distribution: 'temurin'
|
||||||
|
java-version: '17'
|
||||||
|
|
||||||
|
- name: Setup Gradle cache
|
||||||
|
uses: gradle/actions/setup-gradle@v4
|
||||||
|
|
||||||
|
- name: Install npm dependencies
|
||||||
|
working-directory: mobile
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Pre-bundle JS for debug embedding
|
||||||
|
# Default RN debug builds don't embed JS bundle (expects Metro server).
|
||||||
|
# We explicitly run Expo's `export:embed` so the resulting APK works
|
||||||
|
# standalone on a phone without Metro running. This is also where
|
||||||
|
# `EXPO_PUBLIC_*` env vars get inlined into the bundle.
|
||||||
|
working-directory: mobile
|
||||||
|
env:
|
||||||
|
EXPO_PUBLIC_SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
|
||||||
|
EXPO_PUBLIC_SENTRY_ENVIRONMENT: ${{ vars.SENTRY_ENVIRONMENT || 'production' }}
|
||||||
|
run: |
|
||||||
|
mkdir -p android/app/src/main/assets android/app/src/main/res
|
||||||
|
node node_modules/@expo/cli/build/bin/cli export:embed \
|
||||||
|
--platform android \
|
||||||
|
--dev false \
|
||||||
|
--bundle-output android/app/src/main/assets/index.android.bundle \
|
||||||
|
--assets-dest android/app/src/main/res
|
||||||
|
|
||||||
|
- name: Build debug APK
|
||||||
|
working-directory: mobile/android
|
||||||
|
run: ./gradlew assembleDebug --no-daemon
|
||||||
|
env:
|
||||||
|
NODE_OPTIONS: --max_old_space_size=4096
|
||||||
|
|
||||||
|
- name: Rename APK with version
|
||||||
|
id: rename
|
||||||
|
working-directory: mobile/android/app/build/outputs/apk/debug
|
||||||
|
run: |
|
||||||
|
REF_NAME="${{ github.ref_name }}"
|
||||||
|
# Sanitize ref → safe filename component
|
||||||
|
VERSION="${REF_NAME//[^a-zA-Z0-9._-]/_}"
|
||||||
|
mv app-debug.apk "goon-${VERSION}-debug.apk"
|
||||||
|
echo "apk=mobile/android/app/build/outputs/apk/debug/goon-${VERSION}-debug.apk" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Upload APK artifact
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: goon-apk-${{ github.ref_name }}
|
||||||
|
path: ${{ steps.rename.outputs.apk }}
|
||||||
|
retention-days: 30
|
||||||
|
|
||||||
|
- name: Attach APK to GitHub Release
|
||||||
|
if: startsWith(github.ref, 'refs/tags/')
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: ${{ steps.rename.outputs.apk }}
|
||||||
|
fail_on_unmatched_files: true
|
||||||
|
generate_release_notes: true
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
77
.gitignore
vendored
Normal file
77
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
|
mobile/.env
|
||||||
|
mobile/.env.local
|
||||||
|
*.pyc
|
||||||
|
__pycache__/
|
||||||
|
.pytest_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
*.egg-info/
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.sqlite
|
||||||
|
*.db
|
||||||
|
|
||||||
|
# Personal operational notes (deploy state, in-progress notes per session)
|
||||||
|
DEPLOY_BACKLOG.md
|
||||||
|
|
||||||
|
# Mobile (Expo / React Native)
|
||||||
|
mobile/node_modules/
|
||||||
|
mobile/.expo/
|
||||||
|
mobile/dist/
|
||||||
|
mobile/web-build/
|
||||||
|
mobile/android/.gradle/
|
||||||
|
mobile/android/app/build/
|
||||||
|
mobile/android/build/
|
||||||
|
mobile/ios/build/
|
||||||
|
mobile/ios/Pods/
|
||||||
|
mobile/*.jks
|
||||||
|
mobile/*.keystore
|
||||||
|
|
||||||
|
# Mobile build artifacts (regenerated on `gradlew assembleDebug` by expo
|
||||||
|
# `export:embed`). DO NOT commit — it breaks rebuilds (gradle merges the stale
|
||||||
|
# bundle instead of generating a fresh one; see session 2026-05-07).
|
||||||
|
mobile/android/app/src/main/assets/index.android.bundle
|
||||||
|
mobile/android/app/src/main/res/drawable-*/
|
||||||
|
mobile/android/app/src/main/res/raw/
|
||||||
|
|
||||||
|
# yt-dlp / scrapers cache
|
||||||
|
.yt-dlp-cache/
|
||||||
|
|
||||||
|
# Reverse-engineered third-party APKs (AIO Streamer decompilation — kept locally for
|
||||||
|
# debugging the legacy porn-app auth flow, but MUST NOT enter public git history;
|
||||||
|
# distributing decompiled proprietary code violates copyright/EULA).
|
||||||
|
re/
|
||||||
|
|
||||||
|
# DB dumps (operational backups, may contain user data)
|
||||||
|
*.dump
|
||||||
|
*.sql.gz
|
||||||
|
|
||||||
|
# Built APKs (release/debug binaries — distributed via GitHub Releases instead)
|
||||||
|
*.apk
|
||||||
|
|
||||||
|
# Claude Code session data (transcripts/agents — local only)
|
||||||
|
.claude/
|
||||||
|
|
||||||
|
# Operational input / debug logs (per session)
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Per-user runtime artifacts — NOT for the public repo
|
||||||
|
.iclaude
|
||||||
|
wa-logs.txt
|
||||||
|
mcp-logs.txt
|
||||||
|
|
||||||
|
# ADB / development debug artifacts (screenshots, UI dumps)
|
||||||
|
.tmp_adb/
|
||||||
|
|
||||||
|
# Operational deploy scripts — moved to a private companion repo. Public repo
|
||||||
|
# should NOT contain SSH commands, systemd units, or smoke-test playbooks
|
||||||
|
# referencing concrete hosts.
|
||||||
|
deploy/
|
||||||
120
CONTRIBUTING.md
Normal file
120
CONTRIBUTING.md
Normal file
|
|
@ -0,0 +1,120 @@
|
||||||
|
# Contributing to Goon
|
||||||
|
|
||||||
|
## Development setup
|
||||||
|
|
||||||
|
Goon backend is Python 3.12+, FastAPI + SQLAlchemy + APScheduler + Postgres.
|
||||||
|
Mobile client is React Native + Expo.
|
||||||
|
|
||||||
|
### Backend
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create virtualenv
|
||||||
|
python -m venv .venv
|
||||||
|
. .venv/bin/activate # or .venv\Scripts\activate on Windows
|
||||||
|
|
||||||
|
# Install with dev extras
|
||||||
|
pip install -e .[dev]
|
||||||
|
|
||||||
|
# Bring up postgres (or use docker-compose; see README)
|
||||||
|
# Adjust DATABASE_URL in .env if needed
|
||||||
|
cp .env.example .env
|
||||||
|
|
||||||
|
# Run migrations
|
||||||
|
alembic upgrade head
|
||||||
|
|
||||||
|
# Run API
|
||||||
|
uvicorn app.main:app --reload --port 8000
|
||||||
|
|
||||||
|
# Run worker (separate terminal)
|
||||||
|
python -m app.scheduler.worker # full scheduler
|
||||||
|
python -m app.scheduler.worker --once --source=tpdb --limit=50 # one-shot ingest
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mobile
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd mobile
|
||||||
|
npm install
|
||||||
|
npm start # opens Expo dev server
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pytest # full suite (~70 tests, <5s)
|
||||||
|
pytest tests/test_resolve_*.py -v
|
||||||
|
ruff check app/ tests/ # same invocation CI uses
|
||||||
|
mypy app/ # optional; not required for PRs
|
||||||
|
```
|
||||||
|
|
||||||
|
PRs must pass `pytest` + `ruff check`. Run them locally before pushing.
|
||||||
|
|
||||||
|
## Code style
|
||||||
|
|
||||||
|
- **Formatting**: ruff (config in `pyproject.toml`). Line length 100.
|
||||||
|
- **Type hints**: required on public functions. `from __future__ import annotations`
|
||||||
|
in every module.
|
||||||
|
- **Docstrings**: write the **why**, not the **what**. Reference real bugs/incidents
|
||||||
|
when explaining non-obvious code paths.
|
||||||
|
- **Comments**: only when the code can't speak for itself. Prefer renaming a
|
||||||
|
variable over adding a comment that explains it.
|
||||||
|
- **No dead code, no commented-out code, no TODO without an issue link.**
|
||||||
|
- **Polish or English in comments**: existing code is mostly Polish in
|
||||||
|
comments and English in code (function/class/var names). New code can be
|
||||||
|
either, but be consistent within a file.
|
||||||
|
|
||||||
|
## Adding a new tube extractor / scraper
|
||||||
|
|
||||||
|
If you want Goon to support an additional adult tube site:
|
||||||
|
|
||||||
|
1. **Stream extractor** (`app/extractors/tubes/`): given a scene page URL,
|
||||||
|
return a list of `StreamSource` (m3u8/mp4 URLs with quality labels).
|
||||||
|
- Mainstream tubes: try `_ytdlp.extract` (yt-dlp covers ~30 tubes out of
|
||||||
|
the box — just register the sitetag in `app/extractors/__init__.py`).
|
||||||
|
- WordPress-like tubes with embed iframe: register `_embed_iframe.extract`.
|
||||||
|
- Custom player / signed URLs / token rotation: write your own per-tube
|
||||||
|
module (see `hqporner.py`, `eporner.py`, `sxyprn.py` as references).
|
||||||
|
|
||||||
|
2. **Discovery scraper** (`app/connectors/direct_scrapers/`): subclass
|
||||||
|
`BaseSearchScraper`, set `sitetag`, `_search_url_template`, `_scene_url_re`.
|
||||||
|
Most aggregator tubes can fit in 10-20 lines (see `xmoviesforyou.py`).
|
||||||
|
|
||||||
|
3. **Register** the scraper class in `ALL_DIRECT_SCRAPERS` in
|
||||||
|
`app/connectors/direct_scrapers/__init__.py`.
|
||||||
|
|
||||||
|
4. **Test** with one performer name that you know has scenes on that tube:
|
||||||
|
```bash
|
||||||
|
python -m app.scheduler.worker --once --strategy=performer-driven \
|
||||||
|
--performers="Some Performer" --sitetags=<your-sitetag>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database migrations
|
||||||
|
|
||||||
|
Use Alembic:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
alembic revision -m "describe change" # new migration
|
||||||
|
alembic upgrade head # apply
|
||||||
|
alembic downgrade -1 # roll back one
|
||||||
|
```
|
||||||
|
|
||||||
|
Every migration must have a working `downgrade()`. We don't ship squashed
|
||||||
|
migrations — full history is the source of truth.
|
||||||
|
|
||||||
|
## What we won't merge
|
||||||
|
|
||||||
|
- **Adult-content moderation features** (auto-tagging by detected acts,
|
||||||
|
content filtering by performer attributes, etc.) — out of scope.
|
||||||
|
- **Hardcoded credentials, API keys, or device IDs** in source — must be
|
||||||
|
env-driven.
|
||||||
|
- **Bypassing tube paywalls / DRM / auth** — Goon only scrapes publicly
|
||||||
|
accessible search pages.
|
||||||
|
- **Telemetry or analytics that report user activity to third parties**.
|
||||||
|
Sentry is opt-in (`SENTRY_DSN` empty by default).
|
||||||
|
- **Public deployment recipes** (e.g. nginx config for an open instance).
|
||||||
|
Goon is self-hosted only — see [DISCLAIMER.md](./DISCLAIMER.md).
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
By contributing, you agree your contributions are licensed under the MIT
|
||||||
|
License (see [LICENSE](./LICENSE)).
|
||||||
62
DISCLAIMER.md
Normal file
62
DISCLAIMER.md
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
# Disclaimer
|
||||||
|
|
||||||
|
## Adult Content (18+)
|
||||||
|
|
||||||
|
Goon is a self-hosted aggregator for adult-content scene metadata. The software
|
||||||
|
itself contains no media — it indexes metadata from third-party sources
|
||||||
|
(ThePornDB, StashDB, public adult tube sites) and links to those sources for
|
||||||
|
playback.
|
||||||
|
|
||||||
|
By using, hosting, or distributing this software you affirm that:
|
||||||
|
|
||||||
|
- You are at least 18 years of age (or the age of legal majority in your
|
||||||
|
jurisdiction, whichever is greater).
|
||||||
|
- Adult content is legal to view, store metadata about, and access in your
|
||||||
|
jurisdiction.
|
||||||
|
- You are solely responsible for compliance with all applicable laws,
|
||||||
|
including (but not limited to) record-keeping requirements (e.g. 18 U.S.C.
|
||||||
|
§ 2257 in the United States) and content classification rules.
|
||||||
|
|
||||||
|
## Self-Hosting Only
|
||||||
|
|
||||||
|
This software is intended for **self-hosting on infrastructure you control**.
|
||||||
|
Operating a public-facing instance accessible to unauthenticated users is
|
||||||
|
**not the intended use case** and may expose you to legal liability for
|
||||||
|
content delivery, age verification, and data protection.
|
||||||
|
|
||||||
|
If you operate a publicly accessible instance you are entirely responsible for
|
||||||
|
implementing the age verification, geo-restrictions, content moderation, ToS,
|
||||||
|
and privacy controls that your jurisdiction requires.
|
||||||
|
|
||||||
|
## Third-Party Sources
|
||||||
|
|
||||||
|
Goon scrapes publicly accessible search/listing pages from adult tube sites
|
||||||
|
to build its index. By configuring those scrapers and pointing them at a
|
||||||
|
target tube you accept that:
|
||||||
|
|
||||||
|
- Tube sites' Terms of Service may prohibit automated access. Respect their
|
||||||
|
rate limits and `robots.txt`. Goon does not bypass paywalls, authentication,
|
||||||
|
or DRM.
|
||||||
|
- Tube sites may at any time change their HTML, block your IP, or disable
|
||||||
|
features Goon depends on. Discovery and stream resolution are best-effort.
|
||||||
|
- The metadata Goon stores (titles, performer names, duration, thumbnails)
|
||||||
|
is sourced from those tubes and may contain inaccuracies, NSFW filenames,
|
||||||
|
or content the tube has since removed. Handling takedown requests is your
|
||||||
|
responsibility — Goon ships no takedown workflow.
|
||||||
|
|
||||||
|
## No Warranty
|
||||||
|
|
||||||
|
This software is provided "AS IS" without warranty of any kind. See
|
||||||
|
[LICENSE](./LICENSE) for full terms. The authors and contributors are not
|
||||||
|
liable for any damages, losses, or legal consequences arising from use of
|
||||||
|
this software.
|
||||||
|
|
||||||
|
## Reporting Issues
|
||||||
|
|
||||||
|
For security issues affecting the software itself (auth bypass, RCE, secret
|
||||||
|
leak): open a private security advisory on the GitHub repository.
|
||||||
|
|
||||||
|
For takedown requests, content concerns, or jurisdiction-specific compliance
|
||||||
|
questions: contact the operator of the specific instance — Goon contributors
|
||||||
|
are not in a position to take action on third-party content surfaced by
|
||||||
|
self-hosted deployments.
|
||||||
22
Dockerfile
Normal file
22
Dockerfile
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PIP_NO_CACHE_DIR=1 \
|
||||||
|
PYTHONPATH=/srv
|
||||||
|
|
||||||
|
WORKDIR /srv
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends build-essential \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY pyproject.toml ./
|
||||||
|
RUN pip install --upgrade pip \
|
||||||
|
&& pip install -e .[dev]
|
||||||
|
|
||||||
|
COPY app ./app
|
||||||
|
COPY alembic ./alembic
|
||||||
|
COPY alembic.ini ./
|
||||||
|
|
||||||
|
EXPOSE 8000
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2026 Goon contributors
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
261
README.md
Normal file
261
README.md
Normal file
|
|
@ -0,0 +1,261 @@
|
||||||
|
# Goon
|
||||||
|
|
||||||
|
Self-hosted aggregator for adult-content scene metadata. Indexes scenes from
|
||||||
|
ThePornDB, StashDB, and 30+ public adult tube sites; deduplicates across
|
||||||
|
sources; serves an API + mobile (React Native) client for browsing and
|
||||||
|
linking out to playback.
|
||||||
|
|
||||||
|
> **18+ ONLY · Self-hosted only · See [DISCLAIMER.md](./DISCLAIMER.md) before
|
||||||
|
> hosting an instance.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What it does
|
||||||
|
|
||||||
|
- **Multi-source ingest**: pulls canonical scene/performer/studio metadata from
|
||||||
|
TPDB and StashDB on a delta cron, merges duplicates by performer + title +
|
||||||
|
date heuristics (perceptual hash + Levenshtein title distance).
|
||||||
|
- **Tube discovery**: per-performer search across 30+ public adult tube sites
|
||||||
|
(mainstream + aggregators). Each tube is scraped directly via HTTP — no
|
||||||
|
proprietary API dependencies.
|
||||||
|
- **Stream resolution on demand**: when a user clicks Watch, the API extracts
|
||||||
|
a fresh m3u8/mp4 URL from the tube's page (or falls back to embed link for
|
||||||
|
WebView playback). Mainstream tubes use yt-dlp; aggregator tubes use a
|
||||||
|
generic P.A.C.K.E.R. unpacker for JWPlayer-based hosters
|
||||||
|
(StreamWish/doodporn/mixdrop/...).
|
||||||
|
- **Mobile client** (Expo / React Native): scene grid, performer pages, watch
|
||||||
|
history, favorites, hold-to-preview animated thumbnails.
|
||||||
|
- **Performer-driven backfill**: a continuous worker walks performers ordered
|
||||||
|
by `last_searched_at NULLS FIRST` and back-fills tube scenes for the
|
||||||
|
longest-stale performer first.
|
||||||
|
|
||||||
|
## What it doesn't do
|
||||||
|
|
||||||
|
- Host or store any media. Scene metadata + thumbnail URLs only.
|
||||||
|
- Bypass paywalls, authentication, geo-blocks, or DRM.
|
||||||
|
- Provide age verification, ToS gating, or moderation for public deployments.
|
||||||
|
See [DISCLAIMER.md](./DISCLAIMER.md).
|
||||||
|
- Phone home. Sentry telemetry is opt-in (env var, empty by default).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
### 1. Run the backend (Docker)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <repo-url> goon
|
||||||
|
cd goon
|
||||||
|
cp .env.example .env
|
||||||
|
# Edit .env:
|
||||||
|
# - TPDB_API_TOKEN (theporndb.net account → API tokens)
|
||||||
|
# - STASHDB_API_KEY (stashdb.org account → API keys)
|
||||||
|
# - API_KEYS (generate one: python -c "import secrets; print(secrets.token_urlsafe(32))")
|
||||||
|
|
||||||
|
docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
Three services come up: `db` (Postgres 16), `api` (FastAPI on `:8000`,
|
||||||
|
auto-applies migrations on startup), `worker` (APScheduler running TPDB/StashDB
|
||||||
|
delta + performer-driven backfill).
|
||||||
|
|
||||||
|
Verify: `curl localhost:8000/health` → `{"status":"ok"}`.
|
||||||
|
|
||||||
|
### 2. Install the mobile app (Android)
|
||||||
|
|
||||||
|
Download the latest debug APK from
|
||||||
|
[GitHub Releases](../../releases/latest) → `goon-vX.Y.Z-debug.apk`, install on
|
||||||
|
your Android device (allow "Install from unknown sources" for the browser /
|
||||||
|
file manager you used to download).
|
||||||
|
|
||||||
|
On first launch the app shows the age-gate disclaimer (must be accepted), then
|
||||||
|
a login screen. Enter:
|
||||||
|
- **Backend URL**: `http://<your-backend-host>:8000` (e.g. your LAN IP, or
|
||||||
|
`http://localhost:8000` if running on the device — uncommon)
|
||||||
|
- **API key**: one of the values you put in `API_KEYS` in `.env`
|
||||||
|
|
||||||
|
That's it.
|
||||||
|
|
||||||
|
### Local Python (no Docker)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m venv .venv && . .venv/bin/activate # or .\.venv\Scripts\activate on Windows
|
||||||
|
pip install -e .[dev]
|
||||||
|
cp .env.example .env # edit creds
|
||||||
|
alembic upgrade head
|
||||||
|
uvicorn app.main:app --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Worker (manual one-shot ingest)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Foreground APScheduler with all jobs
|
||||||
|
python -m app.scheduler.worker
|
||||||
|
|
||||||
|
# One-shot:
|
||||||
|
python -m app.scheduler.worker --once --source=tpdb --limit=200
|
||||||
|
python -m app.scheduler.worker --once --strategy=performer-driven --top-n=20
|
||||||
|
python -m app.scheduler.worker --once --strategy=performer-driven \
|
||||||
|
--performers="Lola Noir,Mia Malkova"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building the APK locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd mobile
|
||||||
|
npm install
|
||||||
|
cd android
|
||||||
|
./gradlew assembleDebug
|
||||||
|
# output: mobile/android/app/build/outputs/apk/debug/app-debug.apk
|
||||||
|
```
|
||||||
|
|
||||||
|
Or just push a `v*` tag — GitHub Actions builds and attaches the APK to the
|
||||||
|
Release ([.github/workflows/build-apk.yml](./.github/workflows/build-apk.yml)).
|
||||||
|
|
||||||
|
### Sentry telemetry (optional)
|
||||||
|
|
||||||
|
Default behavior: **no telemetry**. Sentry only initializes when a DSN is
|
||||||
|
present at runtime/build time.
|
||||||
|
|
||||||
|
To enable Sentry for **your** instance (errors only, no PII, no replay):
|
||||||
|
|
||||||
|
- **Backend**: set `SENTRY_DSN=https://...` in `.env` (gitignored).
|
||||||
|
Optionally `SENTRY_ENVIRONMENT=production` and `SENTRY_TRACES_SAMPLE_RATE=0.1`.
|
||||||
|
- **Mobile (local builds)**: create `mobile/.env` (gitignored) with
|
||||||
|
`EXPO_PUBLIC_SENTRY_DSN=https://...`. Expo SDK 49+ auto-inlines `EXPO_PUBLIC_*`
|
||||||
|
vars into the JS bundle at build time.
|
||||||
|
- **Mobile (CI builds)**: add a GitHub repository secret named `SENTRY_DSN`.
|
||||||
|
The APK workflow passes it as `EXPO_PUBLIC_SENTRY_DSN` to the Expo `export:embed` bundling step. Without
|
||||||
|
the secret, the APK ships with telemetry disabled (forks of this repo don't
|
||||||
|
inherit your DSN).
|
||||||
|
|
||||||
|
Sentry init is gated by `if (SENTRY_DSN) { Sentry.init(...) }` — empty DSN
|
||||||
|
means the SDK is loaded as dead code but never sends a single request.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
All runtime config is environment variables (see [.env.example](./.env.example)
|
||||||
|
for the full list). Highlights:
|
||||||
|
|
||||||
|
| Var | Default | Required? | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `DATABASE_URL` | `postgresql+psycopg://goon:goon@localhost:5432/goon` | Yes | Postgres 14+ |
|
||||||
|
| `TPDB_API_TOKEN` | _empty_ | For TPDB ingest | Get from theporndb.net account |
|
||||||
|
| `STASHDB_API_KEY` | _empty_ | For StashDB ingest | Get from stashdb.org account |
|
||||||
|
| `API_KEYS` | _empty_ | Recommended | CSV of allowed API keys; empty = no auth (localhost-only) |
|
||||||
|
| `SENTRY_DSN` | _empty_ | No | Empty = no telemetry. Use your own DSN if you want crash reports. |
|
||||||
|
| `LOG_LEVEL` | `INFO` | No | DEBUG for verbose tube scraping logs |
|
||||||
|
|
||||||
|
Scheduler tuning (set to `0` to disable a job):
|
||||||
|
|
||||||
|
| Var | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `GOON_SCHED_TPDB_HOURS` | `6` | TPDB delta interval |
|
||||||
|
| `GOON_SCHED_STASHDB_HOURS` | `6` | StashDB delta interval |
|
||||||
|
| `GOON_SCHED_PERFORMER_DRIVEN_HOURS` | `12` | Top-N performer ingest |
|
||||||
|
| `GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS` | `15` | Continuous backfill tick |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture (high level)
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────┐ delta cron ┌────────────┐
|
||||||
|
│ TPDB │────────────────▶│ │
|
||||||
|
└──────────┘ │ │
|
||||||
|
┌──────────┐ │ ingest │
|
||||||
|
│ StashDB │────────────────▶│ pipeline │──┐
|
||||||
|
└──────────┘ │ │ │ cross-source
|
||||||
|
┌──────────┐ performer- │ │ ▼ dedup +
|
||||||
|
│ ~25 tube │ driven ┌───▶│ │ ┌─────────┐
|
||||||
|
│ sites │ search │ └────────────┘ │ Postgres│
|
||||||
|
└──────────┘────────────┘ └─────────┘
|
||||||
|
│
|
||||||
|
┌─────────────────────────────────────────────┤
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ FastAPI │ │ Worker │
|
||||||
|
│ /scenes │◀────── on Watch click ───│ scheduler │
|
||||||
|
│ /performers │ resolve stream URL │ (APScheduler)│
|
||||||
|
│ /playback │ (yt-dlp / hoster └──────────────┘
|
||||||
|
└──────────────┘ packer)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Expo mobile │
|
||||||
|
│ (Android) │
|
||||||
|
└──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Key modules:
|
||||||
|
|
||||||
|
- [`app/connectors/`](./app/connectors/) — TPDB, StashDB, dooplay (movies),
|
||||||
|
paradisehill (movies), [`direct_scrapers/`](./app/connectors/direct_scrapers/)
|
||||||
|
(25 tube discovery scrapers).
|
||||||
|
- [`app/extractors/`](./app/extractors/) — stream URL resolution per tube.
|
||||||
|
yt-dlp wrapper + custom + generic embed-iframe + P.A.C.K.E.R. unpacker.
|
||||||
|
- [`app/resolve/`](./app/resolve/) — cross-source scene merging (phash, title
|
||||||
|
similarity, performer overlap, release date window).
|
||||||
|
- [`app/scheduler/`](./app/scheduler/) — APScheduler jobs +
|
||||||
|
[`performer_driven.py`](./app/scheduler/performer_driven.py) (the core
|
||||||
|
ingest strategy: completeness > recency).
|
||||||
|
- [`mobile/`](./mobile/) — Expo / React Native client.
|
||||||
|
|
||||||
|
## Tube coverage
|
||||||
|
|
||||||
|
Discovery + stream resolution registered for ~33 sources:
|
||||||
|
|
||||||
|
**Mainstream tubes:** pornhub, redtube, xhamster, xvideos, xnxx, youporn,
|
||||||
|
eporner, hqporner, sxyprn, porntrex, pornhat.
|
||||||
|
|
||||||
|
**Aggregators / mirrors:** xmoviesforyou, watchporn, siska, porn4days,
|
||||||
|
porndish, xxxfreewatch, latestleaks, latestpornvideo, mypornerleak,
|
||||||
|
porndittcom, hdporn92, sxyland, 0dayxx, perverzija, fpoxxx, porn00, pornxp,
|
||||||
|
hdporngg, fullmovies, freshporno, shyfap.
|
||||||
|
|
||||||
|
**Movie sites:** paradisehill (primary) + dooplay mirrors (mangoporn,
|
||||||
|
streamporn, pandamovies).
|
||||||
|
|
||||||
|
If you want to add another tube, see [CONTRIBUTING.md](./CONTRIBUTING.md).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Support the project
|
||||||
|
|
||||||
|
Goon is free, open-source, and ad-free. It stays that way because donations
|
||||||
|
cover the VPS, the TPDB/StashDB tokens, and the time. **Crypto only** —
|
||||||
|
mainstream processors refuse adult projects, even FOSS tooling.
|
||||||
|
|
||||||
|
In-app: **Scenes → ♥** opens a screen with QR codes for Monero, Bitcoin, and
|
||||||
|
USDT (TRC-20).
|
||||||
|
|
||||||
|
Addresses are hardcoded in
|
||||||
|
[`mobile/src/lib/donate.ts`](./mobile/src/lib/donate.ts) so a compromised
|
||||||
|
server cannot swap them mid-donation. Verify the value on-screen against the
|
||||||
|
copy in this repo before sending.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Roadmap
|
||||||
|
|
||||||
|
Near-term:
|
||||||
|
|
||||||
|
- Browse-by-performer + sort-by-studio
|
||||||
|
- Multi-tag filter (AND / OR)
|
||||||
|
- Continue-watching rail (position sync across devices)
|
||||||
|
- Stash local-server bridge — sync favorites/watchlist with a self-hosted Stash
|
||||||
|
- iOS sideload via TestFlight invite
|
||||||
|
|
||||||
|
Mid-term:
|
||||||
|
|
||||||
|
- Web companion (read-only browser frontend over the same API)
|
||||||
|
- BTCPay Server invoicing for one-time / recurring donations
|
||||||
|
- Performer-alert notifications (server push when a favorited performer drops a new scene)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT — see [LICENSE](./LICENSE).
|
||||||
45
alembic.ini
Normal file
45
alembic.ini
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
[alembic]
|
||||||
|
script_location = alembic
|
||||||
|
prepend_sys_path = .
|
||||||
|
version_path_separator = os
|
||||||
|
file_template = %%(year)d%%(month).2d%%(day).2d_%%(hour).2d%%(minute).2d_%%(rev)s_%%(slug)s
|
||||||
|
|
||||||
|
[post_write_hooks]
|
||||||
|
hooks = ruff
|
||||||
|
ruff.type = console_scripts
|
||||||
|
ruff.entrypoint = ruff
|
||||||
|
ruff.options = format REVISION_SCRIPT_FILENAME
|
||||||
|
|
||||||
|
[loggers]
|
||||||
|
keys = root,sqlalchemy,alembic
|
||||||
|
|
||||||
|
[handlers]
|
||||||
|
keys = console
|
||||||
|
|
||||||
|
[formatters]
|
||||||
|
keys = generic
|
||||||
|
|
||||||
|
[logger_root]
|
||||||
|
level = WARN
|
||||||
|
handlers = console
|
||||||
|
qualname =
|
||||||
|
|
||||||
|
[logger_sqlalchemy]
|
||||||
|
level = WARN
|
||||||
|
handlers =
|
||||||
|
qualname = sqlalchemy.engine
|
||||||
|
|
||||||
|
[logger_alembic]
|
||||||
|
level = INFO
|
||||||
|
handlers =
|
||||||
|
qualname = alembic
|
||||||
|
|
||||||
|
[handler_console]
|
||||||
|
class = StreamHandler
|
||||||
|
args = (sys.stderr,)
|
||||||
|
level = NOTSET
|
||||||
|
formatter = generic
|
||||||
|
|
||||||
|
[formatter_generic]
|
||||||
|
format = %(levelname)-5.5s [%(name)s] %(message)s
|
||||||
|
datefmt = %H:%M:%S
|
||||||
52
alembic/env.py
Normal file
52
alembic/env.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
from logging.config import fileConfig
|
||||||
|
|
||||||
|
from alembic import context
|
||||||
|
from sqlalchemy import engine_from_config, pool
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.models import Base
|
||||||
|
|
||||||
|
# Alembic Config object backing the ini file currently in use.
config = context.config

# Apply the logging configuration from the ini file, when one was given.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

settings = get_settings()
# `set_main_option` stores the value in a ConfigParser, where a bare "%" is
# interpolation syntax. Double it so URLs carrying percent-encoded credentials
# (e.g. "p%40ss") are accepted and come back unchanged from `get_main_option`
# / `get_section`.
config.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))

# Metadata the migration context is configured with (`target_metadata=`).
target_metadata = Base.metadata
|
||||||
|
|
||||||
|
|
||||||
|
def run_migrations_offline() -> None:
    """Run the migrations in offline mode.

    The context is configured from the ``sqlalchemy.url`` option alone (no
    engine or connection is created here), with bound parameters rendered as
    literals and column type comparison enabled.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
||||||
|
|
||||||
|
|
||||||
|
def run_migrations_online() -> None:
    """Run the migrations over a live database connection.

    An engine is built from the ``sqlalchemy.``-prefixed options of the active
    ini section using ``NullPool``; the migrations then run inside a
    transaction on a connection obtained from that engine.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as conn:
        context.configure(
            connection=conn,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the migrations in whichever mode the Alembic context reports.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
||||||
3
alembic/init/00_extensions.sql
Normal file
3
alembic/init/00_extensions.sql
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||||
|
CREATE EXTENSION IF NOT EXISTS unaccent;
|
||||||
|
CREATE EXTENSION IF NOT EXISTS pgcrypto;
|
||||||
25
alembic/script.py.mako
Normal file
25
alembic/script.py.mako
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
"""${message}
|
||||||
|
|
||||||
|
Revision ID: ${up_revision}
|
||||||
|
Revises: ${down_revision | comma,n}
|
||||||
|
Create Date: ${create_date}
|
||||||
|
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
${imports if imports else ""}
|
||||||
|
|
||||||
|
revision: str = ${repr(up_revision)}
|
||||||
|
down_revision: str | None = ${repr(down_revision)}
|
||||||
|
branch_labels: str | Sequence[str] | None = ${repr(branch_labels)}
|
||||||
|
depends_on: str | Sequence[str] | None = ${repr(depends_on)}
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
${upgrades if upgrades else "pass"}
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
${downgrades if downgrades else "pass"}
|
||||||
313
alembic/versions/20260502_0001_initial.py
Normal file
313
alembic/versions/20260502_0001_initial.py
Normal file
|
|
@ -0,0 +1,313 @@
|
||||||
|
"""initial schema
|
||||||
|
|
||||||
|
Revision ID: 0001_initial
|
||||||
|
Revises:
|
||||||
|
Create Date: 2026-05-02
|
||||||
|
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0001_initial"
|
||||||
|
down_revision: str | None = None
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _enum(name: str, *labels: str) -> postgresql.ENUM:
    """Build a named PostgreSQL ENUM with ``create_type=False``.

    `create_type=False` because each enum type is created once, explicitly, in
    `upgrade()`; reusing the same instance in several columns with
    `create_type=True` would try to create the type more than once.
    """
    return postgresql.ENUM(*labels, name=name, create_type=False)


SOURCE_KIND = _enum("source_kind", "tpdb", "stashdb", "scraper", "porn_app", "manual")
ENTITY_KIND = _enum("entity_kind", "scene", "performer", "studio", "tag")
PERFORMER_GENDER = _enum(
    "performer_gender",
    "female",
    "male",
    "transgender_female",
    "transgender_male",
    "non_binary",
    "intersex",
    "unknown",
)
FINGERPRINT_KIND = _enum("fingerprint_kind", "phash", "oshash", "md5")
MERGE_KIND = _enum("merge_kind", "scene", "performer", "studio")
MERGE_STATUS = _enum("merge_status", "pending", "auto_merged", "merged", "rejected")
INGEST_STATUS = _enum("ingest_status", "running", "success", "partial", "failed")
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the initial schema: extensions, enum types, tables and indexes.

    Statements are issued in dependency order (referenced tables first). The
    column definitions that repeat across tables are produced by the small
    local factories below; each call returns a fresh ``sa.Column``.
    """

    def uuid_pk() -> sa.Column:
        # Surrogate key generated server-side by gen_random_uuid().
        return sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
            server_default=sa.text("gen_random_uuid()"),
        )

    def uuid_fk(column: str, target_table: str, ondelete: str, **column_kw) -> sa.Column:
        # UUID column referencing `<target_table>.id` with the given ON DELETE rule.
        return sa.Column(
            column,
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{target_table}.id", ondelete=ondelete),
            **column_kw,
        )

    def stamp(column: str) -> sa.Column:
        # NOT NULL timestamptz defaulting to NOW().
        return sa.Column(
            column,
            sa.DateTime(timezone=True),
            server_default=sa.text("NOW()"),
            nullable=False,
        )

    def index_on(table: str, column: str) -> None:
        # Plain single-column index named ix_<table>_<column>.
        op.create_index(f"ix_{table}_{column}", table, [column])

    def trigram_index_on(table: str, column: str) -> None:
        # GIN index using gin_trgm_ops, named ix_<table>_<column>_trgm.
        op.execute(
            f"CREATE INDEX ix_{table}_{column}_trgm ON {table} "
            f"USING GIN ({column} gin_trgm_ops);"
        )

    for extension in ("pg_trgm", "unaccent", "pgcrypto"):
        op.execute(f"CREATE EXTENSION IF NOT EXISTS {extension};")

    # The enum objects are declared with create_type=False, so create them here.
    bind = op.get_bind()
    for enum_type in (
        SOURCE_KIND,
        ENTITY_KIND,
        PERFORMER_GENDER,
        FINGERPRINT_KIND,
        MERGE_KIND,
        MERGE_STATUS,
        INGEST_STATUS,
    ):
        enum_type.create(bind, checkfirst=True)

    op.create_table(
        "sources",
        uuid_pk(),
        sa.Column("kind", SOURCE_KIND, nullable=False),
        sa.Column("name", sa.String(128), nullable=False, unique=True),
        sa.Column("base_url", sa.String(512)),
        sa.Column("auth_secret_ref", sa.String(128)),
        sa.Column("weight", sa.Float, nullable=False, server_default="1.0"),
        stamp("created_at"),
        stamp("updated_at"),
    )

    op.create_table(
        "studios",
        uuid_pk(),
        sa.Column("name", sa.String(256), nullable=False),
        sa.Column("name_normalized", sa.String(256), nullable=False),
        sa.Column("slug", sa.String(256), nullable=False, unique=True),
        uuid_fk("parent_studio_id", "studios", "SET NULL"),
        sa.Column("network", sa.String(256)),
        sa.Column("homepage_url", sa.String(512)),
        stamp("created_at"),
        stamp("updated_at"),
    )
    index_on("studios", "name_normalized")
    trigram_index_on("studios", "name_normalized")

    op.create_table(
        "studio_aliases",
        uuid_pk(),
        uuid_fk("studio_id", "studios", "CASCADE", nullable=False),
        sa.Column("alias", sa.String(256), nullable=False),
        sa.Column("alias_normalized", sa.String(256), nullable=False),
        uuid_fk("source_id", "sources", "SET NULL"),
        sa.UniqueConstraint(
            "studio_id",
            "alias_normalized",
            name="uq_studio_aliases_studio_id_alias_normalized",
        ),
    )
    index_on("studio_aliases", "studio_id")
    index_on("studio_aliases", "alias_normalized")
    trigram_index_on("studio_aliases", "alias_normalized")

    op.create_table(
        "studio_external_refs",
        uuid_fk("source_id", "sources", "CASCADE", primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        uuid_fk("studio_id", "studios", "CASCADE", nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        stamp("first_seen"),
        stamp("last_seen"),
    )
    index_on("studio_external_refs", "studio_id")

    op.create_table(
        "performers",
        uuid_pk(),
        sa.Column("canonical_name", sa.String(256), nullable=False),
        sa.Column("name_normalized", sa.String(256), nullable=False),
        sa.Column("slug", sa.String(256), nullable=False, unique=True),
        sa.Column("gender", PERFORMER_GENDER),
        sa.Column("birth_date", sa.Date),
        sa.Column("country", sa.String(64)),
        stamp("created_at"),
        stamp("updated_at"),
    )
    index_on("performers", "name_normalized")
    trigram_index_on("performers", "name_normalized")

    op.create_table(
        "performer_aliases",
        uuid_pk(),
        uuid_fk("performer_id", "performers", "CASCADE", nullable=False),
        sa.Column("alias", sa.String(256), nullable=False),
        sa.Column("alias_normalized", sa.String(256), nullable=False),
        uuid_fk("source_id", "sources", "SET NULL"),
        sa.UniqueConstraint(
            "performer_id",
            "alias_normalized",
            name="uq_performer_aliases_performer_id_alias_normalized",
        ),
    )
    index_on("performer_aliases", "performer_id")
    index_on("performer_aliases", "alias_normalized")
    trigram_index_on("performer_aliases", "alias_normalized")

    op.create_table(
        "performer_external_refs",
        uuid_fk("source_id", "sources", "CASCADE", primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        uuid_fk("performer_id", "performers", "CASCADE", nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        stamp("first_seen"),
        stamp("last_seen"),
    )
    index_on("performer_external_refs", "performer_id")

    op.create_table(
        "tags",
        uuid_pk(),
        sa.Column("name", sa.String(128), nullable=False),
        sa.Column("slug", sa.String(128), nullable=False, unique=True),
        uuid_fk("parent_tag_id", "tags", "SET NULL"),
        sa.Column("description", sa.String(1024)),
        stamp("created_at"),
        stamp("updated_at"),
    )

    op.create_table(
        "scenes",
        uuid_pk(),
        sa.Column("title", sa.String(512), nullable=False),
        sa.Column("title_normalized", sa.String(512), nullable=False),
        sa.Column("slug", sa.String(512)),
        sa.Column("release_date", sa.Date),
        uuid_fk("studio_id", "studios", "SET NULL"),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("description", sa.Text),
        sa.Column("code", sa.String(128)),
        sa.Column("director", sa.String(256)),
        stamp("created_at"),
        stamp("updated_at"),
    )
    index_on("scenes", "title_normalized")
    trigram_index_on("scenes", "title_normalized")
    index_on("scenes", "release_date")
    index_on("scenes", "slug")
    index_on("scenes", "studio_id")
    index_on("scenes", "code")
    op.create_index("ix_scenes_studio_release_date", "scenes", ["studio_id", "release_date"])

    op.create_table(
        "scene_external_refs",
        uuid_fk("source_id", "sources", "CASCADE", primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        uuid_fk("scene_id", "scenes", "CASCADE", nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("url", sa.String(1024)),
        stamp("first_seen"),
        stamp("last_seen"),
    )
    index_on("scene_external_refs", "scene_id")

    op.create_table(
        "scene_fingerprints",
        uuid_pk(),
        uuid_fk("scene_id", "scenes", "CASCADE", nullable=False),
        sa.Column("kind", FINGERPRINT_KIND, nullable=False),
        sa.Column("value", sa.String(128), nullable=False),
        uuid_fk("source_id", "sources", "SET NULL"),
        sa.UniqueConstraint(
            "scene_id",
            "kind",
            "value",
            name="uq_scene_fingerprints_scene_id_kind_value",
        ),
    )
    index_on("scene_fingerprints", "scene_id")
    index_on("scene_fingerprints", "value")

    op.create_table(
        "scene_performers",
        uuid_fk("scene_id", "scenes", "CASCADE", primary_key=True),
        uuid_fk("performer_id", "performers", "CASCADE", primary_key=True),
        sa.Column("role", sa.String(64)),
        sa.Column("position", sa.Integer),
        sa.Column("as_alias", sa.String(256)),
    )

    op.create_table(
        "scene_tags",
        uuid_fk("scene_id", "scenes", "CASCADE", primary_key=True),
        uuid_fk("tag_id", "tags", "CASCADE", primary_key=True),
        uuid_fk("source_id", "sources", "SET NULL"),
    )

    op.create_table(
        "external_records",
        uuid_pk(),
        uuid_fk("source_id", "sources", "CASCADE", nullable=False),
        sa.Column("entity_kind", ENTITY_KIND, nullable=False),
        sa.Column("external_id", sa.String(256), nullable=False),
        sa.Column("raw", postgresql.JSONB, nullable=False),
        sa.Column("raw_hash", sa.LargeBinary(32), nullable=False),
        stamp("fetched_at"),
        stamp("last_seen_at"),
        sa.UniqueConstraint(
            "source_id",
            "entity_kind",
            "external_id",
            name="uq_external_records_source_id_entity_kind_external_id",
        ),
    )
    index_on("external_records", "source_id")

    op.create_table(
        "merge_candidates",
        uuid_pk(),
        sa.Column("kind", MERGE_KIND, nullable=False),
        # left_id / right_id are plain UUIDs with no foreign key.
        sa.Column("left_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("right_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("score", sa.Float, nullable=False),
        sa.Column("reasons", postgresql.JSONB, nullable=False, server_default="{}"),
        sa.Column("status", MERGE_STATUS, nullable=False, server_default="pending"),
        stamp("created_at"),
        sa.Column("resolved_at", sa.DateTime(timezone=True)),
        sa.Column("resolved_by", sa.String(128)),
    )
    index_on("merge_candidates", "left_id")
    index_on("merge_candidates", "right_id")
    index_on("merge_candidates", "status")

    op.create_table(
        "ingest_runs",
        uuid_pk(),
        uuid_fk("source_id", "sources", "CASCADE", nullable=False),
        stamp("started_at"),
        sa.Column("finished_at", sa.DateTime(timezone=True)),
        sa.Column("status", INGEST_STATUS, nullable=False, server_default="running"),
        sa.Column("records_seen", sa.Integer, nullable=False, server_default="0"),
        sa.Column("records_new", sa.Integer, nullable=False, server_default="0"),
        sa.Column("records_updated", sa.Integer, nullable=False, server_default="0"),
        sa.Column("errors", postgresql.JSONB),
    )
    index_on("ingest_runs", "source_id")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop every table and enum type created by ``upgrade``.

    Tables are dropped in the reverse of their creation order, then the enum
    types. The PostgreSQL extensions are left installed.
    """
    tables_in_drop_order = (
        "ingest_runs",
        "merge_candidates",
        "external_records",
        "scene_tags",
        "scene_performers",
        "scene_fingerprints",
        "scene_external_refs",
        "scenes",
        "tags",
        "performer_external_refs",
        "performer_aliases",
        "performers",
        "studio_external_refs",
        "studio_aliases",
        "studios",
        "sources",
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)

    bind = op.get_bind()
    for enum_type in (
        INGEST_STATUS,
        MERGE_STATUS,
        MERGE_KIND,
        FINGERPRINT_KIND,
        PERFORMER_GENDER,
        ENTITY_KIND,
        SOURCE_KIND,
    ):
        enum_type.drop(bind, checkfirst=True)
|
||||||
67
alembic/versions/20260502_0002_playback_sources.py
Normal file
67
alembic/versions/20260502_0002_playback_sources.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
"""playback_sources table for tube/aggregator video links
|
||||||
|
|
||||||
|
Revision ID: 0002_playback_sources
|
||||||
|
Revises: 0001_initial
|
||||||
|
Create Date: 2026-05-02
|
||||||
|
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0002_playback_sources"
|
||||||
|
down_revision: str | None = "0001_initial"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create ``playback_sources`` and its indexes.

    Each row belongs to a scene (rows are removed with the scene via ON DELETE
    CASCADE) and is unique per ``(origin, page_url)``.
    """

    def stamp(column: str) -> sa.Column:
        # NOT NULL timestamptz defaulting to NOW().
        return sa.Column(
            column,
            sa.DateTime(timezone=True),
            server_default=sa.text("NOW()"),
            nullable=False,
        )

    table = "playback_sources"
    primary_key = sa.Column(
        "id",
        postgresql.UUID(as_uuid=True),
        primary_key=True,
        server_default=sa.text("gen_random_uuid()"),
    )
    scene_ref = sa.Column(
        "scene_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("scenes.id", ondelete="CASCADE"),
        nullable=False,
    )

    op.create_table(
        table,
        primary_key,
        scene_ref,
        sa.Column("origin", sa.String(64), nullable=False),
        sa.Column("page_url", sa.String(2048), nullable=False),
        sa.Column("embed_url", sa.String(2048)),
        sa.Column("stream_url", sa.String(2048)),
        sa.Column("quality", sa.String(16)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("thumbnail_url", sa.String(2048)),
        stamp("last_seen_at"),
        stamp("created_at"),
        stamp("updated_at"),
        sa.UniqueConstraint("origin", "page_url", name="uq_playback_sources_origin_page_url"),
    )
    for indexed_column in ("scene_id", "origin"):
        op.create_index(f"ix_playback_sources_{indexed_column}", table, [indexed_column])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the ``playback_sources`` table created by ``upgrade``."""
    table = "playback_sources"
    op.drop_table(table)
|
||||||
41
alembic/versions/20260503_0003_playback_dead.py
Normal file
41
alembic/versions/20260503_0003_playback_dead.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""playback_sources.dead_at + dead_reason — flagging dead tube links
|
||||||
|
|
||||||
|
Revision ID: 0003_playback_dead
|
||||||
|
Revises: 0002_playback_sources
|
||||||
|
Create Date: 2026-05-03
|
||||||
|
|
||||||
|
Gdy resolve endpoint dostanie 404 "Video is offline" / "deleted" z porn-app,
|
||||||
|
oznaczamy ten playback_source jako martwy. API filtruje go z `_build_scene_out`,
|
||||||
|
mobile go nie pokazuje. has_playback=true filter też wymaga `dead_at IS NULL`.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0003_playback_dead"
|
||||||
|
down_revision: str | None = "0002_playback_sources"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add nullable ``dead_at`` / ``dead_reason`` to ``playback_sources``.

    ``dead_at`` is also indexed.
    """
    table = "playback_sources"
    new_columns = (
        sa.Column("dead_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("dead_reason", sa.String(length=512), nullable=True),
    )
    for column in new_columns:
        op.add_column(table, column)

    op.create_index("ix_playback_sources_dead_at", table, ["dead_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Remove the ``dead_at`` index, then the ``dead_reason`` and ``dead_at`` columns."""
    table = "playback_sources"
    op.drop_index("ix_playback_sources_dead_at", table_name=table)
    for column_name in ("dead_reason", "dead_at"):
        op.drop_column(table, column_name)
|
||||||
34
alembic/versions/20260504_0004_animated_thumbnail.py
Normal file
34
alembic/versions/20260504_0004_animated_thumbnail.py
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
"""playback_sources.animated_thumbnail_url — animowane miniaturki dla hold-to-preview
|
||||||
|
|
||||||
|
Revision ID: 0004_animated_thumbnail
|
||||||
|
Revises: 0003_playback_dead
|
||||||
|
Create Date: 2026-05-04
|
||||||
|
|
||||||
|
Mobile (`ScenesScreen`, `MergeQueueScreen`) ma hold-to-preview: po przytrzymaniu kciuka
|
||||||
|
na thumbie pokazuje animowany webp/gif zamiast statycznego obrazka. Pole jest opcjonalne —
|
||||||
|
nie każde źródło tube je dostarcza; jeśli null → mobile fallbackuje do `thumbnail_url`.
|
||||||
|
|
||||||
|
Bez tej kolumny endpointy które ją zwracały (admin merge-candidates, scene detail) musiały
|
||||||
|
być sztucznie ograniczane (vide DEPLOY_BACKLOG.md). Po tej migracji można wrócić do
|
||||||
|
pełnej projekcji w `app/api/admin.py`.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0004_animated_thumbnail"
|
||||||
|
down_revision: str | None = "0003_playback_dead"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add the optional ``animated_thumbnail_url`` column to ``playback_sources``."""
    animated_thumbnail = sa.Column(
        "animated_thumbnail_url",
        sa.String(length=2048),
        nullable=True,
    )
    op.add_column("playback_sources", animated_thumbnail)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop ``playback_sources.animated_thumbnail_url``."""
    table, column_name = "playback_sources", "animated_thumbnail_url"
    op.drop_column(table, column_name)
|
||||||
38
alembic/versions/20260504_0005_favorite_performers.py
Normal file
38
alembic/versions/20260504_0005_favorite_performers.py
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
"""favorite_performers — ulubione performerki (single-user, in-app)
|
||||||
|
|
||||||
|
Revision ID: 0005_favorite_performers
|
||||||
|
Revises: 0004_animated_thumbnail
|
||||||
|
Create Date: 2026-05-04
|
||||||
|
|
||||||
|
Single-user system (brak users), więc tabelka to po prostu zbiór performer_id które
|
||||||
|
user oznaczył jako ulubione, plus `last_seen_at` żeby mobile mogło policzyć ile nowych
|
||||||
|
scen pojawiło się od ostatniego oglądania (badge w toolbar/Favorites screen).
|
||||||
|
|
||||||
|
Multi-user można dodać potem (kolumna user_id + composite PK), bez breaking change.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0005_favorite_performers"
|
||||||
|
down_revision: str | None = "0004_animated_thumbnail"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create ``favorite_performers``.

    One row per favorited performer, keyed by ``performer_id`` (removed with
    the performer via ON DELETE CASCADE). ``created_at`` and ``last_seen_at``
    are NOT NULL and default to ``now()``.
    """
    # Import the dialect explicitly: `sa.dialects.postgresql` only resolves if
    # some other module has already imported that submodule.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_performers",
        sa.Column(
            "performer_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("performers.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the ``favorite_performers`` table created by ``upgrade``."""
    table = "favorite_performers"
    op.drop_table(table)
|
||||||
41
alembic/versions/20260504_0006_blacklists.py
Normal file
41
alembic/versions/20260504_0006_blacklists.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""Blacklists — performers/studios/tags do globalnego ukrywania.
|
||||||
|
|
||||||
|
Revision ID: 0006_blacklists
|
||||||
|
Revises: 0005_favorite_performers
|
||||||
|
Create Date: 2026-05-04
|
||||||
|
|
||||||
|
Single-user; analogicznie do favorite_performers ale negative — sceny które MAJĄ
|
||||||
|
blacklisted performer / studio / tag są wykluczane ze wszystkich list (scenes,
|
||||||
|
search, performer/tag scenes).
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0006_blacklists"
|
||||||
|
down_revision: str | None = "0005_favorite_performers"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the three blacklist tables (performers, studios, tags).

    Each table has the same shape: a UUID primary key that is also a foreign
    key to the parent table's ``id`` (``ON DELETE CASCADE``), plus a
    ``created_at`` timestamp defaulting to the database's current time.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    # (blacklist table, parent table, FK/PK column name)
    blacklist_specs = [
        ("blacklisted_performers", "performers", "performer_id"),
        ("blacklisted_studios", "studios", "studio_id"),
        ("blacklisted_tags", "tags", "tag_id"),
    ]
    for tbl, parent_tbl, parent_col in blacklist_specs:
        op.create_table(
            tbl,
            sa.Column(
                parent_col,
                postgresql.UUID(as_uuid=True),
                sa.ForeignKey(f"{parent_tbl}.id", ondelete="CASCADE"),
                primary_key=True,
            ),
            sa.Column(
                "created_at",
                sa.DateTime(timezone=True),
                server_default=sa.func.now(),
                nullable=False,
            ),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the blacklist tables in the reverse order of their creation."""
    tables_newest_first = (
        "blacklisted_tags",
        "blacklisted_studios",
        "blacklisted_performers",
    )
    for table_name in tables_newest_first:
        op.drop_table(table_name)
|
||||||
42
alembic/versions/20260504_0007_play_progress.py
Normal file
42
alembic/versions/20260504_0007_play_progress.py
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
"""scene_play_progress — pozycja odtwarzania per scena (continue watching).
|
||||||
|
|
||||||
|
Revision ID: 0007_play_progress
|
||||||
|
Revises: 0006_blacklists
|
||||||
|
Create Date: 2026-05-04
|
||||||
|
|
||||||
|
Single-user; tabela trzyma ostatnio oglądane sceny + (gdy player zwróci) pozycję
|
||||||
|
w sekundach. Continue watching rail na home pobiera top-N ostatnich.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0007_play_progress"
|
||||||
|
down_revision: str | None = "0006_blacklists"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create ``scene_play_progress`` and an index on ``last_played_at``.

    One row per scene (``scene_id`` is the primary key and a cascading foreign
    key to ``scenes.id``). ``position_sec`` defaults to 0, ``duration_sec`` is
    optional, ``finished`` defaults to false and ``last_played_at`` defaults to
    the database's current time.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "scene_play_progress",
        sa.Column(
            "scene_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("scenes.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("position_sec", sa.Integer(), nullable=False, server_default="0"),
        sa.Column("duration_sec", sa.Integer(), nullable=True),
        sa.Column("finished", sa.Boolean(), nullable=False, server_default=sa.false()),
        sa.Column(
            "last_played_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
    # Per the module docstring, the "continue watching" rail reads the most
    # recently played rows, hence the index on last_played_at.
    op.create_index(
        "ix_scene_play_progress_last_played_at",
        "scene_play_progress",
        ["last_played_at"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Remove the ``last_played_at`` index, then the progress table itself."""
    progress_table = "scene_play_progress"
    op.drop_index("ix_scene_play_progress_last_played_at", table_name=progress_table)
    op.drop_table(progress_table)
|
||||||
42
alembic/versions/20260506_0008_performer_search_meta.py
Normal file
42
alembic/versions/20260506_0008_performer_search_meta.py
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
"""Performer.last_searched_at + search_run_count — backfill queue dla per-performer search.
|
||||||
|
|
||||||
|
Revision ID: 0008_performer_search_meta
|
||||||
|
Revises: 0007_play_progress
|
||||||
|
Create Date: 2026-05-06
|
||||||
|
|
||||||
|
Continuous worker iteruje performerów ORDER BY last_searched_at NULLS FIRST,
|
||||||
|
search_run_count ASC. Performerów którzy nigdy nie byli searchowani idą pierwsi.
|
||||||
|
Po pełnym sweep'ie kolejka cyklicznie wraca do najstarszych.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0008_performer_search_meta"
|
||||||
|
down_revision: str | None = "0007_play_progress"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add search bookkeeping columns to ``performers`` plus a queue index.

    ``last_searched_at`` is nullable; ``search_run_count`` is a non-null
    integer defaulting to 0. The index orders by ``last_searched_at`` with
    NULLs first, then by ``search_run_count`` ascending.
    """
    new_columns = (
        sa.Column("last_searched_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("search_run_count", sa.Integer(), nullable=False, server_default="0"),
    )
    for column in new_columns:
        op.add_column("performers", column)

    # Queue index: NULLS FIRST + search_run_count ASC. A PostgreSQL btree puts
    # NULLs first by default only for DESC; ASC defaults to NULLS LAST, so the
    # ordering is spelled out explicitly in raw SQL.
    create_queue_index = (
        "CREATE INDEX ix_performers_search_priority "
        "ON performers (last_searched_at ASC NULLS FIRST, search_run_count ASC)"
    )
    op.execute(create_queue_index)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the queue index, then both search bookkeeping columns."""
    performers = "performers"
    op.drop_index("ix_performers_search_priority", table_name=performers)
    for column_name in ("search_run_count", "last_searched_at"):
        op.drop_column(performers, column_name)
|
||||||
146
alembic/versions/20260506_0009_movies.py
Normal file
146
alembic/versions/20260506_0009_movies.py
Normal file
|
|
@ -0,0 +1,146 @@
|
||||||
|
"""movies kanon + bliźniacze tabele do scen
|
||||||
|
|
||||||
|
Revision ID: 0009_movies
|
||||||
|
Revises: 0008_performer_search_meta
|
||||||
|
Create Date: 2026-05-06
|
||||||
|
|
||||||
|
Schema dla full-length adult films (paradisehill + mirrory). Movies różnią się od
|
||||||
|
scen: 60-180min runtime, multi-chapter struktura, więcej metadanych (director,
|
||||||
|
year, country, rating). Performers/studios/tags reusable (te same osoby/studia
|
||||||
|
występują w scenach i w filmach).
|
||||||
|
|
||||||
|
Nowe entity_kind: 'movie'. Nowe merge_kind: 'movie'. Movie-fingerprints rzadko
|
||||||
|
istnieją (movies nie mają standardowego pHash w industry), więc fingerprint table
|
||||||
|
pomijamy — dedup pójdzie po composite key (title+year+studio+cast Jaccard).
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0009_movies"
|
||||||
|
down_revision: str | None = "0008_performer_search_meta"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the movie schema: ``movies`` plus its satellite tables.

    Steps, in order:
      1. add the value ``'movie'`` to the ``entity_kind`` and ``merge_kind`` enums;
      2. create ``movies`` and its indexes (including a trigram GIN index);
      3. create ``movie_external_refs``, ``movie_performers``, ``movie_tags``;
      4. create ``movie_chapters`` and ``movie_playback_sources`` with indexes.
    """
    # Extend the enums with 'movie'.
    # NOTE(review): ALTER TYPE ... ADD VALUE has transaction restrictions on
    # some PostgreSQL versions — confirm the supported server versions.
    op.execute("ALTER TYPE entity_kind ADD VALUE IF NOT EXISTS 'movie'")
    op.execute("ALTER TYPE merge_kind ADD VALUE IF NOT EXISTS 'movie'")

    op.create_table(
        "movies",
        # NOTE(review): gen_random_uuid() must be available on the server
        # (built in on newer PostgreSQL, otherwise via an extension) — confirm.
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("title", sa.String(512), nullable=False),
        sa.Column("title_normalized", sa.String(512), nullable=False),
        sa.Column("slug", sa.String(512)),
        sa.Column("release_year", sa.Integer),
        sa.Column("release_date", sa.Date),
        sa.Column("studio_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("studios.id", ondelete="SET NULL")),
        sa.Column("director", sa.String(256)),
        sa.Column("country", sa.String(64)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("description", sa.Text),
        sa.Column("poster_url", sa.String(2048)),
        sa.Column("backdrop_url", sa.String(2048)),
        # Rating as a float (paradisehill has like_count + a 0-10 rating; we
        # keep the averaged rating from the primary source, if available).
        sa.Column("rating", sa.Float),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_movies_title_normalized", "movies", ["title_normalized"])
    # Trigram index, written as raw SQL to name the gin_trgm_ops operator class.
    # NOTE(review): presumably pg_trgm is installed by an earlier migration — confirm.
    op.execute(
        "CREATE INDEX ix_movies_title_normalized_trgm ON movies "
        "USING GIN (title_normalized gin_trgm_ops);"
    )
    op.create_index("ix_movies_release_year", "movies", ["release_year"])
    op.create_index("ix_movies_release_date", "movies", ["release_date"])
    op.create_index("ix_movies_slug", "movies", ["slug"])
    op.create_index("ix_movies_studio_id", "movies", ["studio_id"])
    op.create_index("ix_movies_studio_year", "movies", ["studio_id", "release_year"])

    # Maps a (source, external id) pair — the composite primary key — to a movie.
    op.create_table(
        "movie_external_refs",
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("external_id", sa.String(256), primary_key=True),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("confidence", sa.Float, nullable=False, server_default="1.0"),
        sa.Column("url", sa.String(1024)),
        sa.Column("first_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("last_seen", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
    )
    op.create_index("ix_movie_external_refs_movie_id", "movie_external_refs", ["movie_id"])

    # Movie <-> performer association; primary key is (movie_id, performer_id).
    op.create_table(
        "movie_performers",
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("performer_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("performers.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("role", sa.String(64)),
        sa.Column("position", sa.Integer),
        sa.Column("as_alias", sa.String(256)),
    )

    # Movie <-> tag association; primary key is (movie_id, tag_id).
    op.create_table(
        "movie_tags",
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("tag_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("tags.id", ondelete="CASCADE"), primary_key=True),
        sa.Column("source_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("sources.id", ondelete="SET NULL")),
    )

    # Chapters — an optional table for movies split into scenes/segments
    # (paradisehill sometimes has timestamp markers, e.g. "Scene 1: 00:00-15:32").
    # Each chapter MAY link to an existing Scene (if that scene is also known
    # on its own from TPDB/StashDB), or live only as an anchor inside the movie.
    op.create_table(
        "movie_chapters",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("chapter_index", sa.Integer, nullable=False),
        sa.Column("title", sa.String(512)),
        sa.Column("start_sec", sa.Integer),
        sa.Column("end_sec", sa.Integer),
        sa.Column("scene_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("scenes.id", ondelete="SET NULL")),
        sa.UniqueConstraint("movie_id", "chapter_index", name="uq_movie_chapters_movie_id_chapter_index"),
    )
    op.create_index("ix_movie_chapters_movie_id", "movie_chapters", ["movie_id"])

    # Playback sources for movies — analogous to playback_sources, but a
    # separate table because we do not want to mix scene_id/movie_id in one FK
    # column. Reuses the origin convention ('paradisehill',
    # 'psyplay:streamporn', 'wp_movies:speedporn', etc.).
    op.create_table(
        "movie_playback_sources",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True, server_default=sa.text("gen_random_uuid()")),
        sa.Column("movie_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("movies.id", ondelete="CASCADE"), nullable=False),
        sa.Column("origin", sa.String(64), nullable=False),
        sa.Column("page_url", sa.String(2048), nullable=False),
        sa.Column("embed_url", sa.String(2048)),
        sa.Column("stream_url", sa.String(2048)),
        sa.Column("quality", sa.String(16)),
        sa.Column("duration_sec", sa.Integer),
        sa.Column("thumbnail_url", sa.String(2048)),
        sa.Column("animated_thumbnail_url", sa.String(2048)),
        sa.Column("last_seen_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("dead_at", sa.DateTime(timezone=True)),
        sa.Column("dead_reason", sa.String(512)),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("NOW()"), nullable=False),
        sa.UniqueConstraint("origin", "page_url", name="uq_movie_playback_sources_origin_page_url"),
    )
    op.create_index("ix_movie_playback_sources_movie_id", "movie_playback_sources", ["movie_id"])
    op.create_index("ix_movie_playback_sources_origin", "movie_playback_sources", ["origin"])
    op.create_index("ix_movie_playback_sources_dead_at", "movie_playback_sources", ["dead_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop every movie table, dependants first and ``movies`` last.

    The ``'movie'`` value added to ``entity_kind`` / ``merge_kind`` is left in
    place on purpose.
    """
    drop_order = (
        "movie_playback_sources",
        "movie_chapters",
        "movie_tags",
        "movie_performers",
        "movie_external_refs",
        "movies",
    )
    for table_name in drop_order:
        op.drop_table(table_name)
    # Postgres offers no simple way to remove a value from an enum, so 'movie'
    # stays in entity_kind / merge_kind. The cost is a few bytes per type in
    # the enum catalog, which is safer than attempting a DROP VALUE.
|
||||||
39
alembic/versions/20260506_0010_favorite_scenes.py
Normal file
39
alembic/versions/20260506_0010_favorite_scenes.py
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
"""favorite_scenes table
|
||||||
|
|
||||||
|
Revision ID: 0010_favorite_scenes
|
||||||
|
Revises: 0009_movies
|
||||||
|
Create Date: 2026-05-06
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0010_favorite_scenes"
|
||||||
|
down_revision: str | None = "0009_movies"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create ``favorite_scenes`` and an index on its ``created_at`` column.

    ``scene_id`` is the primary key and a cascading foreign key to
    ``scenes.id``; ``created_at`` defaults to ``NOW()`` on the server.
    """
    table_name = "favorite_scenes"
    scene_ref = sa.Column(
        "scene_id",
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("scenes.id", ondelete="CASCADE"),
        primary_key=True,
    )
    favorited_at = sa.Column(
        "created_at",
        sa.DateTime(timezone=True),
        server_default=sa.text("NOW()"),
        nullable=False,
    )
    op.create_table(table_name, scene_ref, favorited_at)
    op.create_index("ix_favorite_scenes_created_at", table_name, ["created_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Revert ``upgrade``: drop the ``created_at`` index, then the table."""
    # Dropped explicitly so the downgrade mirrors upgrade() step by step, the
    # same way the other index-creating migrations in this chain do.
    op.drop_index("ix_favorite_scenes_created_at", table_name="favorite_scenes")
    op.drop_table("favorite_scenes")
|
||||||
60
alembic/versions/20260507_0011_origin_pornapp_to_tube.py
Normal file
60
alembic/versions/20260507_0011_origin_pornapp_to_tube.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
"""playback_sources.origin: rename `pornapp:*` → `tube:*`
|
||||||
|
|
||||||
|
Revision ID: 0011_origin_pornapp_to_tube
|
||||||
|
Revises: 0010_favorite_scenes
|
||||||
|
Create Date: 2026-05-07
|
||||||
|
|
||||||
|
Po usunięciu zależności od porn-app.com API, prefix `pornapp:` w `playback_sources.origin`
|
||||||
|
jest myląca historyczna nazwa — discovery + stream resolve teraz idzie bezpośrednio przez
|
||||||
|
direct scrapery i `app.extractors`. Zmieniamy prefix na neutralny `tube:` żeby nazwa
|
||||||
|
odzwierciedlała architekturę (sitetag pozostaje bez zmian — `tube:hqpornercom` itd.).
|
||||||
|
|
||||||
|
Idempotent: WHERE klauzula zapobiega podwójnemu rename. Operuje też na
|
||||||
|
`movie_playback_sources` (analogiczna kolumna z M5 movies).
|
||||||
|
|
||||||
|
Backend `app/api/playback.py` rozumie oba prefixy (`pornapp:` legacy + `tube:`)
|
||||||
|
podczas okresu transition — po tej migracji można pozostawić tylko `tube:` sprawdzenie.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0011_origin_pornapp_to_tube"
|
||||||
|
down_revision: str | None = "0010_favorite_scenes"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Rewrite the ``pornapp:`` origin prefix to ``tube:`` in both tables.

    Only rows whose ``origin`` starts with ``pornapp:`` are touched, so running
    the statements again changes nothing. ``SUBSTRING(origin FROM 9)`` keeps
    everything after the 8-character ``pornapp:`` prefix.
    """
    for table in ("playback_sources", "movie_playback_sources"):
        rename_sql = (
            f"UPDATE {table} "
            "SET origin = 'tube:' || SUBSTRING(origin FROM 9) "
            "WHERE origin LIKE 'pornapp:%'"
        )
        op.execute(sa.text(rename_sql))
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Rewrite the ``tube:`` origin prefix back to ``pornapp:`` in both tables.

    Every row whose ``origin`` starts with ``tube:`` is renamed, regardless of
    whether it carried the ``pornapp:`` prefix before the upgrade.
    ``SUBSTRING(origin FROM 6)`` keeps everything after the 5-character
    ``tube:`` prefix.
    """
    for table in ("playback_sources", "movie_playback_sources"):
        restore_sql = (
            f"UPDATE {table} "
            "SET origin = 'pornapp:' || SUBSTRING(origin FROM 6) "
            "WHERE origin LIKE 'tube:%'"
        )
        op.execute(sa.text(restore_sql))
|
||||||
48
alembic/versions/20260508_0012_favorite_studios.py
Normal file
48
alembic/versions/20260508_0012_favorite_studios.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
"""favorite_studios — ulubione studia (single-user, in-app)
|
||||||
|
|
||||||
|
Revision ID: 0012_favorite_studios
|
||||||
|
Revises: 0011_origin_pornapp_to_tube
|
||||||
|
Create Date: 2026-05-08
|
||||||
|
|
||||||
|
Mirror `favorite_performers` ze studio_id zamiast performer_id. Single-user, więc
|
||||||
|
tabelka to po prostu zbiór studio_id które user oznaczył jako ulubione, plus
|
||||||
|
`last_seen_at` — mobile liczy ile nowych scen pojawiło się w danym studio od
|
||||||
|
ostatniego oglądania (badge w Favorites).
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0012_favorite_studios"
|
||||||
|
down_revision: str | None = "0011_origin_pornapp_to_tube"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the single-user ``favorite_studios`` table.

    ``studio_id`` is the primary key and a cascading foreign key to
    ``studios.id``. ``created_at`` and ``last_seen_at`` both default to the
    database's current time; per the module docstring, ``last_seen_at`` backs
    the "new scenes" badge in the mobile client.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_studios",
        sa.Column(
            "studio_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("studios.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Revert ``upgrade`` by dropping the ``favorite_studios`` table."""
    op.drop_table("favorite_studios")
|
||||||
52
alembic/versions/20260509_0013_bug_reports.py
Normal file
52
alembic/versions/20260509_0013_bug_reports.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
"""bug_reports — in-app bug reporting (mobile FAB → POST /bug-reports)
|
||||||
|
|
||||||
|
Revision ID: 0013_bug_reports
|
||||||
|
Revises: 0012_favorite_studios
|
||||||
|
Create Date: 2026-05-09
|
||||||
|
|
||||||
|
User wpisuje opis + appka kapturuje screen (react-native-view-shot omija
|
||||||
|
FLAG_SECURE) → wysyła POST. Backend trzyma w tabeli, admin_html ma listę.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0013_bug_reports"
|
||||||
|
down_revision: str | None = "0012_favorite_studios"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the ``bug_reports`` table and an index on ``created_at``.

    ``id`` has no server-side default, so the inserting code must supply it.
    ``scene_id`` is an optional foreign key to ``scenes.id`` that is set to
    NULL when the scene is deleted. ``message`` is required, ``screenshot_b64``
    is optional text, and ``resolved`` defaults to false.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "bug_reports",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column("screen_name", sa.String(64), nullable=True),
        sa.Column("app_version", sa.String(32), nullable=True),
        sa.Column(
            "scene_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("scenes.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("message", sa.Text, nullable=False),
        sa.Column("screenshot_b64", sa.Text, nullable=True),
        sa.Column("resolved", sa.Boolean, nullable=False, server_default=sa.false()),
    )
    op.create_index("ix_bug_reports_created_at", "bug_reports", ["created_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Remove the ``created_at`` index, then the ``bug_reports`` table."""
    reports_table = "bug_reports"
    op.drop_index("ix_bug_reports_created_at", table_name=reports_table)
    op.drop_table(reports_table)
|
||||||
46
alembic/versions/20260509_0014_favorite_movies.py
Normal file
46
alembic/versions/20260509_0014_favorite_movies.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
"""favorite_movies — single-user favorites + last_seen_at dla NEW badge.
|
||||||
|
|
||||||
|
Revision ID: 0014_favorite_movies
|
||||||
|
Revises: 0013_bug_reports
|
||||||
|
Create Date: 2026-05-09
|
||||||
|
|
||||||
|
Mirror `favorite_studios` z movie_id zamiast studio_id. NEW badge w mobile
|
||||||
|
liczone client-side: movie.created_at > favorite.last_seen_at.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0014_favorite_movies"
|
||||||
|
down_revision: str | None = "0013_bug_reports"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the single-user ``favorite_movies`` table.

    ``movie_id`` is the primary key and a cascading foreign key to
    ``movies.id``. ``created_at`` and ``last_seen_at`` both default to the
    database's current time; per the module docstring, the mobile client
    compares ``last_seen_at`` with ``movie.created_at`` for the NEW badge.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    op.create_table(
        "favorite_movies",
        sa.Column(
            "movie_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("movies.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Revert ``upgrade`` by dropping the ``favorite_movies`` table."""
    op.drop_table("favorite_movies")
|
||||||
38
alembic/versions/20260510_0015_bug_reports_movie_id.py
Normal file
38
alembic/versions/20260510_0015_bug_reports_movie_id.py
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
"""bug_reports — dodaj movie_id (FK movies, nullable)
|
||||||
|
|
||||||
|
Revision ID: 0015_bug_reports_movie_id
|
||||||
|
Revises: 0014_favorite_movies
|
||||||
|
Create Date: 2026-05-10
|
||||||
|
|
||||||
|
Mobile Player przekazuje movie_id w nav params jako `sceneId` (legacy hack na
|
||||||
|
progress tracking, który dla movies zwraca 404 i mobile to ignoruje). Bug-report
|
||||||
|
flow inserted to przy POST jako scene_id, FK violation crash → 500.
|
||||||
|
|
||||||
|
Fix: rozszerz tabelę o movie_id, backend smart-routes po lookup (jeśli scene_id
|
||||||
|
nie istnieje w scenes ALE istnieje w movies, zapisz jako movie_id).
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0015_bug_reports_movie_id"
|
||||||
|
down_revision: str | None = "0014_favorite_movies"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add a nullable ``movie_id`` foreign key to ``bug_reports``.

    The column references ``movies.id`` and is set to NULL when the movie is
    deleted. Per the module docstring, it lets the backend store reports filed
    from a movie instead of failing on the ``scene_id`` foreign key.
    """
    # Import the dialect explicitly: ``sa.dialects.postgresql`` only resolves
    # when some other module happens to have imported the subpackage already.
    from sqlalchemy.dialects import postgresql

    op.add_column(
        "bug_reports",
        sa.Column(
            "movie_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("movies.id", ondelete="SET NULL"),
            nullable=True,
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Revert ``upgrade`` by dropping ``bug_reports.movie_id``."""
    op.drop_column("bug_reports", "movie_id")
|
||||||
44
alembic/versions/20260512_0016_realdebrid_cache.py
Normal file
44
alembic/versions/20260512_0016_realdebrid_cache.py
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
"""realdebrid_cache — direct stream URL cache dla RD /unrestrict/link wyników
|
||||||
|
|
||||||
|
Revision ID: 0016_realdebrid_cache
|
||||||
|
Revises: 0015_bug_reports_movie_id
|
||||||
|
Create Date: 2026-05-12
|
||||||
|
|
||||||
|
RD direct linki technically valid ~7 dni, ale cache'ujemy 24h (configurable
|
||||||
|
przez RD_CACHE_TTL_HOURS) żeby oszczędzać API quota przy replay tej samej
|
||||||
|
sceny.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0016_realdebrid_cache"
|
||||||
|
down_revision: str | None = "0015_bug_reports_movie_id"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create ``realdebrid_cache`` and an index on ``expires_at``.

    The table is keyed by ``hoster_url`` and stores the resolved
    ``direct_url`` together with creation and expiry timestamps.
    """
    cache_table = "realdebrid_cache"
    cache_columns = [
        sa.Column("hoster_url", sa.Text(), primary_key=True),
        sa.Column("direct_url", sa.Text(), nullable=False),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
    ]
    op.create_table(cache_table, *cache_columns)
    op.create_index("ix_realdebrid_cache_expires_at", cache_table, ["expires_at"])
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Remove the ``expires_at`` index, then the ``realdebrid_cache`` table."""
    cache_table = "realdebrid_cache"
    op.drop_index("ix_realdebrid_cache_expires_at", table_name=cache_table)
    op.drop_table(cache_table)
|
||||||
37
alembic/versions/20260512_0017_drop_realdebrid_cache.py
Normal file
37
alembic/versions/20260512_0017_drop_realdebrid_cache.py
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
"""drop realdebrid_cache table — RD nie wykorzystywany (Hetzner IP blocked)
|
||||||
|
|
||||||
|
Revision ID: 0017_drop_realdebrid_cache
|
||||||
|
Revises: 0016_realdebrid_cache
|
||||||
|
Create Date: 2026-05-12
|
||||||
|
|
||||||
|
Real-Debrid integration cofnięta — Hetzner VPS IP blokowany globalnie przez
|
||||||
|
RD anti-abuse, a 95% relevantnych hosterów (streamtape/playmogo/dood/mixdrop/
|
||||||
|
filemoon/iceyfile) są DOWN lub UNSUPPORTED w RD list. Tylko voe.sx + file
|
||||||
|
hosters UP, nie pokrywa naszego streaming use case.
|
||||||
|
"""
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "0017_drop_realdebrid_cache"
|
||||||
|
down_revision: str | None = "0016_realdebrid_cache"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Drop the ``realdebrid_cache`` table along with its ``expires_at`` index."""
    cache_table = "realdebrid_cache"
    op.drop_index("ix_realdebrid_cache_expires_at", table_name=cache_table)
    op.drop_table(cache_table)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Recreate ``realdebrid_cache`` and its ``expires_at`` index.

    The table comes back empty with the columns ``hoster_url`` (primary key),
    ``direct_url``, ``created_at`` and ``expires_at``.
    """
    cache_table = "realdebrid_cache"
    cache_columns = [
        sa.Column("hoster_url", sa.Text(), primary_key=True),
        sa.Column("direct_url", sa.Text(), nullable=False),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("expires_at", sa.TIMESTAMP(timezone=True), nullable=False),
    ]
    op.create_table(cache_table, *cache_columns)
    op.create_index("ix_realdebrid_cache_expires_at", cache_table, ["expires_at"])
|
||||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
332
app/api/admin.py
Normal file
332
app/api/admin.py
Normal file
|
|
@ -0,0 +1,332 @@
|
||||||
|
"""Admin API: lista pending merge candidates + side-by-side detail + resolve."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated, Literal
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
from sqlalchemy import func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.scenes import _build_scene_out
|
||||||
|
from app.api.schemas import SceneOut
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.external_record import ExternalRecord
|
||||||
|
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
|
from app.models.scene import Scene, SceneExternalRef
|
||||||
|
from app.models.source import Source, SourceKind
|
||||||
|
from app.resolve.scene_merge import MergeError, resolve_candidate
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/admin",
|
||||||
|
tags=["admin"],
|
||||||
|
dependencies=[Depends(require_api_key)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _raw_to_thumb(raw: dict, kind: SourceKind) -> str | None:
    """Pick a thumbnail URL out of an ``external_records.raw`` payload.

    For TPDB the keys ``image`` and ``poster`` are tried in that order, then
    the first truthy entry among ``background.large`` / ``medium`` / ``full``.
    For StashDB the keys ``screenshot``, ``image`` and ``preview`` under
    ``paths`` are tried. A value only counts when it is a string starting with
    ``"http"``. Any other source kind, or no usable value, yields ``None``.
    """

    def _is_http_url(value: object) -> bool:
        return isinstance(value, str) and value.startswith("http")

    if kind == SourceKind.tpdb:
        for key in ("image", "poster"):
            candidate = raw.get(key)
            if _is_http_url(candidate):
                return candidate
        background = raw.get("background")
        if not isinstance(background, dict):
            return None
        # Only the first truthy size is considered; if it is not an http URL
        # the remaining sizes are not tried.
        candidate = background.get("large") or background.get("medium") or background.get("full")
        return candidate if _is_http_url(candidate) else None

    if kind == SourceKind.stashdb:
        # The StashDB scene response exposes images through a separate query,
        # which is not stored in raw at the moment.
        # TODO: mirror them into `paths.screenshot` at ingest time.
        paths = raw.get("paths")
        if not isinstance(paths, dict):
            return None
        for key in ("screenshot", "image", "preview"):
            candidate = paths.get(key)
            if _is_http_url(candidate):
                return candidate
        return None

    return None
|
||||||
|
|
||||||
|
|
||||||
|
# ---- schemas --------------------------------------------------------------
|
||||||
|
|
||||||
|
class MergeCandidateSummary(BaseModel):
    """One merge-candidate row as returned by the list endpoint.

    The title and thumbnail fields are optional preview data for the two sides
    of the pair; they stay ``None`` when nothing was found for that side.
    """

    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    kind: str  # MergeKind value
    left_id: uuid.UUID
    right_id: uuid.UUID
    score: float
    status: str  # MergeStatus value
    left_title: str | None = None
    right_title: str | None = None
    left_thumbnail_url: str | None = None
    left_animated_thumbnail_url: str | None = None
    right_thumbnail_url: str | None = None
    right_animated_thumbnail_url: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class MergeCandidateListOut(BaseModel):
    """One page of merge candidates plus the pagination values it was built with."""

    items: list[MergeCandidateSummary]
    total: int  # number of candidates matching the filters, across all pages
    page: int  # 1-based page number
    per_page: int
|
||||||
|
|
||||||
|
|
||||||
|
class MergeCandidateDetail(BaseModel):
    """A single merge candidate with both sides expanded for comparison."""

    id: uuid.UUID
    kind: str  # MergeKind value
    score: float
    status: str  # MergeStatus value
    reasons: dict
    left: SceneOut | None  # None when the side could not be loaded as a scene
    right: SceneOut | None  # None when the side could not be loaded as a scene
|
||||||
|
|
||||||
|
|
||||||
|
class ResolveBody(BaseModel):
    """Request body for resolving a merge candidate."""

    action: Literal["merge", "reject"]
    keep: Literal["left", "right"] = "left"  # which side to keep when merging
    resolved_by: str | None = None  # forwarded as-is to resolve_candidate
|
||||||
|
|
||||||
|
|
||||||
|
class ResolveResult(BaseModel):
    """Outcome of resolving a merge candidate."""

    id: uuid.UUID
    status: str  # MergeStatus value after resolution
    keep_id: uuid.UUID | None = None  # filled in only for the "merge" action
    drop_id: uuid.UUID | None = None  # filled in only for the "merge" action
|
||||||
|
|
||||||
|
|
||||||
|
# ---- endpoints ------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.get("/merge-candidates", response_model=MergeCandidateListOut)
def list_candidates(
    session: Annotated[Session, Depends(get_session)],
    status: Annotated[str, Query(pattern="^(pending|auto_merged|merged|rejected|all)$")] = "pending",
    kind: Annotated[str, Query(pattern="^(scene|performer|studio|all)$")] = "scene",
    page: Annotated[int, Query(ge=1)] = 1,
    per_page: Annotated[int, Query(ge=1, le=200)] = 50,
) -> MergeCandidateListOut:
    """Return one page of merge candidates, highest score first.

    Scene candidates are enriched with the titles of both scenes and, when
    available, a static and an animated thumbnail per side.

    Args:
        session: Database session.
        status: Status filter, or ``"all"`` for no status filter.
        kind: Entity-kind filter, or ``"all"`` for no kind filter.
        page: 1-based page number.
        per_page: Page size.

    Returns:
        The requested page together with the total number of matching rows.
    """
    base = select(MergeCandidate)
    if status != "all":
        base = base.where(MergeCandidate.status == MergeStatus(status))
    if kind != "all":
        base = base.where(MergeCandidate.kind == MergeKind(kind))

    total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()

    rows = (
        session.execute(
            # `id` is the final tie-breaker: score and created_at are not unique,
            # and OFFSET/LIMIT over a non-unique order can repeat or skip rows
            # between pages.
            base.order_by(
                MergeCandidate.score.desc(),
                MergeCandidate.created_at.desc(),
                MergeCandidate.id,
            )
            .offset((page - 1) * per_page)
            .limit(per_page)
        )
        .scalars()
        .all()
    )

    # Pre-fetch scene titles (scene candidates only) for a convenient preview.
    scene_ids = {
        scene_id
        for row in rows
        if row.kind == MergeKind.scene
        for scene_id in (row.left_id, row.right_id)
    }
    titles: dict[uuid.UUID, str] = {}
    if scene_ids:
        for sid, title in session.execute(
            select(Scene.id, Scene.title).where(Scene.id.in_(scene_ids))
        ):
            titles[sid] = title

    # Pre-fetch one static and one animated thumbnail per scene (the mobile queue
    # shows the static one in the list and the animated one on hold-to-preview).
    # The first non-empty URL encountered wins; playback_sources row order is not
    # guaranteed, which is good enough for triage.
    thumbs: dict[uuid.UUID, str] = {}
    animated_thumbs: dict[uuid.UUID, str] = {}
    if scene_ids:
        for sid, static_url, animated_url in session.execute(
            select(
                PlaybackSource.scene_id,
                PlaybackSource.thumbnail_url,
                PlaybackSource.animated_thumbnail_url,
            ).where(PlaybackSource.scene_id.in_(scene_ids))
        ):
            if static_url and sid not in thumbs:
                thumbs[sid] = static_url
            if animated_url and sid not in animated_thumbs:
                animated_thumbs[sid] = animated_url

    # Fallback for TPDB/StashDB-only scenes (no playback_source): take the poster
    # URL from external_records.raw. Without it most canonical TPDB<->StashDB
    # pairs in the queue would have no thumbnail.
    missing = [sid for sid in scene_ids if sid not in thumbs]
    if missing:
        ext_rows = session.execute(
            select(SceneExternalRef.scene_id, ExternalRecord.raw, Source.kind)
            .join(
                ExternalRecord,
                (ExternalRecord.source_id == SceneExternalRef.source_id)
                & (ExternalRecord.external_id == SceneExternalRef.external_id),
            )
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id.in_(missing))
            .where(ExternalRecord.entity_kind == "scene")
        ).all()
        # `source_kind`, not `kind`: the loop must not rebind the query parameter.
        for sid, raw, source_kind in ext_rows:
            if sid in thumbs or not isinstance(raw, dict):
                continue
            url = _raw_to_thumb(raw, source_kind)
            if url:
                thumbs[sid] = url

    items = [
        MergeCandidateSummary(
            id=r.id,
            kind=r.kind.value,
            left_id=r.left_id,
            right_id=r.right_id,
            score=r.score,
            status=r.status.value,
            left_title=titles.get(r.left_id),
            right_title=titles.get(r.right_id),
            left_thumbnail_url=thumbs.get(r.left_id),
            right_thumbnail_url=thumbs.get(r.right_id),
            left_animated_thumbnail_url=animated_thumbs.get(r.left_id),
            right_animated_thumbnail_url=animated_thumbs.get(r.right_id),
        )
        for r in rows
    ]
    return MergeCandidateListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/merge-candidates/{candidate_id}", response_model=MergeCandidateDetail)
def get_candidate(
    candidate_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> MergeCandidateDetail:
    """Return one merge candidate with both scenes expanded.

    Raises:
        HTTPException: 404 when no candidate has the given id.
    """
    candidate = session.get(MergeCandidate, candidate_id)
    if candidate is None:
        raise HTTPException(status_code=404, detail="merge candidate not found")

    left_out = None
    right_out = None
    if candidate.kind == MergeKind.scene:
        left_scene, right_scene = (
            session.get(Scene, candidate.left_id),
            session.get(Scene, candidate.right_id),
        )
        if left_scene is not None:
            left_out = _build_scene_out(session, left_scene)
        # The right side is left empty when it is the very same scene as the left.
        right_is_distinct = right_scene is not None and right_scene.id != candidate.left_id
        if right_is_distinct:
            right_out = _build_scene_out(session, right_scene)

    return MergeCandidateDetail(
        id=candidate.id,
        kind=candidate.kind.value,
        score=candidate.score,
        status=candidate.status.value,
        reasons=candidate.reasons or {},
        left=left_out,
        right=right_out,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/merge-candidates/{candidate_id}/resolve", response_model=ResolveResult)
def resolve(
    candidate_id: uuid.UUID,
    body: ResolveBody,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveResult:
    """Merge or reject a merge candidate.

    Raises:
        HTTPException: 400 carrying the message of the ``MergeError`` raised by
            ``resolve_candidate``.
    """
    keeping_left = body.keep == "left"
    try:
        cand = resolve_candidate(
            session,
            candidate_id=candidate_id,
            action=body.action,
            keep_left=keeping_left,
            resolved_by=body.resolved_by,
        )
    except MergeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # keep/drop ids are only reported for an actual merge.
    if body.action == "merge":
        kept, dropped = (
            (cand.left_id, cand.right_id) if keeping_left else (cand.right_id, cand.left_id)
        )
    else:
        kept = dropped = None

    return ResolveResult(id=cand.id, status=cand.status.value, keep_id=kept, drop_id=dropped)
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Bandwidth monitor -----------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class BandwidthCdnRow(BaseModel):
    """Byte counter for a single CDN within one reporting window."""

    cdn: str
    bytes: int
    pretty: str  # `bytes` rendered by _fmt_bytes
|
||||||
|
|
||||||
|
|
||||||
|
class BandwidthStats(BaseModel):
    """Per-CDN bytes-out of the VPS proxy (rolling buckets); an API restart resets them.

    Hetzner data is present only when HETZNER_API_TOKEN and HETZNER_SERVER_ID
    are set in the environment.
    """

    last_1h: list[BandwidthCdnRow]
    last_24h: list[BandwidthCdnRow]
    last_7d: list[BandwidthCdnRow]
    total_bytes_1h: int
    total_bytes_24h: int
    total_bytes_7d: int
    hetzner: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_bytes(b: int) -> str:
|
||||||
|
if b < 1024:
|
||||||
|
return f"{b} B"
|
||||||
|
val = float(b)
|
||||||
|
for u in ("KB", "MB", "GB", "TB"):
|
||||||
|
val /= 1024
|
||||||
|
if val < 1024:
|
||||||
|
return f"{val:.2f} {u}"
|
||||||
|
return f"{val:.2f} PB"
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/bandwidth", response_model=BandwidthStats)
def bandwidth_stats() -> BandwidthStats:
    """Per-CDN VPS proxy bytes-out plus Hetzner traffic stats.

    Critical for a public release: it shows where the VPS bandwidth goes, so
    Mixdrop and other bandwidth-heavy CDNs can be spotted before Hetzner
    charges for overage.

    Returns:
        Per-CDN rows and totals for the 1 h, 24 h and 7 d windows. ``hetzner``
        is filled only when the Hetzner settings are configured and the cache
        file holds a readable JSON object.
    """
    import json
    import logging
    from pathlib import Path

    from app.api.stream_proxy import get_bandwidth_stats
    from app.config import get_settings

    def _rows(stats: dict[str, int]) -> list[BandwidthCdnRow]:
        return [
            BandwidthCdnRow(cdn=cdn, bytes=b, pretty=_fmt_bytes(b))
            for cdn, b in stats.items()
        ]

    s_1h = get_bandwidth_stats(1)
    s_24h = get_bandwidth_stats(24)
    s_7d = get_bandwidth_stats(168)

    # Hetzner stats are loaded from a cache file (written by the
    # check_hetzner_traffic.py cron).
    hetzner_data = None
    settings = get_settings()
    if settings.hetzner_api_token and settings.hetzner_server_id:
        cache_path = Path("/tmp/hetzner_traffic.json")
        loaded = None
        try:
            loaded = json.loads(cache_path.read_text())
        except FileNotFoundError:
            # No cache written yet: not an error.
            pass
        except (OSError, ValueError) as exc:
            # Still best-effort, but an unreadable or corrupt cache is logged
            # instead of silently ignored.
            logging.getLogger(__name__).warning(
                "bandwidth: cannot read %s: %s", cache_path, exc
            )
        # The response field is `dict | None`; any other JSON value would fail
        # response validation, so it is dropped.
        if isinstance(loaded, dict):
            hetzner_data = loaded

    return BandwidthStats(
        last_1h=_rows(s_1h),
        last_24h=_rows(s_24h),
        last_7d=_rows(s_7d),
        total_bytes_1h=sum(s_1h.values()),
        total_bytes_24h=sum(s_24h.values()),
        total_bytes_7d=sum(s_7d.values()),
        hetzner=hetzner_data,
    )
|
||||||
206
app/api/admin_html.py
Normal file
206
app/api/admin_html.py
Normal file
|
|
@ -0,0 +1,206 @@
|
||||||
|
"""htmx + Jinja2 admin UI dla MergeCandidate triage.
|
||||||
|
|
||||||
|
Endpointy:
|
||||||
|
GET /ui/ — lista pending (filter status)
|
||||||
|
GET /ui/candidate/{id} — side-by-side scen
|
||||||
|
POST /ui/candidate/{id}/resolve — htmx form submit (action=merge_keep_left|merge_keep_right|reject)
|
||||||
|
zwraca fragment HTML z potwierdzeniem
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.templating import Jinja2Templates
|
||||||
|
from sqlalchemy import func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.scenes import _build_scene_out
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.merge_candidate import MergeCandidate, MergeKind, MergeStatus
|
||||||
|
from app.models.scene import Scene
|
||||||
|
from app.resolve.scene_merge import MergeError, resolve_candidate
|
||||||
|
|
||||||
|
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
|
||||||
|
_STATIC_DIR = Path(__file__).resolve().parent.parent / "static"
|
||||||
|
|
||||||
|
templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))
|
||||||
|
|
||||||
|
|
||||||
|
def _score_class(score: float) -> str:
|
||||||
|
if score >= 0.92:
|
||||||
|
return "high"
|
||||||
|
if score >= 0.75:
|
||||||
|
return "mid"
|
||||||
|
return "low"
|
||||||
|
|
||||||
|
|
||||||
|
templates.env.globals["score_class"] = _score_class
|
||||||
|
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/ui",
|
||||||
|
tags=["ui"],
|
||||||
|
dependencies=[Depends(require_api_key)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/", response_class=HTMLResponse)
def list_view(
    request: Request,
    session: Annotated[Session, Depends(get_session)],
    status: Annotated[str, Query(pattern="^(pending|auto_merged|merged|rejected|all)$")] = "pending",
    page: Annotated[int, Query(ge=1)] = 1,
) -> HTMLResponse:
    """Render one page (50 rows) of scene merge candidates, highest score first.

    Args:
        request: Incoming request, handed to the template engine.
        session: Database session.
        status: Status filter, or ``"all"`` for no status filter.
        page: 1-based page number.

    Returns:
        The rendered ``candidates_list.html`` template.
    """
    per_page = 50

    base = select(MergeCandidate).where(MergeCandidate.kind == MergeKind.scene)
    if status != "all":
        base = base.where(MergeCandidate.status == MergeStatus(status))

    total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()
    rows = (
        session.execute(
            # `id` is the final tie-breaker: score and created_at are not unique,
            # and OFFSET/LIMIT over a non-unique order can repeat or skip rows
            # between pages.
            base.order_by(
                MergeCandidate.score.desc(),
                MergeCandidate.created_at.desc(),
                MergeCandidate.id,
            )
            .offset((page - 1) * per_page)
            .limit(per_page)
        )
        .scalars()
        .all()
    )

    # Titles of every scene on this page, fetched in a single query.
    titles: dict[uuid.UUID, str] = {}
    scene_ids = {r.left_id for r in rows} | {r.right_id for r in rows}
    if scene_ids:
        for sid, title in session.execute(
            select(Scene.id, Scene.title).where(Scene.id.in_(scene_ids))
        ):
            titles[sid] = title

    items = [
        {
            "id": r.id,
            "kind": r.kind.value,
            "left_id": r.left_id,
            "right_id": r.right_id,
            "score": r.score,
            "status": r.status.value,
            "left_title": titles.get(r.left_id),
            "right_title": titles.get(r.right_id),
        }
        for r in rows
    ]

    label_map = {
        "pending": "Pending",
        "auto_merged": "Auto-merged",
        "merged": "Merged",
        "rejected": "Rejected",
        "all": "All",
    }

    return templates.TemplateResponse(
        request,
        "candidates_list.html",
        {
            "items": items,
            "total": total,
            "page": page,
            "per_page": per_page,
            "status": status,
            "status_label": label_map[status],
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/candidate/{candidate_id}", response_class=HTMLResponse)
def detail_view(
    candidate_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the side-by-side comparison page for one merge candidate.

    Raises:
        HTTPException: 404 when no candidate has the given id.
    """
    candidate = session.get(MergeCandidate, candidate_id)
    if candidate is None:
        raise HTTPException(status_code=404, detail="merge candidate not found")

    left_out = None
    right_out = None
    if candidate.kind == MergeKind.scene:
        left_scene, right_scene = (
            session.get(Scene, candidate.left_id),
            session.get(Scene, candidate.right_id),
        )
        if left_scene is not None:
            left_out = _build_scene_out(session, left_scene)
        # The right side is left empty when it is the very same scene as the left.
        right_is_distinct = right_scene is not None and right_scene.id != candidate.left_id
        if right_is_distinct:
            right_out = _build_scene_out(session, right_scene)

    cand_context = {
        "id": candidate.id,
        "kind": candidate.kind.value,
        "score": candidate.score,
        "status": candidate.status.value,
        "reasons": candidate.reasons or {},
        "left": left_out,
        "right": right_out,
        "left_id": candidate.left_id,
        "right_id": candidate.right_id,
    }
    return templates.TemplateResponse(
        request,
        "candidate_detail.html",
        {"cand": cand_context},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/candidate/{candidate_id}/resolve", response_class=HTMLResponse)
def resolve_form(
    candidate_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
    action: Annotated[str, Form()],
) -> HTMLResponse:
    """Handle the htmx resolve form and answer with an HTML fragment.

    Args:
        candidate_id: Merge candidate to resolve.
        session: Database session.
        action: One of ``merge_keep_left``, ``merge_keep_right`` or ``reject``.

    Returns:
        A confirmation fragment, or an error fragment with status 400 when
        ``resolve_candidate`` raises ``MergeError``.

    Raises:
        HTTPException: 400 when ``action`` is not one of the accepted values.
    """
    from html import escape

    if action not in {"merge_keep_left", "merge_keep_right", "reject"}:
        raise HTTPException(status_code=400, detail=f"invalid action: {action}")

    api_action = "reject" if action == "reject" else "merge"
    keep_left = action != "merge_keep_right"

    try:
        resolve_candidate(
            session,
            candidate_id=candidate_id,
            action=api_action,
            keep_left=keep_left,
            resolved_by="ui",
        )
    except MergeError as exc:
        # The message is embedded in markup, so it is escaped before interpolation.
        return HTMLResponse(
            f'<div class="card" id="actions" style="border-color: var(--bad);">'
            f"<strong>error:</strong> {escape(str(exc))}</div>",
            status_code=400,
        )

    label = {
        "merge_keep_left": "Merged into LEFT",
        "merge_keep_right": "Merged into RIGHT",
        "reject": "Rejected (kept both)",
    }[action]

    return HTMLResponse(
        f'<div class="card" id="actions" style="border-color: var(--good);">'
        f"<strong>{label}.</strong> "
        f'<a href="/ui/">← back to list</a></div>'
    )
|
||||||
|
|
||||||
|
|
||||||
|
def mount_static(app) -> None:  # pragma: no cover - dev convenience
    """Register the APK MIME type and mount the static directory at ``/static``.

    The mount is skipped when the static directory does not exist.
    """
    import mimetypes

    # Without the APK MIME type the Android browser does not treat the file as
    # an installable APK (text/plain gives a "file downloaded" notice instead of
    # the install prompt). Registration is idempotent within the process, so it
    # is safe to call on every startup.
    mimetypes.add_type("application/vnd.android.package-archive", ".apk")

    if not _STATIC_DIR.exists():
        return
    static_files = StaticFiles(directory=str(_STATIC_DIR))
    app.mount("/static", static_files, name="static")
|
||||||
116
app/api/blacklist.py
Normal file
116
app/api/blacklist.py
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
"""Blacklists — globalnie ukryte performerki/studia/tagi.
|
||||||
|
|
||||||
|
Sceny które MAJĄ blacklisted entity wypadają z każdego /scenes (pełna lista, search,
|
||||||
|
performer scenes, tag scenes). Auto-apply w `app/api/scenes.py`.
|
||||||
|
|
||||||
|
Endpointy:
|
||||||
|
GET /blacklist — wszystkie 3 listy w jednym response
|
||||||
|
POST /blacklist/{kind}/{entity_id} — dodaj (idempotent)
|
||||||
|
DELETE /blacklist/{kind}/{entity_id} — usuń
|
||||||
|
|
||||||
|
`kind` ∈ {performer, studio, tag}.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated, Literal
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.blacklist import (
|
||||||
|
BlacklistedPerformer,
|
||||||
|
BlacklistedStudio,
|
||||||
|
BlacklistedTag,
|
||||||
|
)
|
||||||
|
from app.models.performer import Performer
|
||||||
|
from app.models.studio import Studio
|
||||||
|
from app.models.tag import Tag
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/blacklist", tags=["blacklist"], dependencies=[Depends(require_api_key)]
|
||||||
|
)
|
||||||
|
|
||||||
|
Kind = Literal["performer", "studio", "tag"]
|
||||||
|
|
||||||
|
|
||||||
|
class BlacklistEntry(BaseModel):
    """One blacklisted performer, studio or tag."""

    id: uuid.UUID  # id of the blacklisted entity itself
    name: str  # canonical_name (performer) / name (studio/tag)
    slug: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class BlacklistOut(BaseModel):
    """All three blacklists in a single response."""

    performers: list[BlacklistEntry]
    studios: list[BlacklistEntry]
    tags: list[BlacklistEntry]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=BlacklistOut)
def list_blacklist(
    session: Annotated[Session, Depends(get_session)],
) -> BlacklistOut:
    """Return the blacklisted performers, studios and tags, each sorted by name."""

    def _entries(parent, fk_column, name_column) -> list[BlacklistEntry]:
        # One blacklist table joined to its parent entity, ordered by display name.
        stmt = (
            select(fk_column, name_column, parent.slug)
            .join(parent, parent.id == fk_column)
            .order_by(name_column)
        )
        return [
            BlacklistEntry(id=entity_id, name=name, slug=slug)
            for entity_id, name, slug in session.execute(stmt).all()
        ]

    performer_entries = _entries(
        Performer, BlacklistedPerformer.performer_id, Performer.canonical_name
    )
    studio_entries = _entries(Studio, BlacklistedStudio.studio_id, Studio.name)
    tag_entries = _entries(Tag, BlacklistedTag.tag_id, Tag.name)
    return BlacklistOut(
        performers=performer_entries,
        studios=studio_entries,
        tags=tag_entries,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _kind_to_entity(kind: Kind):
    """Resolve a blacklist kind to ``(blacklist model, parent model, fk attribute name)``.

    Raises:
        HTTPException: 400 when ``kind`` is not performer, studio or tag.
    """
    mapping = {
        "performer": (BlacklistedPerformer, Performer, "performer_id"),
        "studio": (BlacklistedStudio, Studio, "studio_id"),
        "tag": (BlacklistedTag, Tag, "tag_id"),
    }
    entry = mapping.get(kind)
    if entry is None:
        raise HTTPException(status_code=400, detail="kind must be performer|studio|tag")
    return entry
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{kind}/{entity_id}", status_code=status.HTTP_200_OK)
def add_blacklist(
    kind: Kind,
    entity_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict:
    """Blacklist an entity; calling it again for the same entity is a no-op.

    Returns:
        ``kind``, ``id`` and a ``created`` flag that is ``False`` when the
        entity was already blacklisted.

    Raises:
        HTTPException: 404 when the entity itself does not exist.
    """
    bl_model, parent_model, fk = _kind_to_entity(kind)
    if session.get(parent_model, entity_id) is None:
        raise HTTPException(status_code=404, detail=f"{kind} not found")

    created = session.get(bl_model, entity_id) is None
    if created:
        session.add(bl_model(**{fk: entity_id}))
        session.commit()
    return {"kind": kind, "id": str(entity_id), "created": created}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{kind}/{entity_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_blacklist(
    kind: Kind,
    entity_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove an entity from the blacklist; a missing entry is not an error."""
    bl_model, _parent_model, _fk = _kind_to_entity(kind)
    entry = session.get(bl_model, entity_id)
    if entry is not None:
        session.delete(entry)
        session.commit()
|
||||||
155
app/api/bug_reports.py
Normal file
155
app/api/bug_reports.py
Normal file
|
|
@ -0,0 +1,155 @@
|
||||||
|
"""Bug reports — mobile FAB → POST /bug-reports → admin lista przez admin_html.
|
||||||
|
|
||||||
|
POST nie wymaga obecnego scene_id (user może raportować z FavoritesScreen,
|
||||||
|
SearchScreen itp.). Screenshot opcjonalny — niektóre ekrany nie warto kapturować.
|
||||||
|
|
||||||
|
Limit body 1.5MB (FastAPI default jest hojny, ale dla rozsądku ograniczamy).
|
||||||
|
Screenshot to PNG/JPEG z react-native-view-shot, base64 — typowe rozmiary:
|
||||||
|
- mały ekran scene-list: ~200-400KB
|
||||||
|
- duży scene-detail z thumbnail: ~600KB-1MB
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from sqlalchemy import desc, func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.bug_report import BugReport
|
||||||
|
from app.models.movie import Movie
|
||||||
|
from app.models.scene import Scene
|
||||||
|
|
||||||
|
router = APIRouter(tags=["bug-reports"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_SCREENSHOT_BYTES = 1_500_000 # raw base64 chars; ~1.1MB binary po dekodowaniu
|
||||||
|
|
||||||
|
|
||||||
|
class BugReportCreate(BaseModel):
    """Payload for filing a bug report."""

    message: str = Field(min_length=1, max_length=5000)
    screen_name: str | None = Field(default=None, max_length=64)
    app_version: str | None = Field(default=None, max_length=32)
    # Looked up as a scene id first and as a movie id second by create_bug_report.
    scene_id: uuid.UUID | None = None
    # Base64 text; the limit counts characters of the encoded string.
    screenshot_b64: str | None = Field(default=None, max_length=_MAX_SCREENSHOT_BYTES)
|
||||||
|
|
||||||
|
|
||||||
|
class BugReportOut(BaseModel):
    """A bug report as listed for admins; the screenshot itself is not included."""

    id: uuid.UUID
    created_at: datetime
    screen_name: str | None
    app_version: str | None
    scene_id: uuid.UUID | None
    movie_id: uuid.UUID | None
    message: str
    has_screenshot: bool  # True when the stored screenshot_b64 is non-empty
    resolved: bool
|
||||||
|
|
||||||
|
|
||||||
|
class BugReportListOut(BaseModel):
    """A slice of bug reports plus the total number of reports matching the filter."""

    items: list[BugReportOut]
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/bug-reports", status_code=status.HTTP_201_CREATED)
def create_bug_report(
    payload: BugReportCreate,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str]:
    """Store a bug report and return its id.

    Args:
        payload: Report content sent by the client.
        session: Database session.

    Returns:
        ``{"id": <new report id>}``.

    Raises:
        HTTPException: 422 when the message is empty after trimming whitespace.
    """
    # `min_length=1` is checked before trimming, so a whitespace-only message
    # would otherwise be stored as an empty string.
    message = payload.message.strip()
    if not message:
        raise HTTPException(status_code=422, detail="message must not be blank")

    # Smart-route the entity id: the mobile Player sends `sceneId` for both scenes
    # and movies (legacy progress-tracking hack). Without this the INSERT hit an
    # FK violation and crashed with a 500 (reported 2026-05-10), so both tables
    # are checked.
    scene_id: uuid.UUID | None = None
    movie_id: uuid.UUID | None = None
    if payload.scene_id is not None:
        if session.get(Scene, payload.scene_id) is not None:
            scene_id = payload.scene_id
        elif session.get(Movie, payload.scene_id) is not None:
            movie_id = payload.scene_id
        # else: the id no longer exists anywhere (deleted), so both stay None.

    br = BugReport(
        id=uuid.uuid4(),
        message=message,
        screen_name=payload.screen_name,
        app_version=payload.app_version,
        scene_id=scene_id,
        movie_id=movie_id,
        screenshot_b64=payload.screenshot_b64,
    )
    session.add(br)
    session.commit()
    return {"id": str(br.id)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/bug-reports", response_model=BugReportListOut)
def list_bug_reports(
    session: Annotated[Session, Depends(get_session)],
    limit: int = 50,
    offset: int = 0,
    include_resolved: bool = False,
) -> BugReportListOut:
    """List bug reports, newest first.

    Args:
        session: Database session.
        limit: Maximum number of reports to return; clamped to 0..200.
        offset: Number of reports to skip; negative values are treated as 0.
        include_resolved: Also return reports already marked resolved.

    Returns:
        The requested slice and the total number of reports matching the filter.
    """
    # Negative values would otherwise be passed straight to the database, and an
    # unbounded limit loads every screenshot blob into memory at once.
    limit = max(0, min(limit, 200))
    offset = max(0, offset)

    q = select(BugReport).order_by(desc(BugReport.created_at))
    cnt_q = select(func.count(BugReport.id))
    if not include_resolved:
        q = q.where(BugReport.resolved.is_(False))
        cnt_q = cnt_q.where(BugReport.resolved.is_(False))
    rows = session.scalars(q.limit(limit).offset(offset)).all()
    total = session.scalar(cnt_q) or 0
    items = [
        BugReportOut(
            id=r.id,
            created_at=r.created_at,
            screen_name=r.screen_name,
            app_version=r.app_version,
            scene_id=r.scene_id,
            movie_id=r.movie_id,
            message=r.message,
            has_screenshot=bool(r.screenshot_b64),
            resolved=r.resolved,
        )
        for r in rows
    ]
    return BugReportListOut(items=items, total=total)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/bug-reports/{bug_id}/screenshot")
def get_bug_report_screenshot(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str | None]:
    """Return the base64-encoded screenshot of a report (``None`` when it has none).

    The admin UI renders it.

    Raises:
        HTTPException: 404 when the report does not exist.
    """
    report = session.get(BugReport, bug_id)
    if report is not None:
        return {"screenshot_b64": report.screenshot_b64}
    raise HTTPException(status_code=404, detail="not found")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/bug-reports/{bug_id}/resolve")
def resolve_bug_report(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> dict[str, str]:
    """Mark a bug report as resolved.

    Raises:
        HTTPException: 404 when the report does not exist.
    """
    report = session.get(BugReport, bug_id)
    if report is None:
        raise HTTPException(status_code=404, detail="not found")

    report.resolved = True
    session.commit()
    outcome = {"status": "resolved"}
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/bug-reports/{bug_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_bug_report(
    bug_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Delete a bug report.

    Raises:
        HTTPException: 404 when the report does not exist.
    """
    report = session.get(BugReport, bug_id)
    if report is None:
        raise HTTPException(status_code=404, detail="not found")

    session.delete(report)
    session.commit()
|
||||||
104
app/api/expo_updates.py
Normal file
104
app/api/expo_updates.py
Normal file
|
|
@ -0,0 +1,104 @@
|
||||||
|
"""Expo Updates serving endpoints (OTA JS bundle distribution).
|
||||||
|
|
||||||
|
Mobile sprawdza `/expo-updates/manifest` przy każdym launch (lub on-foreground).
|
||||||
|
Serwer zwraca aktualny manifest dla danego `expo-runtime-version`. Mobile pobiera
|
||||||
|
launchAsset (bundle) + assets, zapisuje, restartuje aplikację z nowym bundle.
|
||||||
|
|
||||||
|
Każdy update wgrany przez `scripts/publish_update.py` ląduje w
|
||||||
|
`app/static/expo-updates/<runtime>/<update_id>/`. Plik
|
||||||
|
`app/static/expo-updates/<runtime>/current.json` wskazuje aktywny update_id.
|
||||||
|
|
||||||
|
Endpointy SĄ PUBLICZNE (no auth) — Expo Updates SDK nie wstrzykuje X-API-Key.
|
||||||
|
Bezpieczeństwo opiera się na TLS pinningu (mobile ufa tylko naszej self-signed
|
||||||
|
cert SPKI z network_security_config) — ktoś bez tego pinu nie podstawi MITM
|
||||||
|
manifestu. Jeśli kiedyś trzeba twardo: dorobić expo-updates code signing key.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException, Query
|
||||||
|
from fastapi.responses import FileResponse, JSONResponse, Response
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter(tags=["expo-updates"])
|
||||||
|
|
||||||
|
_STATIC_DIR = Path(__file__).resolve().parent.parent / "static" / "expo-updates"
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/expo-updates/manifest")
def get_manifest(
    expo_runtime_version: str | None = Header(default=None, alias="expo-runtime-version"),
    expo_platform: str | None = Header(default=None, alias="expo-platform"),
) -> Response:
    """Return the active Expo Updates manifest for the requested runtime.

    The runtime is taken from the ``expo-runtime-version`` header (default
    ``"1.0"``). ``<runtime>/current.json`` names the active ``update_id`` and
    ``<runtime>/<update_id>/manifest.json`` is returned verbatim as JSON.

    Args:
        expo_runtime_version: Value of the ``expo-runtime-version`` header.
        expo_platform: Value of the ``expo-platform`` header. Accepted for
            protocol compatibility; it is not used to select the manifest.

    Returns:
        A JSON response carrying the manifest, or ``204 No Content`` when no
        usable update exists for the runtime (missing or unreadable
        ``current.json`` / ``manifest.json``, or no ``update_id``).

    Raises:
        HTTPException: 400 when the runtime version is not a single path
            component.
    """
    runtime = expo_runtime_version or "1.0"
    # This endpoint is unauthenticated and the header is caller-controlled, so
    # the value must never be able to step outside the updates directory.
    if runtime in {".", ".."} or Path(runtime).name != runtime:
        log.warning("expo-updates: rejected runtime version %r", runtime)
        raise HTTPException(status_code=400, detail="invalid runtime version")

    runtime_dir = _STATIC_DIR / runtime

    # Read directly instead of exists()+read(): avoids a race with a publish
    # that swaps files between the two calls.
    try:
        current = json.loads((runtime_dir / "current.json").read_text(encoding="utf-8"))
    except FileNotFoundError:
        return Response(status_code=204)
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        log.warning("expo-updates: bad current.json for runtime=%s: %s", runtime, e)
        return Response(status_code=204)

    # A syntactically valid but wrongly shaped file (list, number, ...) must
    # not crash the request.
    update_id = current.get("update_id") if isinstance(current, dict) else None
    if not update_id or not isinstance(update_id, str):
        return Response(status_code=204)

    try:
        manifest = json.loads(
            (runtime_dir / update_id / "manifest.json").read_text(encoding="utf-8")
        )
    except FileNotFoundError:
        log.warning("expo-updates: current points to missing update %s", update_id)
        return Response(status_code=204)
    except (OSError, ValueError) as e:
        log.error("expo-updates: bad manifest.json for %s: %s", update_id, e)
        return Response(status_code=204)

    return JSONResponse(
        manifest,
        headers={
            "expo-protocol-version": "1",
            "expo-sfv-version": "0",
            "cache-control": "private, max-age=0",
            "content-type": "application/json; charset=utf-8",
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/expo-updates/asset")
def get_asset(
    asset: str = Query(..., description="Relative path do pliku w runtime dir"),
    runtimeVersion: str = Query("1.0"),
    platform: str = Query("android"),
) -> Response:
    """Serve a single asset (JS bundle, image, font) from an update directory.

    Args:
        asset: Path relative to ``static/expo-updates/<runtimeVersion>/``.
        runtimeVersion: Runtime directory name; must be a single path
            component.
        platform: Accepted for compatibility with the client's asset URLs; it
            does not affect which file is served.

    Returns:
        A ``FileResponse`` for the asset. ``.js`` and ``.bundle`` files are
        served as ``application/javascript``; other types are left to
        ``FileResponse`` to determine.

    Raises:
        HTTPException: 400 when ``runtimeVersion`` or ``asset`` would escape
            the runtime directory, 404 when the asset is not a regular file.
    """
    # Both query parameters are untrusted and the endpoint is public. Without
    # this check `runtimeVersion=../..` relocates the base directory itself and
    # makes the containment test below meaningless.
    if runtimeVersion in {"", ".", ".."} or Path(runtimeVersion).name != runtimeVersion:
        raise HTTPException(status_code=400, detail="invalid runtime version")

    try:
        runtime_dir = (_STATIC_DIR / runtimeVersion).resolve()
        target = (runtime_dir / asset).resolve()
    except (OSError, ValueError) as e:
        # e.g. embedded NUL byte or a symlink loop in the requested path.
        raise HTTPException(status_code=400, detail="invalid asset path") from e

    # Component-wise containment. A string prefix test would wrongly accept
    # sibling directories such as `<runtime>-evil/`.
    if not target.is_relative_to(runtime_dir):
        raise HTTPException(status_code=400, detail="invalid asset path")
    if not target.is_file():
        raise HTTPException(status_code=404, detail="asset not found")

    media_type = "application/javascript" if target.suffix in (".js", ".bundle") else None
    return FileResponse(target, media_type=media_type)
|
||||||
457
app/api/favorites.py
Normal file
457
app/api/favorites.py
Normal file
|
|
@ -0,0 +1,457 @@
|
||||||
|
"""Favorites — ulubione performerki + studia + liczenie nowych scen.
|
||||||
|
|
||||||
|
Single-user (brak users), więc API zwraca/operuje na global zbiorze. Multi-user
|
||||||
|
można dodać dorzuceniem `user_id` query/header bez breaking change.
|
||||||
|
|
||||||
|
Endpointy (performers — `/favorites/...` zostawione żeby nie łamać starego mobile):
|
||||||
|
GET /favorites — lista ulubionych performerek
|
||||||
|
POST /favorites/{performer_id} — dodaj (idempotent)
|
||||||
|
DELETE /favorites/{performer_id} — usuń
|
||||||
|
POST /favorites/{performer_id}/seen — mark-as-seen (zeruje badge)
|
||||||
|
|
||||||
|
Endpointy (studios):
|
||||||
|
GET /favorites/studios — lista ulubionych studiów
|
||||||
|
POST /favorites/studios/{studio_id} — dodaj
|
||||||
|
DELETE /favorites/studios/{studio_id} — usuń
|
||||||
|
POST /favorites/studios/{studio_id}/seen — mark-as-seen
|
||||||
|
|
||||||
|
"Nowa scena" = scena której Scene.created_at > favorite.last_seen_at:
|
||||||
|
- dla performerki: ScenePerformer.performer_id = X
|
||||||
|
- dla studio: Scene.studio_id = X
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy import func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.favorite_movie import FavoriteMovie
|
||||||
|
from app.models.favorite_performer import FavoritePerformer
|
||||||
|
from app.models.favorite_studio import FavoriteStudio
|
||||||
|
from app.models.movie import Movie
|
||||||
|
from app.models.performer import Performer
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
|
from app.models.scene import Scene, ScenePerformer
|
||||||
|
from app.models.studio import Studio
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/favorites", tags=["favorites"], dependencies=[Depends(require_api_key)]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteOut(BaseModel):
    # One favorite performer as returned by GET /favorites.
    performer_id: uuid.UUID
    canonical_name: str
    slug: str | None
    scene_count: int  # scenes of this performer that have a live playback source
    new_count: int  # scenes created after last_seen_at
    last_seen_at: datetime
    created_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteListOut(BaseModel):
    # Response envelope for the favorite performers list.
    items: list[FavoriteOut]
    total: int
    new_total: int  # sum of new_count over all items, used for the toolbar badge
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=FavoriteListOut)
def list_favorites(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteListOut:
    """List favorite performers with total and new scene counts.

    Both counts only include scenes that have at least one playback source
    with ``dead_at IS NULL``, so the numbers match what the client can
    actually play. A scene is "new" when ``Scene.created_at`` is later than
    the favorite's ``last_seen_at``.

    Returns:
        Favorites ordered by performer name, their count, and the sum of all
        per-performer new counts.
    """
    from sqlalchemy import and_, exists

    rows = session.execute(
        select(FavoritePerformer, Performer)
        .join(Performer, Performer.id == FavoritePerformer.performer_id)
        .order_by(Performer.canonical_name)
    ).all()
    if not rows:
        return FavoriteListOut(items=[], total=0, new_total=0)

    perf_ids = [perf.id for _, perf in rows]

    # Correlated against ScenePerformer, which is in the FROM list of both
    # count queries below.
    live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == ScenePerformer.scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    )

    scene_counts: dict = dict(
        session.execute(
            select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
            .where(ScenePerformer.performer_id.in_(perf_ids))
            .where(live_playback)
            .group_by(ScenePerformer.performer_id)
        ).all()
    )

    # Each favorite has its own last_seen_at, so join the favorites table and
    # compare per row in SQL. This replaces fetching every scene row and
    # counting in Python, and a NULL timestamp on either side simply fails the
    # comparison instead of raising.
    new_counts: dict = dict(
        session.execute(
            select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
            .join(Scene, Scene.id == ScenePerformer.scene_id)
            .join(
                FavoritePerformer,
                FavoritePerformer.performer_id == ScenePerformer.performer_id,
            )
            .where(ScenePerformer.performer_id.in_(perf_ids))
            .where(Scene.created_at > FavoritePerformer.last_seen_at)
            .where(live_playback)
            .group_by(ScenePerformer.performer_id)
        ).all()
    )

    items = [
        FavoriteOut(
            performer_id=perf.id,
            canonical_name=perf.canonical_name,
            slug=perf.slug,
            scene_count=scene_counts.get(perf.id, 0),
            new_count=new_counts.get(perf.id, 0),
            last_seen_at=fav.last_seen_at,
            created_at=fav.created_at,
        )
        for fav, perf in rows
    ]
    new_total = sum(item.new_count for item in items)
    return FavoriteListOut(items=items, total=len(items), new_total=new_total)
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteAddOut(BaseModel):
    # Result of POST /favorites/{performer_id}.
    performer_id: uuid.UUID
    created: bool  # False when the performer was already a favorite
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/{performer_id}",
    response_model=FavoriteAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteAddOut:
    """Add a performer to favorites; a repeated call is a no-op.

    Raises:
        HTTPException: 404 when the performer does not exist.
    """
    if session.get(Performer, performer_id) is None:
        raise HTTPException(status_code=404, detail="performer not found")

    newly_added = session.get(FavoritePerformer, performer_id) is None
    if newly_added:
        session.add(FavoritePerformer(performer_id=performer_id))
        session.commit()
    return FavoriteAddOut(performer_id=performer_id, created=newly_added)
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{performer_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a performer from favorites.

    Idempotent: removing a performer that is not a favorite succeeds silently.
    """
    favorite = session.get(FavoritePerformer, performer_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
class SeenOut(BaseModel):
    # Result of POST /favorites/{performer_id}/seen.
    performer_id: uuid.UUID
    last_seen_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{performer_id}/seen", response_model=SeenOut)
def mark_seen(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenOut:
    """Set the favorite's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the performer is not a favorite.
    """
    favorite = session.get(FavoritePerformer, performer_id)
    if favorite is None:
        raise HTTPException(status_code=404, detail="not in favorites")

    favorite.last_seen_at = datetime.now(UTC)
    session.commit()
    # Read the value back from the entity after the commit, as persisted.
    persisted_seen_at = favorite.last_seen_at
    return SeenOut(performer_id=performer_id, last_seen_at=persisted_seen_at)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- Studios ----------
|
||||||
|
|
||||||
|
class FavoriteStudioOut(BaseModel):
    # One favorite studio as returned by GET /favorites/studios.
    studio_id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
    scene_count: int  # scenes of this studio that have a live playback source
    new_count: int  # scenes created after last_seen_at
    last_seen_at: datetime
    created_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteStudioListOut(BaseModel):
    # Response envelope for the favorite studios list.
    items: list[FavoriteStudioOut]
    total: int
    new_total: int  # sum of new_count over all items
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/studios", response_model=FavoriteStudioListOut)
def list_favorite_studios(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteStudioListOut:
    """List favorite studios with total and new scene counts.

    Both counts only include scenes that have at least one playback source
    with ``dead_at IS NULL``. A scene is "new" when ``Scene.created_at`` is
    later than the favorite's ``last_seen_at``.

    Returns:
        Favorites ordered by studio name, their count, and the sum of all
        per-studio new counts.
    """
    from sqlalchemy import and_, exists

    rows = session.execute(
        select(FavoriteStudio, Studio)
        .join(Studio, Studio.id == FavoriteStudio.studio_id)
        .order_by(Studio.name)
    ).all()
    if not rows:
        return FavoriteStudioListOut(items=[], total=0, new_total=0)

    studio_ids = [st.id for _, st in rows]

    live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == Scene.id,
            PlaybackSource.dead_at.is_(None),
        )
    )

    scene_counts: dict = dict(
        session.execute(
            select(Scene.studio_id, func.count(Scene.id))
            .where(Scene.studio_id.in_(studio_ids))
            .where(live_playback)
            .group_by(Scene.studio_id)
        ).all()
    )

    # Each favorite has its own last_seen_at, so join the favorites table and
    # compare per row in SQL. This replaces fetching every scene row and
    # counting in Python, and a NULL timestamp on either side simply fails the
    # comparison instead of raising.
    new_counts: dict = dict(
        session.execute(
            select(Scene.studio_id, func.count(Scene.id))
            .join(FavoriteStudio, FavoriteStudio.studio_id == Scene.studio_id)
            .where(Scene.studio_id.in_(studio_ids))
            .where(Scene.created_at > FavoriteStudio.last_seen_at)
            .where(live_playback)
            .group_by(Scene.studio_id)
        ).all()
    )

    items = [
        FavoriteStudioOut(
            studio_id=st.id,
            name=st.name,
            slug=st.slug,
            network=st.network,
            scene_count=scene_counts.get(st.id, 0),
            new_count=new_counts.get(st.id, 0),
            last_seen_at=fav.last_seen_at,
            created_at=fav.created_at,
        )
        for fav, st in rows
    ]
    new_total = sum(item.new_count for item in items)
    return FavoriteStudioListOut(items=items, total=len(items), new_total=new_total)
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteStudioAddOut(BaseModel):
    # Result of POST /favorites/studios/{studio_id}.
    studio_id: uuid.UUID
    created: bool  # False when the studio was already a favorite
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/studios/{studio_id}",
    response_model=FavoriteStudioAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite_studio(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteStudioAddOut:
    """Add a studio to favorites; a repeated call is a no-op.

    Raises:
        HTTPException: 404 when the studio does not exist.
    """
    if session.get(Studio, studio_id) is None:
        raise HTTPException(status_code=404, detail="studio not found")

    newly_added = session.get(FavoriteStudio, studio_id) is None
    if newly_added:
        session.add(FavoriteStudio(studio_id=studio_id))
        session.commit()
    return FavoriteStudioAddOut(studio_id=studio_id, created=newly_added)
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/studios/{studio_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite_studio(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a studio from favorites; succeeds silently if it is not one."""
    favorite = session.get(FavoriteStudio, studio_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
class SeenStudioOut(BaseModel):
    # Result of POST /favorites/studios/{studio_id}/seen.
    studio_id: uuid.UUID
    last_seen_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/studios/{studio_id}/seen", response_model=SeenStudioOut)
def mark_studio_seen(
    studio_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenStudioOut:
    """Set the favorite studio's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the studio is not a favorite.
    """
    favorite = session.get(FavoriteStudio, studio_id)
    if favorite is None:
        raise HTTPException(status_code=404, detail="not in favorites")

    favorite.last_seen_at = datetime.now(UTC)
    session.commit()
    # Read the value back from the entity after the commit, as persisted.
    persisted_seen_at = favorite.last_seen_at
    return SeenStudioOut(studio_id=studio_id, last_seen_at=persisted_seen_at)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Favorite movies ────────────────────────────────────────────────────────
|
||||||
|
# Movies nie mają child scenes per-favorite (jak performerki/studia), więc
|
||||||
|
# `last_seen_at` nie jest tu używany do NEW count — tylko jako tracking ostatniego
|
||||||
|
# wglądu przez usera. Mobile używa NEW badge w liście /movies przez OSOBNY
|
||||||
|
# globalny last_seen z AsyncStorage (client-side, brak backendowego state).
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteMovieOut(BaseModel):
    # One favorite movie as returned by GET /favorites/movies.
    movie_id: uuid.UUID
    title: str
    slug: str | None
    poster_url: str | None
    release_year: int | None
    studio_name: str | None  # None when the movie has no matching studio row
    last_seen_at: datetime
    created_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteMovieListOut(BaseModel):
    # Response envelope for the favorite movies list.
    items: list[FavoriteMovieOut]
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/movies", response_model=FavoriteMovieListOut)
def list_favorite_movies(
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteMovieListOut:
    """List favorite movies ordered by title, with the studio name if any."""
    stmt = (
        select(FavoriteMovie, Movie, Studio)
        .join(Movie, Movie.id == FavoriteMovie.movie_id)
        # Outer join: a movie without a studio must still be listed.
        .outerjoin(Studio, Studio.id == Movie.studio_id)
        .order_by(Movie.title)
    )

    items: list[FavoriteMovieOut] = []
    for fav, movie, studio in session.execute(stmt).all():
        items.append(
            FavoriteMovieOut(
                movie_id=movie.id,
                title=movie.title,
                slug=movie.slug,
                poster_url=movie.poster_url,
                release_year=movie.release_year,
                studio_name=studio.name if studio else None,
                last_seen_at=fav.last_seen_at,
                created_at=fav.created_at,
            )
        )
    return FavoriteMovieListOut(items=items, total=len(items))
|
||||||
|
|
||||||
|
|
||||||
|
class FavoriteMovieAddOut(BaseModel):
    # Result of POST /favorites/movies/{movie_id}.
    movie_id: uuid.UUID
    created: bool  # False when the movie was already a favorite
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/movies/{movie_id}",
    response_model=FavoriteMovieAddOut,
    status_code=status.HTTP_200_OK,
)
def add_favorite_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> FavoriteMovieAddOut:
    """Add a movie to favorites; a repeated call is a no-op.

    Raises:
        HTTPException: 404 when the movie does not exist.
    """
    if session.get(Movie, movie_id) is None:
        raise HTTPException(status_code=404, detail="movie not found")

    newly_added = session.get(FavoriteMovie, movie_id) is None
    if newly_added:
        session.add(FavoriteMovie(movie_id=movie_id))
        session.commit()
    return FavoriteMovieAddOut(movie_id=movie_id, created=newly_added)
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/movies/{movie_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_favorite_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a movie from favorites; succeeds silently if it is not one."""
    favorite = session.get(FavoriteMovie, movie_id)
    if favorite is not None:
        session.delete(favorite)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
class SeenMovieOut(BaseModel):
    # Result of POST /favorites/movies/{movie_id}/seen.
    movie_id: uuid.UUID
    last_seen_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/movies/{movie_id}/seen", response_model=SeenMovieOut)
def mark_movie_seen(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SeenMovieOut:
    """Set the favorite movie's ``last_seen_at`` to the current UTC time.

    Raises:
        HTTPException: 404 when the movie is not a favorite.
    """
    favorite = session.get(FavoriteMovie, movie_id)
    if favorite is None:
        raise HTTPException(status_code=404, detail="not in favorites")

    favorite.last_seen_at = datetime.now(UTC)
    session.commit()
    # Read the value back from the entity after the commit, as persisted.
    persisted_seen_at = favorite.last_seen_at
    return SeenMovieOut(movie_id=movie_id, last_seen_at=persisted_seen_at)
|
||||||
275
app/api/movies.py
Normal file
275
app/api/movies.py
Normal file
|
|
@ -0,0 +1,275 @@
|
||||||
|
"""GET /movies — lista i szczegóły filmów."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||||
|
from sqlalchemy import exists, func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.schemas import (
|
||||||
|
ExternalRefOut,
|
||||||
|
MovieChapterOut,
|
||||||
|
MovieListOut,
|
||||||
|
MovieOut,
|
||||||
|
PerformerOut,
|
||||||
|
PlaybackSourceOut,
|
||||||
|
StudioOut,
|
||||||
|
TagOut,
|
||||||
|
)
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.movie import (
|
||||||
|
Movie,
|
||||||
|
MovieChapter,
|
||||||
|
MovieExternalRef,
|
||||||
|
MoviePerformer,
|
||||||
|
MovieTag,
|
||||||
|
)
|
||||||
|
from app.models.favorite_movie import FavoriteMovie
|
||||||
|
from app.models.movie_playback_source import MoviePlaybackSource
|
||||||
|
from app.models.performer import Performer
|
||||||
|
from app.models.source import Source
|
||||||
|
from app.models.studio import Studio
|
||||||
|
from app.models.tag import Tag
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/movies", tags=["movies"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
_VALID_SORTS = {"created_at", "release_year", "release_date", "title", "rating"}
|
||||||
|
|
||||||
|
|
||||||
|
def _split_csv(raw: str | None) -> list[str]:
|
||||||
|
if not raw:
|
||||||
|
return []
|
||||||
|
return [s.strip() for s in raw.split(",") if s.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=MovieListOut)
def list_movies(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Title search (trgm)"),
    studio_slugs: str | None = Query(default=None, description="Comma-separated studio slugs (OR)"),
    tags: str | None = Query(default=None, description="Comma-separated tag slugs (AND)"),
    performer_ids: str | None = Query(default=None, description="Comma-separated performer UUIDs (AND)"),
    year_from: int | None = Query(default=None, ge=1900, le=2100),
    year_to: int | None = Query(default=None, ge=1900, le=2100),
    has_playback: bool | None = Query(default=None),
    sort: str = Query(default="created_at"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> MovieListOut:
    """Return a filtered, sorted, paginated list of movies.

    Filters are combined with AND. ``studio_slugs`` matches any listed studio,
    while every listed tag and every listed performer must be present.
    ``has_playback=true`` keeps only movies with a playback source whose
    ``dead_at`` is null; any other value applies no playback filter.

    Raises:
        HTTPException: 400 for an unknown ``sort`` or a malformed performer
            UUID.
    """
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")

    base = select(Movie)

    if q:
        # Escape LIKE metacharacters so user input such as "100%" or "a_b" is
        # matched literally rather than as wildcards.
        needle = q.lower().replace("/", "//").replace("%", "/%").replace("_", "/_")
        base = base.where(Movie.title_normalized.ilike(f"%{needle}%", escape="/"))

    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug_list:
        base = base.where(
            Movie.studio_id.in_(select(Studio.id).where(Studio.slug.in_(studio_slug_list)))
        )

    # One EXISTS per tag gives AND semantics.
    for slug in _split_csv(tags):
        base = base.where(
            exists(
                select(1).select_from(MovieTag).join(Tag, Tag.id == MovieTag.tag_id)
                .where(MovieTag.movie_id == Movie.id, Tag.slug == slug)
            )
        )

    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # One EXISTS per performer gives AND semantics.
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1).select_from(MoviePerformer).where(
                        MoviePerformer.movie_id == Movie.id,
                        MoviePerformer.performer_id == pid,
                    )
                )
            )

    if year_from is not None:
        base = base.where(Movie.release_year >= year_from)
    if year_to is not None:
        base = base.where(Movie.release_year <= year_to)

    if has_playback is True:
        base = base.where(
            exists(
                select(1).where(
                    MoviePlaybackSource.movie_id == Movie.id,
                    MoviePlaybackSource.dead_at.is_(None),
                )
            )
        )

    # Count before ORDER BY / LIMIT are applied.
    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()

    newest_first = Movie.created_at.desc()
    ordering = {
        "created_at": (newest_first,),
        "release_year": (Movie.release_year.desc().nulls_last(), newest_first),
        "release_date": (Movie.release_date.desc().nulls_last(), newest_first),
        "title": (Movie.title_normalized.asc(),),
        "rating": (Movie.rating.desc().nulls_last(), newest_first),
    }[sort]
    # Movie.id is a unique tiebreaker. Without it, rows with equal sort keys
    # (e.g. a bulk import sharing created_at) have no defined order, so
    # LIMIT/OFFSET pages can repeat or skip movies.
    base = base.order_by(*ordering, Movie.id)

    base = base.limit(per_page).offset((page - 1) * per_page)

    movies = session.execute(base).scalars().all()
    # NOTE: _movie_to_out issues several queries per movie.
    items = [_movie_to_out(session, m) for m in movies]

    return MovieListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
# Movie playback origin policy, kept at module level (it used to be defined
# inline on every request; moved out after code-review #19).
# The ranking was set ad hoc on 2026-05-09 by running
# extract_stream_from_hoster on 5 random samples per origin.
_MOVIE_PREFERRED_ORIGINS = (
    "mangoporn:luluvid",  # KVS-based, works
    "mangoporn:mixdrop",  # may work after the domain fix
    "mangoporn:voe",  # yt-dlp occasionally manages to extract it
    "mangoporn",
    "streamporn",
    "pandamovies",
)
# File hosters that can never be stream-extracted without a premium account.
# They are filtered out entirely because they cluttered the list of watch
# options (bug report 2026-05-15). Streamtape was restored on 2026-05-15: it
# has a dedicated extractor and roughly 5% of its URLs are alive.
_MOVIE_DROP_ORIGINS = frozenset({
    "mangoporn:rapidgator",
    "mangoporn:nitroflare",
    "mangoporn:frdl",
})
# Raw landing origins, hidden when the movie also has sub-hoster entries
# (see the comment in _movie_to_out).
_MOVIE_LANDING_HIDE = frozenset({"mangoporn", "pandamovies", "streamporn"})
|
||||||
|
|
||||||
|
|
||||||
|
def _movie_origin_priority(origin: str) -> int:
    """Return the sort rank of a playback origin; lower ranks sort first.

    Origins listed in ``_MOVIE_PREFERRED_ORIGINS`` rank by their position
    there. Every other origin gets the neutral rank 500.
    """
    if origin in _MOVIE_PREFERRED_ORIGINS:
        return _MOVIE_PREFERRED_ORIGINS.index(origin)
    # Unclassified origins sort after all preferred ones.
    return 500
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{movie_id}", response_model=MovieOut)
def get_movie(
    movie_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> MovieOut:
    """Return full details of a single movie.

    Raises:
        HTTPException: 404 when the movie does not exist.
    """
    found = session.get(Movie, movie_id)
    if found is not None:
        return _movie_to_out(session, found)
    raise HTTPException(status_code=404, detail="movie not found")
|
||||||
|
|
||||||
|
|
||||||
|
def _movie_to_out(session: Session, movie: Movie) -> MovieOut:
    """Build the API representation of a movie.

    Loads the studio, performers (by position), tags (by name), chapters (by
    index), external references and live playback sources, and records
    whether the movie is a favorite. Playback sources are filtered and ranked
    by origin.
    """
    studio_out: StudioOut | None = None
    if movie.studio_id:
        studio_row = session.get(Studio, movie.studio_id)
        if studio_row is not None:
            studio_out = StudioOut.model_validate(studio_row)

    performer_stmt = (
        select(Performer, MoviePerformer.as_alias)
        .join(MoviePerformer, MoviePerformer.performer_id == Performer.id)
        .where(MoviePerformer.movie_id == movie.id)
        .order_by(MoviePerformer.position.asc().nulls_last())
    )
    performers = []
    for performer, alias in session.execute(performer_stmt).all():
        performers.append(
            PerformerOut(
                id=performer.id,
                canonical_name=performer.canonical_name,
                slug=performer.slug,
                gender=performer.gender.value if performer.gender else None,
                as_alias=alias,
            )
        )

    tag_stmt = (
        select(Tag)
        .join(MovieTag, MovieTag.tag_id == Tag.id)
        .where(MovieTag.movie_id == movie.id)
        .order_by(Tag.name.asc())
    )
    tags = [TagOut.model_validate(tag) for tag in session.execute(tag_stmt).scalars().all()]

    chapter_stmt = (
        select(MovieChapter)
        .where(MovieChapter.movie_id == movie.id)
        .order_by(MovieChapter.chapter_index.asc())
    )
    chapters = [
        MovieChapterOut.model_validate(chapter)
        for chapter in session.execute(chapter_stmt).scalars().all()
    ]

    ref_stmt = (
        select(MovieExternalRef, Source.name)
        .join(Source, Source.id == MovieExternalRef.source_id)
        .where(MovieExternalRef.movie_id == movie.id)
    )
    external_refs = []
    for ref, source_name in session.execute(ref_stmt).all():
        external_refs.append(
            ExternalRefOut(
                source=source_name,
                external_id=ref.external_id,
                url=ref.url,
                last_seen=ref.last_seen,
            )
        )

    playback_stmt = (
        select(MoviePlaybackSource)
        .where(MoviePlaybackSource.movie_id == movie.id)
        .where(MoviePlaybackSource.dead_at.is_(None))
        .order_by(MoviePlaybackSource.created_at.desc())
    )
    live_sources = [
        src
        for src in session.execute(playback_stmt).scalars().all()
        if src.origin not in _MOVIE_DROP_ORIGINS
    ]
    # Bug report 2026-05-16: raw landing origins (`mangoporn` / `pandamovies` /
    # `streamporn` without a `:host` suffix) opened a WebView full of
    # full-screen ads and confused the user. Hide them when the movie has at
    # least one sub-host entry (origin contains `:`). If it has none (the
    # site's HTML theme changed or the regex missed), keep the landing entry
    # as a last resort.
    if any(":" in src.origin for src in live_sources):
        live_sources = [src for src in live_sources if src.origin not in _MOVIE_LANDING_HIDE]
    # Stable sort: within one priority the newest-first order is preserved.
    live_sources.sort(key=lambda src: _movie_origin_priority(src.origin))
    playback_sources = [PlaybackSourceOut.model_validate(src) for src in live_sources]

    is_fav = session.get(FavoriteMovie, movie.id) is not None

    return MovieOut(
        id=movie.id,
        title=movie.title,
        slug=movie.slug,
        release_year=movie.release_year,
        release_date=movie.release_date,
        duration_sec=movie.duration_sec,
        description=movie.description,
        director=movie.director,
        country=movie.country,
        rating=movie.rating,
        poster_url=movie.poster_url,
        backdrop_url=movie.backdrop_url,
        studio=studio_out,
        performers=performers,
        tags=tags,
        chapters=chapters,
        external_refs=external_refs,
        playback_sources=playback_sources,
        created_at=movie.created_at,
        is_favorite=is_fav,
    )
|
||||||
540
app/api/playback.py
Normal file
540
app/api/playback.py
Normal file
|
|
@ -0,0 +1,540 @@
|
||||||
|
"""POST /scenes/{scene_id}/playback/{playback_id}/resolve — rozwiązuje stream URL.
|
||||||
|
|
||||||
|
Mobile apka woła ten endpoint na klik "Watch" — backend ekstraktuje świeży
|
||||||
|
stream URL (m3u8/mp4) z page tube'a i zwraca go. Mobile otwiera URL przez
|
||||||
|
Linking.openURL → Android player chooser (MX Player / VLC / browser).
|
||||||
|
|
||||||
|
Stream URLs są podpisane/expire (zwykle ~kilka godzin) — nie cache'ujemy ich
|
||||||
|
w DB, tylko resolve on-demand. Logika ekstrakcji per-tube w `app.extractors`.
|
||||||
|
|
||||||
|
**Dead-link detection**: gdy hoster embed page mówi "Video deleted/not found",
|
||||||
|
oznaczamy `PlaybackSource.dead_at = now()` — API dalej go nie listuje, mobile
|
||||||
|
nie pokaże martwego buttonu.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Annotated, Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Request, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.schemas import PlaybackSourceOut
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.extractors import (
|
||||||
|
HosterDead,
|
||||||
|
StreamSource,
|
||||||
|
TubePageError,
|
||||||
|
extract_stream_from_hoster,
|
||||||
|
try_extract,
|
||||||
|
)
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/scenes", tags=["playback"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
# CDN-domain allowlist dla mobile direct fetch — token IS time-bound (nie IP-bound),
|
||||||
|
# zweryfikowane cross-IP curl test 2026-05-18. Mobile ExoPlayer pobiera manifest+segments
|
||||||
|
# bezpośrednio z CDN, **zero VPS bandwidth**. Critical dla public release (TB+/miesiąc).
|
||||||
|
#
|
||||||
|
# Verified time-bound:
|
||||||
|
# - xvideos-cdn.com, xnxx-cdn.com (WGCZ Holding) — signed token w path + exp_time
|
||||||
|
# - phncdn.com (pornhub), ypncdn.com (youporn), rdtcdn.com (redtube) — validfrom+validto+hash
|
||||||
|
# - privatehost.com (pornhat CDN) — sign + exp_time, brak Referer requirement
|
||||||
|
# - sxyprn.com — signed path
|
||||||
|
# - eporner.com CDN — IP literal w path ale CDN go ignoruje
|
||||||
|
#
|
||||||
|
# NIE w allowlist (IP-bound, wymagają proxy):
|
||||||
|
# - premilkyway.com (latestpornvideo) — 403 cross-IP
|
||||||
|
# - tnmr.org (mypornerleak) — 403 cross-IP
|
||||||
|
# - porntrex.com/get_file — single-use token (410 po reuse)
|
||||||
|
# - freshporno.org/get_file — cv= signed token IP-bound
|
||||||
|
# - sn.porn-xp.com, porn00.org — force_proxy explicit
|
||||||
|
_TIME_BOUND_CDN_RE = re.compile(
|
||||||
|
r"\b(?:"
|
||||||
|
r"xvideos-cdn|xnxx-cdn|phncdn|ypncdn|rdtcdn" # mainstream
|
||||||
|
r"|privatehost" # pornhat
|
||||||
|
r")\.[a-z]{2,4}"
|
||||||
|
r"|(?:^|/)(?:sxyprn\.com|[\w\-]+\.eporner\.com)/",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
# IP-BOUND CDN signature — token bind do requester IP, cross-IP fetch = 403.
|
||||||
|
# Wymaga mobile WebView fallback (mobile extract z phone session, nie VPS).
|
||||||
|
# Shared KVS infrastructure across multiple hosters (luluvid movies, mypornerleak,
|
||||||
|
# latestpornvideo) — wszystkie używają tego samego CDN pool.
|
||||||
|
_IP_BOUND_CDN_RE = re.compile(
|
||||||
|
r"\b(?:"
|
||||||
|
r"premilkyway\.com" # latestpornvideo
|
||||||
|
r"|tnmr\.org" # mypornerleak legacy + luluvid movies (cdn-tnmr.org)
|
||||||
|
r"|acek-cdn\.com" # mypornerleak current
|
||||||
|
r")\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StreamLink(BaseModel):
    """A single stream URL variant (one quality / container).

    `stream_url` is the URL of the video file (mp4/m3u8/webm), wrapped by the
    backend proxy (`/proxy/{token}/play.ext`). It is the safe fallback when the
    CDN binds the URL to the extractor's IP (e.g. fpo.xxx with the
    kt_remote_ips cookie); the bandwidth then goes through the VPS.

    `direct_url` + `headers` are the raw CDN URL plus the headers needed to
    fetch it straight from the device. Most tube CDNs (xhamster, redtube,
    watchporn, eporner) serve the content correctly when the mobile player
    sends the `Referer` and `User-Agent` from `headers`. The mobile client
    tries the direct URL FIRST and falls back to `stream_url` (proxy) when the
    CDN answers 403/410 (IP-bound), which keeps VPS bandwidth at zero for most
    scenes.

    `embed_url` is the URL of an embed/hoster page (HTML, e.g. StreamWish,
    doodporn) that the mobile client opens in a WebView. Such links carry
    type 'hoster'.
    """

    # Proxy-wrapped playable URL (None for embed-only links).
    stream_url: str | None = None
    # Hoster/embed HTML page to open in a WebView.
    embed_url: str | None = None
    # URL the device should try first (raw CDN URL or the proxied one).
    direct_url: str | None = None
    # Request headers to send along with `direct_url`.
    headers: dict[str, str] | None = None
    quality: str | None = None
    type: str | None = None  # mime/ext, e.g. 'video/mp4', 'application/x-mpegURL'
    # Extractor metadata/flags passed through unchanged.
    raw: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class ResolveOut(BaseModel):
    """Response body of the playback resolve endpoints."""

    # The playback source that was resolved.
    source: PlaybackSourceOut
    # Preferred link picked from `links`, when there is one.
    best: StreamLink | None = None
    # Every link variant that was produced for the source.
    links: list[StreamLink] = []
|
||||||
|
|
||||||
|
|
||||||
|
movies_router = APIRouter(
|
||||||
|
prefix="/movies", tags=["movies-playback"], dependencies=[Depends(require_api_key)]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@movies_router.post("/{movie_id}/playback/{playback_id}/resolve", response_model=ResolveOut)
def resolve_movie_playback(
    movie_id: uuid.UUID,
    playback_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveOut:
    """Resolve playable links for a movie playback source.

    Movie counterpart of `/scenes/{id}/playback/{pb}/resolve`.

    Origin patterns:
      - 'paradisehill' → page_url only (the Yii2 player requires a logged-in
        session, so the mobile client receives page_url as embed_url and opens
        it in a WebView).
      - 'mangoporn:host', 'streamporn:host', 'pandamovies:host' → embed_url is
        the hoster embed URL (doodstream/lulustream/rpmplay/etc.). A direct
        stream URL is extracted locally with the generic packer
        (`extract_stream_from_hoster`), with an embed-only fallback when that
        fails; the mobile PlayerScreen.WebViewMode then pulls the URL out with
        JS (same as for scenes).

    Raises:
        HTTPException: 404 when the source does not exist or belongs to another
            movie; 410 when it is (or has just been) marked dead; 502 when no
            link at all could be produced.
    """
    from app.models.movie_playback_source import MoviePlaybackSource

    pb = session.get(MoviePlaybackSource, playback_id)
    if pb is None or pb.movie_id != movie_id:
        raise HTTPException(status_code=404, detail="movie playback source not found")
    if pb.dead_at is not None:
        raise HTTPException(
            status_code=410,
            detail=f"playback dead: {pb.dead_reason or 'unknown'}",
        )

    referer = pb.page_url
    candidates: list[StreamLink] = []

    if pb.origin == "paradisehill":
        # WebView fallback only — the paradisehill player needs a login session
        # to hand out the stream.
        candidates = [
            StreamLink(
                stream_url=None,
                embed_url=pb.page_url,
                quality=pb.quality,
                type="hoster",
                raw={"origin": pb.origin},
            )
        ]
    else:
        # dooplay mirror sources: try to extract a direct stream from the hoster URL.
        hoster_url = pb.embed_url or pb.page_url
        extracted: str | None = None
        try:
            extracted = extract_stream_from_hoster(hoster_url, referer=referer)
        except HosterDead as dead:
            # The hoster explicitly says "video deleted" — mark the source dead
            # and do NOT offer the embed fallback (the mobile ExoPlayer would
            # get a 404 HTML page and try to save it as a .bin file; bug-report
            # 2026-05-16 "streamtape downloads .bin files in bulk").
            pb.dead_at = datetime.now(UTC)
            pb.dead_reason = str(dead)[:512]
            session.commit()
            log.info("marked movie playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, dead)
            raise HTTPException(status_code=410, detail=f"playback dead: {dead}") from dead
        except Exception as err:
            # Best effort: any other extraction failure just means "no direct stream".
            log.warning("movie hoster extract failed for %s: %s", hoster_url, err)

        if extracted and _IP_BOUND_CDN_RE.search(extracted):
            # IP-bound CDN (luluvid → cdn-tnmr.org, etc.) — the token is bound
            # to the VPS IP, so a direct fetch from the phone gets 403. Drop the
            # stream and rely on embed_url (mobile WebView).
            log.info(
                "movie playback %s: stream URL IP-bound CDN — skip, WebView fallback",
                pb.id,
            )
            extracted = None

        if extracted:
            lowered = extracted.lower()
            container = "m3u8" if ".m3u8" in lowered else "mp4"
            meta: dict = {"origin": pb.origin, "host": hoster_url}
            # Hosters whose CDN requires a Chrome JA3 fingerprint (mxcontent for
            # mixdrop): the proxy MUST use curl_cffi impersonation or it gets
            # 403. `proxy_impersonate=True` travels through `raw`, and
            # `_proxify_link` passes it on to the proxy token.
            if "mxcontent.net" in lowered:
                meta["proxy_impersonate"] = True
                # Mixdrop: the mp4 needs same-session cookies + Chrome JA3. The
                # backend extraction has already closed its session, so the
                # proxy has to re-fetch the embed page in a fresh curl_cffi
                # session and re-extract the mp4 with current cookies.
                meta["refetch_url"] = hoster_url
                meta["refetch_hoster"] = "mixdrop"
            candidates.append(
                StreamLink(
                    stream_url=extracted,
                    embed_url=None,
                    quality=pb.quality,
                    type=container,
                    raw=meta,
                )
            )

        # Always add the embed as a fallback — the mobile WebView can pick the
        # URL up from JS.
        if pb.embed_url:
            candidates.append(
                StreamLink(
                    stream_url=None,
                    embed_url=pb.embed_url,
                    quality=pb.quality,
                    type="hoster",
                    raw={"origin": pb.origin},
                )
            )

    if not candidates:
        raise HTTPException(status_code=502, detail="no playable links")

    links = [_proxify_link(candidate, referer) for candidate in candidates]
    # `links` is non-empty here, so the best link can be picked directly.
    best = _pick_best(links)
    return ResolveOut(
        source=PlaybackSourceOut.model_validate(pb),
        best=best,
        links=links,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _requester_tag(request: Request) -> str:
|
||||||
|
"""Audit tag dla mark-dead: IP (X-Forwarded-For preferred dla nginx proxy)
|
||||||
|
+ skrócony User-Agent. Zapisywane w dead_reason + log dla post-mortem
|
||||||
|
gdyby leaked APK key był używany do masowego psucia danych."""
|
||||||
|
fwd = request.headers.get("x-forwarded-for", "")
|
||||||
|
ip = fwd.split(",")[0].strip() if fwd else (request.client.host if request.client else "?")
|
||||||
|
ua = (request.headers.get("user-agent") or "")[:40]
|
||||||
|
return f"ip={ip} ua={ua}"
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/{scene_id}/playback/{playback_id}/mark-dead",
    status_code=status.HTTP_204_NO_CONTENT,
)
def mark_playback_dead(
    scene_id: uuid.UUID,
    playback_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """User-triggered mark-dead (long-press on a playback chip in the mobile app).

    Bug-report 2026-05-12 (dd17c709): the user noted that the Eporner failure
    is not temporary and asked for a long-press on the link to remove it.
    Previously the mark-dead flow only ran on resolve failures
    (HosterDead/TubePageError). Users can now flag links that look fine to the
    backend (200 OK) but are broken in practice (e.g. wrongly matched scene,
    ad redirect, hoster serving a placeholder).

    Audit: the requester IP + UA are stored in `dead_reason` and the log so
    that a leaked APK key cannot silently destroy data in bulk without leaving
    a trail.

    Raises:
        HTTPException: 404 when the source does not exist or belongs to
            another scene.
    """
    pb = session.get(PlaybackSource, playback_id)
    if pb is None or pb.scene_id != scene_id:
        raise HTTPException(status_code=404, detail="playback source not found for scene")
    if pb.dead_at is not None:
        # Already dead: keep the original timestamp and reason.
        return

    requester = _requester_tag(request)
    pb.dead_at = datetime.now(UTC)
    pb.dead_reason = f"user-marked dead (mobile long-press) {requester}"[:512]
    session.commit()
    log.info("user marked playback %s dead (origin=%s %s)", pb.id, pb.origin, requester)
|
||||||
|
|
||||||
|
|
||||||
|
@movies_router.post(
    "/{movie_id}/playback/{playback_id}/mark-dead",
    status_code=status.HTTP_204_NO_CONTENT,
)
def mark_movie_playback_dead(
    movie_id: uuid.UUID,
    playback_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """User-triggered mark-dead for a movie playback (long-press in MovieDetail).

    Stores the requester tag (IP + User-Agent) in `dead_reason` and the log.

    Raises:
        HTTPException: 404 when the source does not exist or belongs to
            another movie.
    """
    from app.models.movie_playback_source import MoviePlaybackSource

    pb = session.get(MoviePlaybackSource, playback_id)
    if pb is None or pb.movie_id != movie_id:
        raise HTTPException(status_code=404, detail="movie playback source not found")
    if pb.dead_at is not None:
        # Already dead: keep the original timestamp and reason.
        return

    requester = _requester_tag(request)
    pb.dead_at = datetime.now(UTC)
    pb.dead_reason = f"user-marked dead (mobile long-press) {requester}"[:512]
    session.commit()
    log.info("user marked movie playback %s dead (origin=%s %s)", pb.id, pb.origin, requester)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{scene_id}/playback/{playback_id}/resolve", response_model=ResolveOut)
def resolve_playback(
    scene_id: uuid.UUID,
    playback_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> ResolveOut:
    """Resolve fresh stream links for a scene playback source.

    Only `pornapp:<sitetag>` (legacy) and `tube:<sitetag>` origins are
    supported; the sitetag selects the extractor run against the source's
    page URL.

    Raises:
        HTTPException: 404 when the source does not exist or belongs to
            another scene; 410 when it is (or has just been) marked dead;
            501 for unsupported origins; 502 when the tube page fetch fails
            with a status other than 404/410; 503 when the extractor
            returned nothing.
    """
    pb = session.get(PlaybackSource, playback_id)
    if pb is None or pb.scene_id != scene_id:
        raise HTTPException(status_code=404, detail="playback source not found for scene")
    if pb.dead_at is not None:
        raise HTTPException(
            status_code=410,
            detail=f"playback source marked dead: {pb.dead_reason or 'unknown'}",
        )

    page_url = pb.page_url

    # `pornapp:` is the legacy origin format from before the pornapp-removal
    # migration; after Phase 2 only `tube:` will remain.
    scheme, colon, remainder = pb.origin.partition(":")
    sitetag: str | None = remainder if colon and scheme in ("pornapp", "tube") else None
    if sitetag is None:
        raise HTTPException(
            status_code=501,
            detail=f"resolve not implemented for origin '{pb.origin}'",
        )

    try:
        sources = try_extract(sitetag, page_url)
    except HosterDead as dead:
        pb.dead_at = datetime.now(UTC)
        pb.dead_reason = str(dead)[:512]
        session.commit()
        log.info("marked playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, dead)
        raise HTTPException(status_code=410, detail=f"playback dead: {dead}") from dead
    except TubePageError as page_err:
        if page_err.status_code not in (404, 410):
            # Any other HTTP failure is reported upstream as 502.
            log.warning("tube fetch http error %s for %s", page_err.status_code, pb.page_url)
            raise HTTPException(
                status_code=502,
                detail=f"tube fetch failed: HTTP {page_err.status_code}",
            ) from page_err
        # The tube page is gone (404/410) — mark dead and propagate as 410.
        reason = f"tube page {page_err.status_code} {pb.page_url}"
        pb.dead_at = datetime.now(UTC)
        pb.dead_reason = reason[:512]
        session.commit()
        log.info("marked playback %s dead (origin=%s reason=%s)", pb.id, pb.origin, reason)
        raise HTTPException(status_code=410, detail=f"playback dead: {reason}") from page_err

    if not sources:
        # The extractor returned nothing — a TRANSIENT failure (network glitch,
        # temporary 503 from the tube, changed ad-network response, race
        # condition). `dead_at` is NOT set here: doing so used to produce
        # false-positive permanent deaths for freshporno scenes that worked on
        # the next attempt (bug-report 2026-05-12).
        #
        # Permanent death comes ONLY from explicit signals:
        #   - HosterDead exception (hoster page says "video deleted")
        #   - TubePageError 404/410 (page does not exist)
        # Everything else is transient; the client gets 503 and may retry.
        log.info(
            "extractor None for playback %s (origin=%s) — transient, not marking dead",
            pb.id, pb.origin,
        )
        # 503 (not 410!) so the mobile app does NOT show "the tube removed this
        # video" — that code is reserved for permanent removal.
        # Sentry filters HTTPException 502/503/504 in `_sentry_before_send`
        # (main.py); without that, GOON-3 flooded the issue list (16 events/5h
        # for an expected case).
        raise HTTPException(
            status_code=503,
            detail="extraction failed temporarily — retry possible",
        )

    # Per-source referer: some extractors (yt-dlp, embed-iframe) return a stream
    # URL whose CDN expects the Referer of the embed page (the iframe host), not
    # of the original tube page. E.g. 0dayxx page → watchporn.to/embed iframe →
    # the stream URL wants `Referer: watchporn.to/` (with `Referer: 0dayxx.com`
    # the CDN answers 410). StreamSource.referer carries that information; fall
    # back to page_url when the extractor did not set it.
    links = [
        _proxify_link(_stream_source_to_link(src), src.referer or page_url)
        for src in sources
    ]
    # `_pick_best` returns None for an empty list, so no extra emptiness check
    # is needed.
    best = _pick_best(links)
    return ResolveOut(
        source=PlaybackSourceOut.model_validate(pb),
        best=best,
        links=links,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# User-Agent advertised to CDNs in the `headers` returned with each stream link.
DEFAULT_PLAYER_UA = " ".join(
    (
        "Mozilla/5.0 (Linux; Android 13)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/140.0.0.0 Mobile Safari/537.36",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
def _proxify_link(link: StreamLink, referer: str) -> StreamLink:
    """Enrich a StreamLink with proxy and direct-fetch information.

    The returned link carries:
      - `stream_url`: the proxy URL (fallback when the direct fetch fails)
      - `direct_url`: the raw CDN URL (preferred — no VPS bandwidth), or the
        proxy URL when a direct fetch cannot work
      - `headers`: Referer + User-Agent for the direct fetch

    The mobile player tries the direct URL FIRST and falls back to
    `stream_url` after a 403/410 error. Links without a `stream_url`
    (embed-only) are returned unchanged.
    """
    if not link.stream_url:
        return link
    from app.api.stream_proxy import make_token

    cdn_url = link.stream_url
    flags = link.raw or {}

    # Extractor flags carried in `raw`:
    #   - proxy_impersonate: curl_cffi Chrome JA3 (mxcontent etc.)
    #   - refetch_url: embed URL to re-extract from when the token has expired
    #     (same-session cookie binding for mixdrop). Without it: mp4 token +
    #     missing cookies → 403.
    impersonate = bool(flags.get("proxy_impersonate"))
    # force_proxy=True → direct_url is the proxied URL right away. For CDNs
    # whose token IS bound to the VPS IP (porn00 v-acctoken, pornxp
    # sv.porn-xp.com signed path) a direct fetch from the phone ALWAYS gets
    # 403, so trying it is pointless; without this flag every playback would
    # "blink" (direct fails → fallback to proxy).
    proxy_only = bool(flags.get("force_proxy"))
    # mobile_direct_ok=True → a manifest may go direct to the phone because the
    # CDN URL has a time-bound (not IP-bound) signed token. When the flag is
    # not set, time-bound CDNs are auto-detected by domain so extractors need
    # no per-site flag: all mainstream tubes (xvideos/xnxx/pornhub/youporn/
    # redtube + pornhat) return time-bound URLs that work cross-IP.
    direct_allowed = bool(flags.get("mobile_direct_ok")) or bool(
        _TIME_BOUND_CDN_RE.search(cdn_url)
    )

    token = make_token(
        cdn_url,
        referer,
        impersonate=impersonate,
        refresh=flags.get("refetch_url"),
        refresh_hoster=flags.get("refetch_hoster"),
    )

    # The extension is decided from link.type (trusting the extractor), with a
    # path hint as fallback. Pornhat: the raw URL is `.../get_file/.../<id>.mp4/`
    # but the CDN 302-redirects to an HLS manifest, so the extractor marks
    # type='m3u8' to make ExoPlayer use HlsMediaSource (otherwise the `.mp4`
    # path misled the player → "no extractors").
    declared = (link.type or "").lower()
    manifest_ext = {"m3u8": "m3u8", "hls": "m3u8", "mpd": "mpd"}.get(declared)
    declared_manifest = manifest_ext is not None
    if declared_manifest:
        ext = manifest_ext
    else:
        lowered_url = cdn_url.lower()
        if ".m3u8" in lowered_url:
            ext = "m3u8"
        elif ".mpd" in lowered_url:
            ext = "mpd"
        else:
            ext = "mp4"
    proxied = f"/proxy/{token}/play.{ext}"

    # `direct_url` is normally the raw CDN URL, tried FIRST by the phone (no VPS
    # bandwidth). Exceptions that get the proxied URL instead:
    #   - declared m3u8/hls/mpd manifests: the manifest URL must be rewritten so
    #     that segments/keys also go through the proxy, and ExoPlayer picks its
    #     extractor by URL extension (`.mp4` in a pornhat direct URL →
    #     Mp4Extractor → failure because the content is HLS). One hop through
    #     the VPS keeps manifest + segments consistent.
    #   - CDNs that require Chrome JA3 (mxcontent): a direct OkHttp fetch gets
    #     403 → extra round-trip + ExoPlayer "no extractors" before the retry.
    #   - extractor-forced proxying.
    # `direct_allowed` overrides the manifest default: with a time-bound token
    # ExoPlayer can fetch the manifest directly without the VPS proxy.
    needs_proxy = impersonate or proxy_only or (declared_manifest and not direct_allowed)

    return StreamLink(
        stream_url=proxied,
        embed_url=link.embed_url,
        direct_url=proxied if needs_proxy else cdn_url,
        headers={"Referer": referer, "User-Agent": DEFAULT_PLAYER_UA},
        quality=link.quality,
        type=link.type,
        raw=link.raw,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _stream_source_to_link(s: StreamSource) -> StreamLink:
    """Map a StreamSource (from the extractors) onto a StreamLink (API response).

    Hoster type → `embed_url` (the mobile client opens a WebView).
    mp4/m3u8/mpd → `stream_url` (played in the native player through /proxy).
    """
    if (s.type or "").lower() == "hoster":
        playable, embed = None, s.link
    else:
        playable, embed = s.link, None
    return StreamLink(
        stream_url=playable,
        embed_url=embed,
        quality=s.quality,
        type=s.type,
        raw=s.raw,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _pick_best(links: list[StreamLink]) -> StreamLink | None:
    """Pick the best-quality link.

    Preferences:
      1. Direct video first (`stream_url` set); embed-only links are considered
         only when no link has a direct URL (the mobile app then shows
         "Open in browser").
      2. Highest quality (parsed as an int from '720p' / '1080p' / '4k').
      3. mp4 over m3u8 at the same quality (mp4 is easier for MX Player).

    Returns None when there is neither a direct nor an embed link. On ties the
    earliest link in `links` wins.
    """
    candidates = [item for item in links if item.stream_url]
    if not candidates:
        candidates = [item for item in links if item.embed_url]
    if not candidates:
        return None

    def rank(item: StreamLink) -> tuple[int, int]:
        # (resolution, 1 if the link looks like a plain mp4 else 0)
        url = (item.stream_url or item.embed_url or "").lower()
        kind = (item.type or "").lower()
        looks_mp4 = ".mp4" in url or "mp4" in kind or "direct" in kind
        return (_quality_to_int(item.quality), 1 if looks_mp4 else 0)

    return max(candidates, key=rank)
|
||||||
|
|
||||||
|
|
||||||
|
_QUALITY_DIGITS_RE = re.compile(r"\d+")
|
||||||
|
|
||||||
|
|
||||||
|
def _quality_to_int(q: str | None) -> int:
|
||||||
|
"""Wyciąga liczbę pikseli z różnych formatów: '720p', '1080p Full HD', '4K', 'HD'."""
|
||||||
|
if not q:
|
||||||
|
return 0
|
||||||
|
s = q.lower().strip()
|
||||||
|
if "4k" in s or "uhd" in s:
|
||||||
|
return 2160
|
||||||
|
if "2k" in s or "qhd" in s:
|
||||||
|
return 1440
|
||||||
|
m = _QUALITY_DIGITS_RE.search(s)
|
||||||
|
if m:
|
||||||
|
return int(m.group(0))
|
||||||
|
if "fhd" in s:
|
||||||
|
return 1080
|
||||||
|
if "hd" in s:
|
||||||
|
return 720
|
||||||
|
if "sd" in s:
|
||||||
|
return 480
|
||||||
|
return 0
|
||||||
83
app/api/scene_favorites.py
Normal file
83
app/api/scene_favorites.py
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
"""Scene favorites — ulubione sceny (single-user, równolegle do /favorites/performers).
|
||||||
|
|
||||||
|
Endpointy:
|
||||||
|
GET /scene-favorites — lista ulubionych scen (pełen SceneOut)
|
||||||
|
POST /scene-favorites/{scene_id} — dodaj (idempotent)
|
||||||
|
DELETE /scene-favorites/{scene_id} — usuń
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.scenes import _build_scene_out
|
||||||
|
from app.api.schemas import SceneOut
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.favorite_scene import FavoriteScene
|
||||||
|
from app.models.scene import Scene
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/scene-favorites",
|
||||||
|
tags=["scene-favorites"],
|
||||||
|
dependencies=[Depends(require_api_key)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SceneFavoriteListOut(BaseModel):
    """Response body of GET /scene-favorites."""

    # Favourited scenes as full SceneOut objects.
    items: list[SceneOut]
    # Number of entries in `items`.
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
class SceneFavoriteToggleOut(BaseModel):
    """Response body returned after adding a scene to the favourites."""

    # Scene the operation applied to.
    scene_id: uuid.UUID
    # Whether the scene is a favourite after the call.
    favorited: bool
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=SceneFavoriteListOut)
def list_scene_favorites(
    session: Annotated[Session, Depends(get_session)],
) -> SceneFavoriteListOut:
    """List favourite scenes, most recently favourited first."""
    stmt = (
        select(Scene, FavoriteScene)
        .join(FavoriteScene, FavoriteScene.scene_id == Scene.id)
        .order_by(FavoriteScene.created_at.desc())
    )
    # Each row is (Scene, FavoriteScene); only the scene is needed for output.
    items = [_build_scene_out(session, row[0]) for row in session.execute(stmt).all()]
    return SceneFavoriteListOut(items=items, total=len(items))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/{scene_id}",
    response_model=SceneFavoriteToggleOut,
    status_code=status.HTTP_201_CREATED,
)
def add_scene_favorite(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneFavoriteToggleOut:
    """Add a scene to the favourites (idempotent).

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")
    # Insert only when the scene is not a favourite yet, so repeated calls are
    # harmless.
    # NOTE(review): there is no explicit commit here — presumably `get_session`
    # commits on exit; confirm.
    if session.get(FavoriteScene, scene_id) is None:
        session.add(FavoriteScene(scene_id=scene_id))
    return SceneFavoriteToggleOut(scene_id=scene_id, favorited=True)
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{scene_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_scene_favorite(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove a scene from the favourites; a no-op when it is not a favourite."""
    favorite = session.get(FavoriteScene, scene_id)
    if favorite is None:
        return
    # NOTE(review): there is no explicit commit here — presumably `get_session`
    # commits on exit; confirm.
    session.delete(favorite)
|
||||||
960
app/api/scenes.py
Normal file
960
app/api/scenes.py
Normal file
|
|
@ -0,0 +1,960 @@
|
||||||
|
"""GET /scenes — lista i szczegóły scen z bazy kanonicznej."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy import distinct, exists, func, select
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
|
||||||
|
from app.api.schemas import (
|
||||||
|
ExternalRefOut,
|
||||||
|
PerformerOut,
|
||||||
|
PlaybackSourceOut,
|
||||||
|
SceneListOut,
|
||||||
|
SceneOut,
|
||||||
|
StudioOut,
|
||||||
|
TagOut,
|
||||||
|
)
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.favorite_scene import FavoriteScene
|
||||||
|
from app.models.performer import Performer
|
||||||
|
from app.models.play_progress import ScenePlayProgress
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
|
from app.models.scene import Scene, SceneExternalRef, ScenePerformer, SceneTag
|
||||||
|
from app.models.source import Source, SourceKind
|
||||||
|
from app.models.studio import Studio
|
||||||
|
from app.models.tag import Tag
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
|
||||||
|
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}
|
||||||
|
|
||||||
|
|
||||||
|
def _split_csv(raw: str | None) -> list[str]:
|
||||||
|
if not raw:
|
||||||
|
return []
|
||||||
|
return [s.strip() for s in raw.split(",") if s.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=SceneListOut)
def list_scenes(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="Wyszukiwanie po title_normalized (trgm)"),
    studio_slug: str | None = Query(default=None, description="DEPRECATED — użyj studio_slugs"),
    studio_slugs: str | None = Query(
        default=None, description="Comma-separated studio slugs (OR)"
    ),
    tags: str | None = Query(
        default=None,
        description="Comma-separated tag slugs (AND — scena musi mieć wszystkie wybrane tagi)",
    ),
    performer_ids: str | None = Query(
        default=None,
        description="Comma-separated performer UUIDs (AND — scena musi mieć wszystkich wybranych performerów)",
    ),
    has_playback: bool | None = Query(
        default=None, description="True: tylko sceny z ≥1 playback_source"
    ),
    has_animated_thumbnail: bool | None = Query(
        default=None,
        description="True: tylko sceny z ≥1 playback_source z animated_thumbnail_url (hold-to-preview)",
    ),
    min_duration_sec: int | None = Query(default=None, ge=0),
    max_duration_sec: int | None = Query(default=None, ge=0),
    released_within_days: int | None = Query(
        default=None, ge=1,
        description="Tylko sceny released w ostatnich N dniach",
    ),
    min_quality_p: int | None = Query(
        default=None, ge=1,
        description=(
            "Minimum quality (pixele wysokości — 2160 = 4K, 1080 = FullHD). Filtruje "
            "po PlaybackSource.quality (string typu '720p' / '1080p Full HD')."
        ),
    ),
    include_stubs: bool = Query(
        default=False,
        description=(
            "False (default): ukrywa sceny-szkielety bez release_date, < 10min, "
            "z jedynym playback z hqporner (~7-min Brazzers trailer clipy zalewają katalog)."
        ),
    ),
    sort: str = Query(default="created_at", description="created_at|release_date|title|studio"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=200),
) -> SceneListOut:
    """List scenes with filtering, sorting and pagination.

    All filters are combined with AND. ``tags`` and ``performer_ids`` require
    the scene to match every listed value; ``studio_slugs`` (plus the
    deprecated ``studio_slug``) matches any of the listed studios. Scenes that
    touch a blacklisted performer, studio or tag are always excluded.

    Returns:
        A ``SceneListOut`` holding the requested page and ``total``. For the
        unfiltered default listing ``total`` is an approximation (see the
        count section in the body); for every filtered request it is the
        exact size of the filtered result set.

    Raises:
        HTTPException: 400 for an unknown ``sort`` value or a malformed
            performer UUID.
    """
    if sort not in _VALID_SORTS:
        raise HTTPException(status_code=400, detail=f"sort must be one of {sorted(_VALID_SORTS)}")

    base = select(Scene)

    if q:
        base = base.where(Scene.title_normalized.ilike(f"%{q.lower()}%"))

    studio_slug_list = _split_csv(studio_slugs)
    if studio_slug:
        studio_slug_list.append(studio_slug)
    if studio_slug_list:
        base = base.where(
            Scene.studio_id.in_(
                select(Studio.id).where(Studio.slug.in_(studio_slug_list))
            )
        )

    tag_slug_list = _split_csv(tags)
    # AND between tags: the scene must carry ALL selected tags. One EXISTS per
    # slug, so ticking more filters narrows the result.
    for slug in tag_slug_list:
        base = base.where(
            exists(
                select(1)
                .select_from(SceneTag)
                .join(Tag, Tag.id == SceneTag.tag_id)
                .where(SceneTag.scene_id == Scene.id, Tag.slug == slug)
            )
        )

    perf_id_strings = _split_csv(performer_ids)
    if perf_id_strings:
        try:
            perf_ids = [uuid.UUID(s) for s in perf_id_strings]
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"invalid performer UUID: {e}") from e
        # AND between performers (same approach as for tags).
        for pid in perf_ids:
            base = base.where(
                exists(
                    select(1)
                    .select_from(ScenePerformer)
                    .where(
                        ScenePerformer.scene_id == Scene.id,
                        ScenePerformer.performer_id == pid,
                    )
                )
            )

    if has_playback is True:
        # Only scenes with at least one LIVE playback_source.
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )
    elif has_playback is False:
        base = base.where(
            ~exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                )
            )
        )

    # Blacklists — global exclusions. A scene is dropped when it has ANY
    # blacklisted performer, sits on a blacklisted studio, or has ANY
    # blacklisted tag.
    from app.models.blacklist import (
        BlacklistedPerformer,
        BlacklistedStudio,
        BlacklistedTag,
    )
    base = base.where(
        ~exists(
            select(1)
            .select_from(ScenePerformer)
            .join(BlacklistedPerformer, BlacklistedPerformer.performer_id == ScenePerformer.performer_id)
            .where(ScenePerformer.scene_id == Scene.id)
        )
    )
    # NOT EXISTS instead of NOT IN: with `studio_id NOT IN (subquery)` a scene
    # whose studio_id is NULL evaluates to NULL (not TRUE) as soon as the
    # blacklist holds a single row, which would silently hide every scene
    # that has no studio.
    base = base.where(
        ~exists(
            select(1).where(BlacklistedStudio.studio_id == Scene.studio_id)
        )
    )
    base = base.where(
        ~exists(
            select(1)
            .select_from(SceneTag)
            .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
            .where(SceneTag.scene_id == Scene.id)
        )
    )

    if has_animated_thumbnail:
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.animated_thumbnail_url.isnot(None),
                )
            )
        )

    if min_duration_sec is not None:
        base = base.where(Scene.duration_sec >= min_duration_sec)
    if max_duration_sec is not None:
        base = base.where(Scene.duration_sec <= max_duration_sec)

    if released_within_days is not None:
        from datetime import date, timedelta
        cutoff = date.today() - timedelta(days=released_within_days)
        base = base.where(Scene.release_date >= cutoff)

    if min_quality_p is not None:
        # PlaybackSource.quality is a free-form string — take the first run of
        # digits ('1080p', '1080p Full HD', '2160p'). Heuristic: it is enough
        # that the scene has ONE live playback whose numeric quality is >= min.
        # '4K' / 'UHD' are treated as aliases for 2160.
        from sqlalchemy import Integer, cast, or_
        numeric_q = cast(
            func.coalesce(func.substring(PlaybackSource.quality, r"\d+"), "0"),
            Integer,
        )
        conds = [numeric_q >= min_quality_p]
        if min_quality_p <= 2160:
            conds.append(PlaybackSource.quality.ilike("%4k%"))
            conds.append(PlaybackSource.quality.ilike("%uhd%"))
        base = base.where(
            exists(
                select(1).where(
                    PlaybackSource.scene_id == Scene.id,
                    PlaybackSource.dead_at.is_(None),
                    PlaybackSource.quality.isnot(None),
                    or_(*conds),
                )
            )
        )

    if not include_stubs:
        # Stub heuristic: a tube-only scene WITHOUT release_date AND WITHOUT a
        # canonical (TPDB/StashDB) external ref AND WITHOUT any ScenePerformer
        # link. Per the original author's note, ScenePerformer links are added
        # by the continuous worker, so per-performer search results are never
        # stubs; this only filters anonymous tube-only scenes.
        canonical_exists = exists(
            select(1)
            .select_from(SceneExternalRef)
            .join(Source, Source.id == SceneExternalRef.source_id)
            .where(SceneExternalRef.scene_id == Scene.id)
            .where(Source.kind.in_([SourceKind.tpdb, SourceKind.stashdb]))
        )
        has_performer = exists(
            select(1).where(ScenePerformer.scene_id == Scene.id)
        )
        # NOT a stub when: has a canonical ref OR a release_date OR a performer.
        base = base.where(
            Scene.release_date.is_not(None) | canonical_exists | has_performer
        )

    # Count. Per the original author's note, on large databases (~400k scenes)
    # a full count with the nested EXISTS clauses takes ~5s, so the plain
    # default listing uses a cheaper, approximate count. The shortcut is valid
    # only when NO narrowing filter is active — otherwise `total` would be
    # unrelated to the filtered list (e.g. has_playback=False would report the
    # number of scenes that DO have playback).
    narrowing_filter_active = (
        bool(q)
        or bool(studio_slug_list)
        or bool(tag_slug_list)
        or bool(perf_id_strings)
        or has_playback is False
        or bool(has_animated_thumbnail)
        or min_duration_sec is not None
        or max_duration_sec is not None
        or released_within_days is not None
        or min_quality_p is not None
    )
    if not include_stubs and not narrowing_filter_active:
        # Fast path: count only scenes with a live playback source
        # (a single EXISTS).
        count_query = select(func.count()).select_from(
            select(Scene.id).where(
                exists(
                    select(1).where(
                        PlaybackSource.scene_id == Scene.id,
                        PlaybackSource.dead_at.is_(None),
                    )
                )
            ).subquery()
        )
        total = session.execute(count_query).scalar_one()
    else:
        total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()

    # Sort: always tie-break on created_at desc to keep pagination stable.
    if sort == "release_date":
        ordered = base.order_by(
            Scene.release_date.desc().nullslast(), Scene.created_at.desc()
        )
    elif sort == "title":
        ordered = base.order_by(Scene.title_normalized.asc(), Scene.created_at.desc())
    elif sort == "studio":
        # Scenes without a studio go last; within a studio, newest first.
        ordered = (
            base.outerjoin(Studio, Studio.id == Scene.studio_id)
            .order_by(
                Studio.name_normalized.asc().nullslast(),
                Scene.release_date.desc().nullslast(),
                Scene.created_at.desc(),
            )
        )
    else:  # created_at
        ordered = base.order_by(
            Scene.created_at.desc(), Scene.release_date.desc().nullslast()
        )

    rows = (
        session.execute(ordered.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )

    items = _build_scenes_out_batch(session, list(rows))

    return SceneListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{scene_id}", response_model=SceneOut)
def get_scene(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> SceneOut:
    # Primary-key lookup of one scene; an unknown id is reported as HTTP 404,
    # otherwise the full SceneOut is assembled by _build_scene_out.
    found = session.get(Scene, scene_id)
    if found is not None:
        return _build_scene_out(session, found)
    raise HTTPException(status_code=404, detail="scene not found")
|
||||||
|
|
||||||
|
|
||||||
|
def _needs_proxy(url: str) -> bool:
|
||||||
|
"""Wszystkie thumbnaile z playback_sources są proxowane przez backend.
|
||||||
|
Większość CDN-ów porn-tube'ów wymaga Refera (hqporner, mypornerleak/58img,
|
||||||
|
inne sxyprn/eporner CDN-y) — expo-image nie wysyła Referera.
|
||||||
|
Self-hosted lub backend-internal URL-e (zaczynające się od `/`) skipujemy."""
|
||||||
|
return url.startswith("http") and not url.startswith("/proxy/")
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_image_proxy(url: str, referer: str) -> str:
    """Wrap a thumbnail URL as ``/proxy/img/{token}/img.{ext}``.

    The client never needs to know the Referer: it is packed into the token
    together with the upstream URL. The token TTL is 30 days; per the original
    author's note thumbnails are stable, so a short TTL would only churn
    caches.

    Args:
        url: Upstream thumbnail URL.
        referer: Referer value to bind into the token.

    Returns:
        Backend-relative proxy path. The extension is taken from the URL's
        path component and falls back to ``jpg`` when the path has none.
    """
    import posixpath
    from urllib.parse import urlsplit

    from app.api.stream_proxy import make_token

    token = make_token(url, referer, ttl_sec=30 * 24 * 3600)
    # Keep the original extension in the proxy path so the content type can be
    # recognised. Only the URL *path* is inspected: looking at the raw URL
    # would leak a `#fragment` into the extension and would misread the host
    # TLD ("com") as an extension for URLs that have no path at all.
    try:
        path = urlsplit(url).path
    except ValueError:
        # Malformed URL: fall back to a plain query strip rather than failing
        # the whole response over a thumbnail.
        path = url.split("?", 1)[0]
    ext = posixpath.splitext(path)[1].lstrip(".") or "jpg"
    return f"/proxy/img/{token}/img.{ext}"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_scenes_out_batch(session: Session, scenes: list[Scene]) -> list[SceneOut]:
    """Build ``SceneOut`` objects for many scenes using one query per relation.

    Every relation (studios, performers, tags, external refs, playback
    sources, play progress, favorites) is loaded once with an ``IN`` filter
    over all scene ids and grouped in memory, instead of being queried per
    scene as `_build_scene_out` does. Per the original author's note this
    took `/scenes?per_page=24` from ~9.6s to <500ms.

    The result preserves the order of ``scenes``; an empty input returns an
    empty list without touching the database.
    """
    from collections import defaultdict

    if not scenes:
        return []

    scene_ids = [sc.id for sc in scenes]
    wanted_studio_ids = list({sc.studio_id for sc in scenes if sc.studio_id is not None})

    # 1) Studios — skipped entirely when no scene references one.
    studio_lookup: dict = {}
    if wanted_studio_ids:
        studio_query = select(Studio).where(Studio.id.in_(wanted_studio_ids))
        for studio in session.execute(studio_query).scalars():
            studio_lookup[studio.id] = studio

    # 2) Performers, ordered by their position (NULL positions last).
    performer_query = (
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id.in_(scene_ids))
        .order_by(ScenePerformer.position.asc().nullslast())
    )
    performers_for: dict = defaultdict(list)
    for link, performer in session.execute(performer_query).all():
        performers_for[link.scene_id].append(
            PerformerOut(
                id=performer.id,
                canonical_name=performer.canonical_name,
                slug=performer.slug,
                gender=performer.gender.value if performer.gender else None,
                as_alias=link.as_alias,
            )
        )

    # 3) Tags
    tag_query = (
        select(SceneTag.scene_id, Tag)
        .join(Tag, Tag.id == SceneTag.tag_id)
        .where(SceneTag.scene_id.in_(scene_ids))
    )
    tags_for: dict = defaultdict(list)
    for owner_id, tag_row in session.execute(tag_query).all():
        tags_for[owner_id].append(TagOut.model_validate(tag_row))

    # 4) External refs together with their source
    ref_query = (
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id.in_(scene_ids))
    )
    refs_for: dict = defaultdict(list)
    for ext_ref, origin_source in session.execute(ref_query).all():
        refs_for[ext_ref.scene_id].append(
            ExternalRefOut(
                source=origin_source.name,
                external_id=ext_ref.external_id,
                url=ext_ref.url,
                last_seen=ext_ref.last_seen,
            )
        )

    # 5) Live playback sources; thumbnails are rewritten to go through the
    #    image proxy, with the source's page URL as the Referer.
    playback_query = (
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id.in_(scene_ids),
            PlaybackSource.dead_at.is_(None),
        )
        .order_by(PlaybackSource.origin.asc())
    )
    playback_for: dict = defaultdict(list)
    for pb_row in session.execute(playback_query).scalars().all():
        pb_out = PlaybackSourceOut.model_validate(pb_row)
        if pb_out.thumbnail_url and _needs_proxy(pb_out.thumbnail_url):
            pb_out.thumbnail_url = _wrap_image_proxy(pb_out.thumbnail_url, pb_row.page_url)
        if pb_out.animated_thumbnail_url and _needs_proxy(pb_out.animated_thumbnail_url):
            pb_out.animated_thumbnail_url = _wrap_image_proxy(
                pb_out.animated_thumbnail_url, pb_row.page_url
            )
        playback_for[pb_row.scene_id].append(pb_out)

    # 6) Play progress
    progress_query = select(ScenePlayProgress).where(ScenePlayProgress.scene_id.in_(scene_ids))
    progress_for: dict = {}
    for progress_row in session.execute(progress_query).scalars():
        progress_for[progress_row.scene_id] = progress_row

    # 7) Favorites
    favorite_query = select(FavoriteScene.scene_id).where(
        FavoriteScene.scene_id.in_(scene_ids)
    )
    favorite_ids: set = set(session.execute(favorite_query).scalars())

    # Assemble in input order.
    assembled: list[SceneOut] = []
    for sc in scenes:
        studio_out = None
        if sc.studio_id is not None and sc.studio_id in studio_lookup:
            studio_out = StudioOut.model_validate(studio_lookup[sc.studio_id])
        progress = progress_for.get(sc.id)
        assembled.append(
            SceneOut(
                id=sc.id,
                title=sc.title,
                slug=sc.slug,
                release_date=sc.release_date,
                duration_sec=sc.duration_sec,
                description=sc.description,
                code=sc.code,
                director=sc.director,
                studio=studio_out,
                performers=performers_for.get(sc.id, []),
                tags=tags_for.get(sc.id, []),
                external_refs=refs_for.get(sc.id, []),
                playback_sources=playback_for.get(sc.id, []),
                created_at=sc.created_at,
                last_played_at=progress.last_played_at if progress else None,
                finished=progress.finished if progress else False,
                position_sec=progress.position_sec if progress else 0,
                is_favorite=sc.id in favorite_ids,
            )
        )
    return assembled
|
||||||
|
|
||||||
|
|
||||||
|
def _build_scene_out(session: Session, scene: Scene) -> SceneOut:
    """Build the full ``SceneOut`` for a single scene.

    Loads the studio, performers (ordered by position, NULLs last), tags,
    external refs, live playback sources, play progress and favorite flag for
    this one scene, each with its own query.
    """
    # Studio is optional; a dangling studio_id also yields None.
    studio_row = session.get(Studio, scene.studio_id) if scene.studio_id is not None else None
    studio_out: StudioOut | None = (
        StudioOut.model_validate(studio_row) if studio_row is not None else None
    )

    performer_query = (
        select(ScenePerformer, Performer)
        .join(Performer, Performer.id == ScenePerformer.performer_id)
        .where(ScenePerformer.scene_id == scene.id)
        .order_by(ScenePerformer.position.asc().nullslast())
    )
    performers_out: list[PerformerOut] = [
        PerformerOut(
            id=person.id,
            canonical_name=person.canonical_name,
            slug=person.slug,
            gender=person.gender.value if person.gender else None,
            as_alias=link.as_alias,
        )
        for link, person in session.execute(performer_query).all()
    ]

    tag_query = (
        select(Tag).join(SceneTag, SceneTag.tag_id == Tag.id).where(SceneTag.scene_id == scene.id)
    )
    tags_out = [TagOut.model_validate(tag_row) for tag_row in session.execute(tag_query).scalars().all()]

    ref_query = (
        select(SceneExternalRef, Source)
        .join(Source, Source.id == SceneExternalRef.source_id)
        .where(SceneExternalRef.scene_id == scene.id)
    )
    refs_out = []
    for ext_ref, origin_source in session.execute(ref_query).all():
        refs_out.append(
            ExternalRefOut(
                source=origin_source.name,
                external_id=ext_ref.external_id,
                url=ext_ref.url,
                last_seen=ext_ref.last_seen,
            )
        )

    playback_query = (
        select(PlaybackSource)
        .where(
            PlaybackSource.scene_id == scene.id,
            PlaybackSource.dead_at.is_(None),  # hide dead links
        )
        .order_by(PlaybackSource.origin.asc())
    )
    playback_out: list[PlaybackSourceOut] = []
    for pb_row in session.execute(playback_query).scalars().all():
        pb_out = PlaybackSourceOut.model_validate(pb_row)
        # Thumbnail URLs are routed through the backend image proxy because,
        # per the original author's note, some CDNs answer 403 without a
        # Referer header and the mobile image component does not send one.
        if pb_out.thumbnail_url and _needs_proxy(pb_out.thumbnail_url):
            pb_out.thumbnail_url = _wrap_image_proxy(pb_out.thumbnail_url, pb_row.page_url)
        if pb_out.animated_thumbnail_url and _needs_proxy(pb_out.animated_thumbnail_url):
            pb_out.animated_thumbnail_url = _wrap_image_proxy(
                pb_out.animated_thumbnail_url, pb_row.page_url
            )
        playback_out.append(pb_out)

    progress = session.get(ScenePlayProgress, scene.id)
    favorite_row = session.get(FavoriteScene, scene.id)
    is_fav = favorite_row is not None

    # Progress-derived fields fall back to "never played" defaults.
    if progress:
        last_played_at = progress.last_played_at
        finished = progress.finished
        position_sec = progress.position_sec
    else:
        last_played_at = None
        finished = False
        position_sec = 0

    return SceneOut(
        id=scene.id,
        title=scene.title,
        slug=scene.slug,
        release_date=scene.release_date,
        duration_sec=scene.duration_sec,
        description=scene.description,
        code=scene.code,
        director=scene.director,
        studio=studio_out,
        performers=performers_out,
        tags=tags_out,
        external_refs=refs_out,
        playback_sources=playback_out,
        created_at=scene.created_at,
        last_played_at=last_played_at,
        finished=finished,
        position_sec=position_sec,
        is_favorite=is_fav,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{scene_id}/tags/{tag_id}", status_code=status.HTTP_204_NO_CONTENT)
def remove_tag_from_scene(
    scene_id: uuid.UUID,
    tag_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene↔tag link (e.g. the user decided the tag is wrong for this scene).

    Idempotent: a missing link counts as success. The Tag itself is never
    deleted — other scenes may use it — so it stays in the tag dictionary.
    """
    link_query = select(SceneTag).where(
        SceneTag.scene_id == scene_id, SceneTag.tag_id == tag_id
    )
    link = session.execute(link_query).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/{scene_id}/performers/{performer_id}", status_code=status.HTTP_204_NO_CONTENT
)
def remove_performer_from_scene(
    scene_id: uuid.UUID,
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Remove the scene↔performer link (a false dedup match attached the wrong person).

    Idempotent; the Performer row itself is kept. Per the original author's
    note this is useful when a fuzzy alias match pulled one performer's scenes
    under a different performer with a similar alias, or when a tube listing
    assigned a performer to scenes she is not in (user reports from
    2026-05-10).
    """
    link_query = select(ScenePerformer).where(
        ScenePerformer.scene_id == scene_id,
        ScenePerformer.performer_id == performer_id,
    )
    link = session.execute(link_query).scalar_one_or_none()
    if link is not None:
        session.delete(link)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
class EnrichTagsOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-tags.
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Number of scene↔tag links actually inserted by this call.
    added: int
    # Site tag of the tube source that was fetched; None when no usable
    # source was found for the scene.
    tube_used: str | None
    # Tag names extracted from the page (empty when nothing was extracted).
    tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{scene_id}/enrich-tags", response_model=EnrichTagsOut)
def enrich_tags_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichTagsOut:
    """Fetch the page HTML of one of the scene's tube playback sources and scrape its tags.

    Missing tags (categories/tags found on the page) are added to scene_tags.

    Per the original author's note, the mobile client calls this when opening
    SceneDetail for a scene that has 0 tags AND a tube source with a supported
    extractor (porntrex/youporn/xvideos/xnxx/redtube/xhamster/eporner).

    Idempotent: calling again with the same tags changes nothing (links are
    inserted with ON CONFLICT DO NOTHING). The tube source is picked by a
    priority list (mainstream sites are considered more reliable than
    aggregators). A failed fetch or an empty extraction is reported as
    ``added=0`` rather than as an error.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS, extract_tags
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag

    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Priority: mainstream tubes (rich metadata) before niche ones (fewer or
    # garbage tags).
    priority_sites = ["xhamstercom", "porntrexcom", "epornercom", "youporncom",
                      "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom"]
    live_sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    ).scalars().all()

    # First live source whose origin matches the priority list, in priority order.
    chosen: PlaybackSource | None = next(
        (
            candidate
            for site in priority_sites
            for candidate in live_sources
            if candidate.origin == f"tube:{site}"
        ),
        None,
    )
    if chosen is None:
        # Fallback: any tube source that has an extractor.
        chosen = next(
            (
                candidate
                for candidate in live_sources
                if candidate.origin.startswith("tube:")
                and candidate.origin.split(":", 1)[1] in EXTRACTORS
            ),
            None,
        )

    if chosen is None:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=None, tags=[])

    sitetag = chosen.origin.split(":", 1)[1]
    try:
        response = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        response.raise_for_status()
    except (TubePageError, Exception) as exc:
        log.warning("enrich-tags fetch failed for %s: %s", chosen.page_url, exc)
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    tag_names = extract_tags(sitetag, response.text)
    if not tag_names:
        return EnrichTagsOut(scene_id=scene_id, added=0, tube_used=sitetag, tags=[])

    # Upsert: for every tag name find/create the Tag, then add the SceneTag
    # link idempotently. Per the original author's note, PostgreSQL
    # INSERT ... ON CONFLICT DO NOTHING is used instead of ORM session.add()
    # because `resolve_tag` flushes inside the loop, emitting pending SceneTag
    # INSERTs from earlier iterations; two concurrent enrich-tags calls
    # colliding on the same (scene_id, tag_id) then raised UniqueViolation
    # (GOON-H). ON CONFLICT skips those silently.
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    inserted_count = 0
    handled_tag_ids: set = set()
    for tag_name in tag_names:
        normalized = NormalizedTag(name=tag_name, slug=slugify(tag_name), external_id=None)
        resolved = resolve_tag(session, norm=normalized)
        if resolved is None or resolved.id in handled_tag_ids:
            continue
        handled_tag_ids.add(resolved.id)
        link_insert = (
            pg_insert(SceneTag.__table__)
            .values(scene_id=scene_id, tag_id=resolved.id, source_id=None)
            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
        )
        outcome = session.execute(link_insert)
        # rowcount is 1 when the row was really inserted, 0 when ON CONFLICT skipped it.
        if (outcome.rowcount or 0) > 0:
            inserted_count += 1
    session.commit()
    return EnrichTagsOut(scene_id=scene_id, added=inserted_count, tube_used=sitetag, tags=tag_names)
|
||||||
|
|
||||||
|
|
||||||
|
class EnrichDurationOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-duration.
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Duration in seconds: the already-stored value, the freshly extracted
    # one, or None when no source yielded a duration.
    duration_sec: int | None
    # Site tag of the tube source the duration was extracted from; None when
    # nothing was fetched or nothing was found.
    tube_used: str | None
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{scene_id}/enrich-duration", response_model=EnrichDurationOut)
def enrich_duration_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichDurationOut:
    """Extract the scene duration from any of its tube playback sources.

    Per the original author's note, all known tubes expose the duration on the
    detail page (og:video:duration or LD-JSON ISO 8601), and the mobile client
    calls this when opening SceneDetail for a scene whose duration_sec is null
    AND that has a tube source. Duration is described as the strongest single
    dedup signal — without it, scenes with a weak title-only score are capped
    at 0.85 (review queue).

    Idempotent: when duration_sec is already set, it is returned unchanged and
    nothing is fetched. Sources whose fetch fails are skipped.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.duration_extract import extract_duration_sec

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    already_known = scene.duration_sec
    if already_known is not None:
        return EnrichDurationOut(
            scene_id=scene_id, duration_sec=already_known, tube_used=None
        )

    tube_query = select(PlaybackSource).where(
        PlaybackSource.scene_id == scene_id,
        PlaybackSource.dead_at.is_(None),
        PlaybackSource.origin.like("tube:%"),
    )
    tube_sources = session.execute(tube_query).scalars().all()

    for candidate in tube_sources:
        try:
            response = browser_get(candidate.page_url, timeout=15.0, follow_redirects=True)
            response.raise_for_status()
        except (TubePageError, Exception) as exc:
            log.debug("enrich-duration fetch failed for %s: %s", candidate.page_url, exc)
            continue
        seconds = extract_duration_sec(response.text)
        if seconds is None or not (seconds > 0):
            continue
        scene.duration_sec = seconds
        # Also store it on the playback source for parity (the original author
        # notes it may serve a future per-source duration mismatch detection).
        if candidate.duration_sec is None:
            candidate.duration_sec = seconds
        session.commit()
        return EnrichDurationOut(
            scene_id=scene_id,
            duration_sec=seconds,
            tube_used=candidate.origin.split(":", 1)[1] if ":" in candidate.origin else None,
        )

    return EnrichDurationOut(scene_id=scene_id, duration_sec=None, tube_used=None)
|
||||||
|
|
||||||
|
|
||||||
|
class EnrichStudioOut(BaseModel):
    # Response body of POST /scenes/{scene_id}/enrich-studio.
    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Studio linked to the scene; None when the scene has no studio and no
    # usable tube source was available.
    studio_id: uuid.UUID | None
    # Name of that studio, when it could be loaded.
    studio_name: str | None
    # Tube the studio was scraped from; None when the scene already had a
    # studio or no matching source existed (presumably the site tag otherwise
    # — TODO confirm against the endpoint's success path).
    tube_used: str | None
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{scene_id}/enrich-studio", response_model=EnrichStudioOut)
def enrich_studio_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichStudioOut:
    """Resolve a scene's studio (DVD/series) from its pornhat scene page.

    Pornhat marks the studio with
    `class="info-video js-ajax-dvd" data-setup='{"title": "Adult Time", ...}'`.
    Only pornhat is handled for now; other tubes can be added once their
    markup pattern is known.

    Args:
        scene_id: Primary key of the scene to enrich.
        session: Database session injected by FastAPI.

    Returns:
        The studio already linked to the scene (``tube_used=None``), the
        studio found or created from the pornhat page
        (``tube_used="pornhatcom"``), or an empty result when nothing could be
        resolved.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    import json as _json

    from app.extractors._fetch import browser_get
    from app.models.playback_source import PlaybackSource
    from app.models.studio import Studio
    from app.normalize.text import slugify

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Already enriched: report the linked studio without touching the network.
    if scene.studio_id is not None:
        existing = session.get(Studio, scene.studio_id)
        return EnrichStudioOut(
            scene_id=scene_id,
            studio_id=scene.studio_id,
            studio_name=existing.name if existing else None,
            tube_used=None,
        )

    chosen = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin == "tube:pornhatcom",
        )
    ).scalars().first()
    if chosen is None:
        return EnrichStudioOut(scene_id=scene_id, studio_id=None, studio_name=None, tube_used=None)

    # Every failure past this point consulted pornhat but yielded no studio.
    miss = EnrichStudioOut(
        scene_id=scene_id, studio_id=None, studio_name=None, tube_used="pornhatcom"
    )

    try:
        r = browser_get(chosen.page_url, timeout=15.0, follow_redirects=True)
        r.raise_for_status()
    except Exception as e:  # best-effort scrape: any fetch failure is a miss
        log.warning("enrich-studio fetch failed for %s: %s", chosen.page_url, e)
        return miss

    m = re.search(
        r"class=\"info-video js-ajax-dvd[^\"]*\"[^>]*data-setup='([^']+)'",
        r.text, re.IGNORECASE,
    )
    if m is None:
        return miss
    try:
        data = _json.loads(m.group(1))
    except _json.JSONDecodeError:
        return miss
    # Scraped markup is untrusted: valid JSON that is not an object (list,
    # string, number) would otherwise crash on `.get` and surface as a 500.
    if not isinstance(data, dict):
        return miss

    raw_title = data.get("title")
    name = raw_title.strip() if isinstance(raw_title, str) else ""
    if not name:
        return miss
    raw_dir = data.get("dir")
    slug = (raw_dir.strip() if isinstance(raw_dir, str) else "") or slugify(name)

    # Reuse an existing studio by slug first, then by exact name, else create.
    studio = session.execute(
        select(Studio).where(Studio.slug == slug)
    ).scalar_one_or_none()
    if studio is None:
        studio = session.execute(
            select(Studio).where(Studio.name == name)
        ).scalar_one_or_none()
    if studio is None:
        studio = Studio(name=name, slug=slug)
        session.add(studio)
        session.flush()  # assigns studio.id before it is linked below
    scene.studio_id = studio.id
    session.commit()
    return EnrichStudioOut(
        scene_id=scene_id, studio_id=studio.id, studio_name=studio.name, tube_used="pornhatcom"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class EnrichThumbOut(BaseModel):
    """Response body of ``POST /{scene_id}/enrich-thumbnail``."""

    # Scene the enrichment was requested for.
    scene_id: uuid.UUID
    # Thumbnail now known for the scene; None when none could be found.
    thumbnail_url: str | None
    # Tube whose page supplied the thumbnail; None when nothing was fetched.
    tube_used: str | None
    # Number of playback sources that received a thumbnail during this call.
    sources_updated: int
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{scene_id}/enrich-thumbnail", response_model=EnrichThumbOut)
def enrich_thumbnail_from_tube(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> EnrichThumbOut:
    """Find a thumbnail for the scene and store it on its tube playback sources.

    Fetches the detail page of the scene's live tube playback sources in turn
    and extracts a thumbnail (og:image / twitter:image / LD-JSON thumbnailUrl /
    KVS html5player). The first thumbnail found is written to EVERY source of
    the scene that lacks one, so later list views show it regardless of which
    source gets picked. The mobile client auto-calls this when SceneDetail is
    opened without a thumbnail (same as duration).

    When some source already carries a thumbnail, no page is fetched; that
    thumbnail is copied to the sources still missing one. Previously this path
    returned immediately, so sources added after the first enrichment never
    received a thumbnail.

    Args:
        scene_id: Primary key of the scene to enrich.
        session: Database session injected by FastAPI.

    Returns:
        The thumbnail now known for the scene, the tube that supplied it
        (None when nothing was fetched) and how many sources were updated.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    from app.extractors._fetch import browser_get
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource

    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    sources = session.execute(
        select(PlaybackSource).where(
            PlaybackSource.scene_id == scene_id,
            PlaybackSource.dead_at.is_(None),
            PlaybackSource.origin.like("tube:%"),
        )
    ).scalars().all()

    def _fill_missing(url: str) -> int:
        """Write `url` to every source without a thumbnail; return the count."""
        filled = 0
        for candidate in sources:
            if not candidate.thumbnail_url:
                candidate.thumbnail_url = url
                filled += 1
        return filled

    known = next((s.thumbnail_url for s in sources if s.thumbnail_url), None)
    if known:
        # Already have one: no network fetch, just backfill thumb-less siblings.
        backfilled = _fill_missing(known)
        if backfilled:
            session.commit()
        return EnrichThumbOut(
            scene_id=scene_id,
            thumbnail_url=known,
            tube_used=None,
            sources_updated=backfilled,
        )

    for src in sources:
        try:
            r = browser_get(src.page_url, timeout=15.0, follow_redirects=True)
            r.raise_for_status()
        except Exception as e:  # best-effort: try the next source
            log.debug("enrich-thumbnail fetch failed for %s: %s", src.page_url, e)
            continue
        thumb = extract_thumbnail_url(r.text)
        if thumb:
            # Store on all thumb-less sources to avoid a duplicate fetch later.
            updated = _fill_missing(thumb)
            session.commit()
            return EnrichThumbOut(
                scene_id=scene_id,
                thumbnail_url=thumb,
                tube_used=src.origin.split(":", 1)[1] if ":" in src.origin else None,
                sources_updated=updated,
            )

    return EnrichThumbOut(
        scene_id=scene_id, thumbnail_url=None, tube_used=None, sources_updated=0
    )
|
||||||
127
app/api/schemas.py
Normal file
127
app/api/schemas.py
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
"""Pydantic schemas eksportowane przez API."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import date, datetime
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class StudioOut(BaseModel):
    """Studio as exposed by the API."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class PerformerOut(BaseModel):
    """Performer as exposed by the API."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    canonical_name: str
    slug: str
    gender: str | None = None
    # NOTE(review): presumably the alias the performer is credited under in
    # this particular scene/movie — confirm against the code that fills it.
    as_alias: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class TagOut(BaseModel):
    """Tag as exposed by the API."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    name: str
    slug: str
|
||||||
|
|
||||||
|
|
||||||
|
class ExternalRefOut(BaseModel):
    """Reference to the same entity in an external metadata source.

    Unlike the neighbouring schemas this one does not enable
    ``from_attributes``, so it has to be built from explicit field values.
    """

    source: str
    external_id: str
    url: str | None = None
    last_seen: datetime | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class PlaybackSourceOut(BaseModel):
    """One place where a scene or movie can be played, as exposed by the API."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    # Source identifier; tube sources use the "tube:<name>" form.
    origin: str
    page_url: str
    embed_url: str | None = None
    stream_url: str | None = None
    quality: str | None = None
    duration_sec: int | None = None
    thumbnail_url: str | None = None
    animated_thumbnail_url: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class SceneOut(BaseModel):
    """Scene with its related entities and per-user playback state."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    title: str
    slug: str | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    description: str | None = None
    code: str | None = None
    director: str | None = None
    studio: StudioOut | None = None
    performers: list[PerformerOut] = []
    tags: list[TagOut] = []
    external_refs: list[ExternalRefOut] = []
    playback_sources: list[PlaybackSourceOut] = []
    # When the scene entered the database (ingest). The mobile client uses it
    # to show the "NEW" badge on scene cards in PerformerScenesScreen /
    # StudioScenesScreen: badge when `created_at > last_seen_at` (favorite).
    created_at: datetime | None = None
    # Watched indicator (from `scene_play_progress`): mobile dims the tile when
    # `finished=True` and shows a progress bar when `position_sec > 0`.
    last_played_at: datetime | None = None
    finished: bool = False
    position_sec: int = 0
    is_favorite: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class SceneListOut(BaseModel):
    """One page of scenes plus pagination metadata."""

    items: list[SceneOut]
    # Pagination fields as supplied by the endpoint that builds the page.
    total: int
    page: int
    per_page: int
|
||||||
|
|
||||||
|
|
||||||
|
class MovieChapterOut(BaseModel):
    """One chapter of a movie, optionally linked to a scene."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    chapter_index: int
    title: str | None = None
    start_sec: int | None = None
    end_sec: int | None = None
    # Scene this chapter corresponds to, when one is linked.
    scene_id: uuid.UUID | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class MovieOut(BaseModel):
    """Movie with its related entities, chapters and favorite flag."""

    # from_attributes lets the schema be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID
    title: str
    slug: str | None = None
    release_year: int | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    description: str | None = None
    director: str | None = None
    country: str | None = None
    rating: float | None = None
    poster_url: str | None = None
    backdrop_url: str | None = None
    studio: StudioOut | None = None
    performers: list[PerformerOut] = []
    tags: list[TagOut] = []
    chapters: list[MovieChapterOut] = []
    external_refs: list[ExternalRefOut] = []
    playback_sources: list[PlaybackSourceOut] = []
    # Used by the mobile MoviesScreen NEW badge (created_at > client-stored
    # seenSince) and by the MovieDetail favorite star.
    created_at: datetime | None = None
    is_favorite: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class MovieListOut(BaseModel):
    """One page of movies plus pagination metadata."""

    items: list[MovieOut]
    # Pagination fields as supplied by the endpoint that builds the page.
    total: int
    page: int
    per_page: int
|
||||||
553
app/api/stream_proxy.py
Normal file
553
app/api/stream_proxy.py
Normal file
|
|
@ -0,0 +1,553 @@
|
||||||
|
"""Stream proxy — pomost VPS↔phone dla podpisanych URL-i CDN-ów.
|
||||||
|
|
||||||
|
Wiele hosterów (luluvids/medixiru/cdnvids/bigcdn) bindą podpisany URL do IP klienta
|
||||||
|
który fetchował embed page. Gdy backend ekstraktuje URL z VPS-a, signature
|
||||||
|
weryfikuje VPS IP — telefon dostaje 403. Player na phonie kieruje requesty
|
||||||
|
*przez backend* (tym samym IP co podczas extracji) → CDN sprawdza signature
|
||||||
|
poprawnie i serwuje content.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. /resolve packuje (url, referer) w token (HMAC-podpisany).
|
||||||
|
2. Mobile dostaje `stream_url = /proxy/{token}/master.m3u8` (lub `.mp4`).
|
||||||
|
3. ExoPlayer woła backend → backend strumieniuje content z origin URL.
|
||||||
|
4. HLS: m3u8 manifest jest rewrited tak, że dziecięce segmenty/playlisty
|
||||||
|
też idą przez proxy (chained tokens).
|
||||||
|
|
||||||
|
Token: base64url(json({u: url, r: referer, e: unix_ts})) + "." + base64url(HMAC-SHA256),
|
||||||
|
podpisany shared secret z env (`STREAM_PROXY_SECRET`). TTL 4h żeby gracz mógł
|
||||||
|
oglądać dłuższe sceny + pause/seek bez ryzyka expired token.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
import hmac
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from typing import Annotated
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
||||||
|
from fastapi.responses import Response, StreamingResponse
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/proxy", tags=["proxy"])
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# In-memory bandwidth counter: bytes sent out per CDN domain per hour bucket.
# Restarting the API resets it (acceptable: operational metric, not billing).
# It exists to show where VPS bandwidth goes before a Hetzner overage.
from collections import defaultdict
from threading import Lock

# host -> {hour bucket (unix time // 3600) -> bytes proxied in that hour}
_bw_counters: dict[str, dict[int, int]] = defaultdict(lambda: defaultdict(int))
# Held around every read and write of _bw_counters.
_bw_lock = Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _record_proxy_bytes(target_url: str, n_bytes: int) -> None:
    """Add `n_bytes` to the current hour bucket of the target's host.

    Non-positive counts are ignored. A URL whose host cannot be determined is
    accounted under "unknown". After recording, buckets of that same host older
    than 168 hours (7 days) are dropped. The counter map is only touched while
    holding `_bw_lock`.

    NOTE(review): only the host being recorded is pruned, so a host that never
    receives traffic again keeps its old buckets until restart.
    """
    if n_bytes <= 0:
        return
    try:
        domain = urlparse(target_url).hostname
    except Exception:
        domain = None
    domain = domain or "unknown"

    current_bucket = int(time.time() // 3600)
    oldest_kept = current_bucket - 168
    with _bw_lock:
        per_hour = _bw_counters[domain]
        per_hour[current_bucket] += n_bytes
        # Collect first, delete after: a dict must not shrink while iterated.
        for stale in [bucket for bucket in per_hour if bucket < oldest_kept]:
            del per_hour[stale]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bandwidth_stats(hours: int = 24) -> dict[str, int]:
    """Return bytes proxied per CDN host over the last `hours` hour buckets.

    Hosts with no traffic in the window are omitted. The result is ordered by
    byte count, largest first.
    """
    threshold = int(time.time() // 3600) - hours
    with _bw_lock:
        totals = {
            host: sum(n for bucket, n in per_hour.items() if bucket > threshold)
            for host, per_hour in _bw_counters.items()
        }
    ranked = sorted(
        ((host, total) for host, total in totals.items() if total > 0),
        key=lambda pair: -pair[1],
    )
    return dict(ranked)
|
||||||
|
|
||||||
|
# Browser-like User-Agent sent with every upstream request made by the proxy.
DEFAULT_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)

# Lifetime of a proxy token in seconds (4 hours).
TOKEN_TTL_SEC = 4 * 60 * 60

# Lower-cased upstream response headers that are NOT copied to the client.
# Besides the true hop-by-hop headers this also drops content-encoding and
# content-length, so the proxied response carries neither.
HOP_BY_HOP = {
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailers",
    "transfer-encoding",
    "upgrade",
    "content-encoding",
    "content-length",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _secret() -> bytes:
    """Return the HMAC key for proxy tokens as UTF-8 bytes.

    Uses `STREAM_PROXY_SECRET` when it is set and non-empty, otherwise falls
    back to the raw `API_KEYS` value.

    Raises:
        RuntimeError: when neither variable holds a non-empty value.
    """
    for variable in ("STREAM_PROXY_SECRET", "API_KEYS"):
        value = os.environ.get(variable)
        if value:
            return value.encode("utf-8")
    raise RuntimeError("STREAM_PROXY_SECRET (or API_KEYS) must be set")
|
||||||
|
|
||||||
|
|
||||||
|
def make_token(
    url: str,
    referer: str | None = None,
    ttl_sec: int = TOKEN_TTL_SEC,
    *,
    refresh: str | None = None,
    refresh_hoster: str | None = None,
    impersonate: bool = False,
) -> str:
    """Build a signed proxy token of the form ``<body>.<signature>``.

    The body is the unpadded URL-safe base64 of a compact JSON payload; the
    signature is the unpadded URL-safe base64 of its HMAC-SHA256 under
    `_secret()`.

    Args:
        url: Upstream URL the proxy should fetch (payload key ``u``).
        referer: Referer to send upstream; stored as "" when absent (``r``).
        ttl_sec: Validity in seconds; expiry is stored as a unix time (``e``).
        refresh: Embed page URL to refetch for a fresh stream URL, e.g. the
            mixdrop MDCore.wurl (``rf``). Omitted when falsy.
        refresh_hoster: Hoster name selecting the re-extract logic (``rh``).
            Omitted when falsy.
        impersonate: Fetch through curl_cffi browser impersonation instead of
            httpx, for hosters with TLS-fingerprint bot detection (``i`` = 1).
            Omitted when false.

    Returns:
        The token string.
    """
    payload: dict = {"u": url, "r": referer or "", "e": int(time.time()) + ttl_sec}
    optional_claims = (
        ("rf", refresh),
        ("rh", refresh_hoster),
        ("i", 1 if impersonate else None),
    )
    payload.update({key: value for key, value in optional_claims if value})

    raw = json.dumps(payload, separators=(",", ":")).encode("utf-8")
    mac = hmac.new(_secret(), raw, hashlib.sha256).digest()
    body, sig = (
        base64.urlsafe_b64encode(part).rstrip(b"=").decode("ascii")
        for part in (raw, mac)
    )
    return f"{body}.{sig}"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_token(token: str) -> dict:
    """Verify a proxy token and return its payload.

    Args:
        token: ``<body>.<signature>`` string as produced by `make_token`.

    Returns:
        The decoded JSON payload.

    Raises:
        HTTPException: 400 when the token has no "." separator or its body is
            not decodable base64; 403 when the signature does not match;
            410 when the token has expired.
    """
    try:
        body_b64, sig_b64 = token.split(".", 1)
    except ValueError:
        raise HTTPException(status_code=400, detail="malformed token") from None
    try:
        # "==" restores the padding stripped by make_token; surplus padding is
        # tolerated by the decoder.
        raw = base64.urlsafe_b64decode(body_b64 + "==")
    except ValueError:
        # binascii.Error (bad length/padding) subclasses ValueError, and a
        # non-ASCII body raises ValueError directly. The token comes from the
        # URL, so this is a client error, not a server fault.
        raise HTTPException(status_code=400, detail="malformed token") from None
    expected = base64.urlsafe_b64encode(
        hmac.new(_secret(), raw, hashlib.sha256).digest()
    ).rstrip(b"=").decode("ascii")
    # Compare as bytes: compare_digest raises TypeError on str arguments that
    # contain non-ASCII characters, which an attacker-supplied signature may.
    if not hmac.compare_digest(expected.encode("ascii"), sig_b64.encode("utf-8")):
        raise HTTPException(status_code=403, detail="bad token sig")
    payload = json.loads(raw)
    if int(payload.get("e", 0)) < int(time.time()):
        raise HTTPException(status_code=410, detail="token expired")
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _ascii_safe_url(url: str) -> str:
    """Percent-encode the characters of `url` that are not URL-safe ASCII.

    httpx requires ASCII header values; a Referer containing Polish, Cyrillic
    or other Unicode characters (e.g. hqporner `Honies_№2.html`) used to raise
    UnicodeEncodeError (GOON-A). Quoting with the safe set
    ``:/?#[]@!$&'()*+,;=%~`` leaves the URI structure and existing percent
    escapes intact and encodes everything else, non-ASCII included.

    If quoting fails for any reason the input is returned unchanged.
    """
    from urllib.parse import quote

    keep_as_is = ":/?#[]@!$&'()*+,;=%~"
    try:
        encoded = quote(url, safe=keep_as_is)
    except Exception:
        encoded = url
    return encoded
|
||||||
|
|
||||||
|
|
||||||
|
def _build_headers(referer: str | None) -> dict[str, str]:
    """Build the browser-like request headers sent to the upstream host.

    Always sets User-Agent, Accept and Accept-Language. When `referer` is
    given, adds an ASCII-safe Referer and, if a host can be parsed from it, an
    Origin of the form ``https://<host>``.

    NOTE(review): Origin always uses the https scheme and omits any port,
    whatever the referer used.
    """
    headers = {
        "User-Agent": DEFAULT_UA,
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
    }
    if not referer:
        return headers

    headers["Referer"] = _ascii_safe_url(referer)
    try:
        referer_host = urlparse(referer).hostname
    except Exception:
        # An unparsable referer simply yields no Origin header.
        referer_host = None
    if referer_host:
        headers["Origin"] = _ascii_safe_url("https://" + referer_host)
    return headers
|
||||||
|
|
||||||
|
|
||||||
|
# Matches a URI="..." attribute inside an HLS tag line; groups are the opening
# `URI="`, the URI itself and the closing quote.
_M3U8_URI_RE = re.compile(r'(URI=")([^"]+)(")', re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _rewrite_m3u8(content: str, base_url: str, referer: str | None) -> str:
    """Rewrite an HLS manifest so every sub-resource is fetched via the proxy.

    A manifest contains:
    - resource lines (.ts segments / child .m3u8 playlists), relative or
      absolute, rewritten to ``/proxy/<token>/seg.<ext>``;
    - tags such as `#EXT-X-KEY:METHOD=AES-128,URI="key.bin"`, whose URI
      attribute is rewritten to ``/proxy/<token>/seg``.

    Each URL is resolved against `base_url` and wrapped in its own token that
    carries `referer`. Blank lines are kept as they are.

    Returns:
        The rewritten manifest, lines joined by "\\n" with a trailing newline.
    """

    def _proxy_attr_uri(match: re.Match) -> str:
        # Replace the URI inside URI="..." while keeping the wrapper intact.
        resolved = urljoin(base_url, match.group(2))
        signed = make_token(resolved, referer)
        return f'{match.group(1)}/proxy/{signed}/seg{match.group(3)}'

    rewritten: list[str] = []
    for raw_line in content.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            rewritten.append(raw_line)
        elif stripped.startswith("#"):
            # Tag or comment: only embedded URI="..." attributes change.
            rewritten.append(_M3U8_URI_RE.sub(_proxy_attr_uri, raw_line))
        else:
            # Resource URI line.
            resolved = urljoin(base_url, stripped)
            signed = make_token(resolved, referer)
            # Keep the extension so ExoPlayer can recognise the content type.
            ext = os.path.splitext(urlparse(resolved).path)[1].lstrip(".") or "ts"
            rewritten.append(f"/proxy/{signed}/seg.{ext}")
    return "\n".join(rewritten) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sign")
def sign_url(
    _api: Annotated[None, Depends(require_api_key)],
    url: str = Query(...),
    referer: str | None = Query(default=None),
) -> dict:
    """Issue a fresh proxy token for `url` (e.g. after the old one expired).

    Helper endpoint for the mobile client; normally /resolve already returns a
    proxy URL, so this is a fallback.

    NOTE(review): `url` is signed as given, so any API-key holder can make the
    proxy fetch an arbitrary address — confirm that this is acceptable.

    Returns:
        ``{"token": <token>, "expires_in": <token lifetime in seconds>}``.
    """
    fresh_token = make_token(url, referer)
    return {"token": fresh_token, "expires_in": TOKEN_TTL_SEC}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/img/{token}/{_basename:path}")
async def proxy_image(
    token: str,
    _basename: str,
    request: Request,
) -> Response:
    """Fetch a thumbnail on behalf of the client and relay it.

    Used for thumbnails on CDNs that require a Referer (hqporner and similar
    sources). Mobile expo-image sends no Referer by default and the CDN answers
    403; the backend adds the Referer from the token and returns the image. The
    whole image is read into memory before it is returned.

    Successful responses carry ``Cache-Control: public, max-age=86400``:
    thumbnails are stable, so the client may cache them.

    Returns:
        The image with the upstream content type, or an empty ``image/jpeg``
        body with status 503 (connect error / timeout), 502 (upstream 5xx) or
        the upstream 4xx status, so the client can render a placeholder.

    Raises:
        HTTPException: from `parse_token` for a bad token; 502 for any other
            fetch failure.
    """
    claims = parse_token(token)
    image_url = claims["u"]
    request_headers = _build_headers(claims["r"] or None)
    limits = httpx.Timeout(connect=10.0, read=30.0, write=15.0, pool=5.0)

    async with httpx.AsyncClient(timeout=limits, follow_redirects=True) as client:
        try:
            upstream = await client.get(image_url, headers=request_headers)
        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout) as exc:
            # Transient CDN connect failure / timeout (e.g. Cloudflare 523 when
            # the origin is down). Logged at INFO and answered with 503 so
            # broken tubes do not flood Sentry with errors (GOON-D/6).
            log.info("img proxy connect/timeout for %s: %s", image_url, exc)
            return Response(content=b"", status_code=503, media_type="image/jpeg")
        except Exception as exc:
            log.warning("img proxy fetch failed for %s: %s", image_url, exc)
            raise HTTPException(status_code=502, detail=f"img fetch failed: {exc}") from exc

    if upstream.status_code < 400:
        return Response(
            content=upstream.content,
            media_type=upstream.headers.get("content-type", "image/jpeg"),
            headers={"Cache-Control": "public, max-age=86400"},
        )

    # Upstream 4xx/5xx for a thumbnail: degrade instead of raising (GOON-5,
    # GOON-D were pure Sentry noise). 5xx collapses to 502, 4xx passes through.
    log.info("img proxy upstream %d for %s", upstream.status_code, image_url)
    relayed_status = 502 if upstream.status_code >= 500 else upstream.status_code
    return Response(content=b"", status_code=relayed_status, media_type="image/jpeg")
|
||||||
|
|
||||||
|
|
||||||
|
async def _refetch_mixdrop_url(session: "AsyncSession", embed_url: str) -> str | None:
    """Re-fetch a mixdrop embed page and extract a fresh ``MDCore.wurl``.

    The page is fetched through `session` with chrome120 impersonation, its
    P.A.C.K.E.R.-packed script is decoded and the mp4 URL is read from it.
    The same session is reused afterwards for the mp4 GET so that cookies set
    here are sent with it. UA and Accept headers are required: without them
    mixdrop returns a minimal body that has no packed script.

    Returns:
        The mp4 URL (a protocol-relative ``//`` URL gets ``https:`` prepended),
        or None when the page is not a 200, the script or URL is missing, or
        any error occurs (logged as a warning).
    """
    import re

    from yt_dlp.utils import decode_packed_codes

    browser_headers = {
        "User-Agent": DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    try:
        page = await session.get(
            embed_url,
            headers=browser_headers,
            impersonate="chrome120",
            timeout=15,
            allow_redirects=True,
        )
        if page.status_code != 200:
            return None

        packed = re.search(
            r"eval\(function\(p,a,c,k,e,d\)\{.+?\}\(.+?\)\)", page.text, re.DOTALL
        )
        if not packed:
            return None

        unpacked = decode_packed_codes(packed.group(0))
        found = re.search(r'MDCore\.wurl\s*=\s*"([^"]+\.mp4[^"]*)"', unpacked)
        if not found:
            return None

        mp4_url = found.group(1)
        return "https:" + mp4_url if mp4_url.startswith("//") else mp4_url
    except Exception as exc:
        log.warning("refetch mixdrop failed for %s: %s", embed_url, exc)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def _curl_cffi_stream(
    target: str,
    headers: dict,
    *,
    refetch_url: str | None = None,
    refetch_hoster: str | None = None,
) -> Response:
    """Stream `target` through curl_cffi with chrome120 impersonation.

    Fallback for hosters that flag plain httpx by its TLS (JA3) fingerprint
    (mxcontent, Cloudflare-protected CDNs); the impersonated handshake is meant
    to look like real Chrome so the CDN lets the request through.

    When `refetch_url` is set and `refetch_hoster` is "mixdrop", the embed page
    is fetched FIRST in the same session. That gives the session fresh cookies
    and, when extraction succeeds, replaces `target` with a fresh mp4 URL.

    Returns:
        A StreamingResponse relaying the upstream body, or the result of
        `_upstream_error_response` when the upstream status is >= 400.

    Raises:
        HTTPException: 502 when anything in the fetch fails.
    """
    from curl_cffi.requests import AsyncSession

    session = AsyncSession()
    try:
        # mixdrop: ALWAYS refetch the embed before the mp4. A first mp4 attempt
        # with an old token and no cookies gets a 403 plus an anti-bot flag in
        # the cookies, which then blocks the retry as well.
        if refetch_url and refetch_hoster == "mixdrop":
            fresh_mp4 = await _refetch_mixdrop_url(session, refetch_url)
            if fresh_mp4:
                target = fresh_mp4
                log.info("mixdrop fresh-extract mp4 %s", fresh_mp4[:80])

        upstream = await session.get(
            target,
            headers=headers,
            impersonate="chrome120",
            stream=True,
            timeout=120,
            allow_redirects=True,
        )
        log.info("mixdrop mp4 fetch %s → %d", target[:60], upstream.status_code)
        if upstream.status_code >= 400:
            await session.close()
            return _upstream_error_response(upstream.status_code, dict(upstream.headers), target)

        relayed_headers = {
            name: value
            for name, value in upstream.headers.items()
            if name.lower() not in HOP_BY_HOP
        }

        async def relay_body():
            # Count what was actually sent; close the session when streaming
            # ends, including early termination.
            sent = 0
            try:
                async for piece in upstream.aiter_content():
                    sent += len(piece)
                    yield piece
            finally:
                await session.close()
                _record_proxy_bytes(target, sent)

        return StreamingResponse(
            relay_body(),
            status_code=upstream.status_code,
            headers=relayed_headers,
            media_type=upstream.headers.get("content-type", "application/octet-stream"),
        )
    except Exception as exc:
        # Best-effort cleanup; a failing close must not mask the real error.
        try:
            await session.close()
        except Exception:
            pass
        log.warning("curl_cffi proxy failed for %s: %s", target, exc)
        raise HTTPException(status_code=502, detail=f"proxy error: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{token}/{_basename:path}")
async def proxy_stream(
    token: str,
    _basename: str,
    request: Request,
) -> Response:
    """Relay the upstream resource named by a proxy token to the player.

    HLS manifests are buffered and rewritten so their segments, child playlists
    and keys also go through the proxy; everything else is streamed through.
    The client's Range header is forwarded so seek/preload byte-range requests
    work.

    Args:
        token: Signed token carrying the upstream URL (``u``), referer (``r``)
            and the optional ``i`` / ``rf`` / ``rh`` flags.
        _basename: Trailing path segment; only there so the URL carries a file
            name, not read here.
        request: Incoming request, used for its Range header.

    Returns:
        The rewritten manifest, a streaming response with the upstream body,
        the mapped upstream error, or a 503 with Retry-After when the upstream
        is unreachable.

    Raises:
        HTTPException: from `parse_token` for a bad token; 502 when the
            manifest rewrite or the fetch fails.
    """
    payload = parse_token(token)
    target = payload["u"]
    referer = payload["r"] or None
    use_impersonate = bool(payload.get("i"))
    refetch_url = payload.get("rf")
    refetch_hoster = payload.get("rh")

    # Forward the Range header (HLS/MP4 players do byte-range fetches for
    # seek/preload).
    headers = _build_headers(referer)
    range_h = request.headers.get("range")
    if range_h:
        headers["Range"] = range_h

    method = "GET"  # ExoPlayer mostly uses GET; HEAD is not needed here

    # Hosters that require a Chrome JA3 fingerprint (mxcontent /
    # Cloudflare-protected CDNs) go straight to curl_cffi, avoiding a
    # 403 -> retry round-trip. The `i=1` flag is set by the extractor.
    if use_impersonate:
        return await _curl_cffi_stream(
            target, headers,
            refetch_url=refetch_url, refetch_hoster=refetch_hoster,
        )

    # Short connect timeout, long read timeout so streaming is not cut off.
    timeout = httpx.Timeout(connect=15.0, read=120.0, write=30.0, pool=10.0)
    parsed = urlparse(target)
    path_lower = parsed.path.lower()
    # The path is only a hint; the FINAL decision uses the response content
    # type. Reason: pornhat `get_file/.../<id>.mp4/` 302-redirects to a CDN
    # m3u8 manifest despite `.mp4` in the path. Without the content-type check
    # the manifest would be relayed as binary with RAW CDN URLs (IP-bound to
    # the VPS) and ExoPlayer fails with "no extractors".
    path_suggests_m3u8 = path_lower.endswith(".m3u8")

    client = httpx.AsyncClient(timeout=timeout, follow_redirects=True)
    try:
        # Send in streaming mode first: inspect the headers, then decide
        # between rewriting a manifest and streaming binary.
        upstream = await client.send(
            client.build_request(method, target, headers=headers),
            stream=True,
            follow_redirects=True,
        )
        if upstream.status_code >= 400:
            status = upstream.status_code
            ups_headers = dict(upstream.headers)
            await upstream.aclose()
            await client.aclose()
            return _upstream_error_response(status, ups_headers, target)

        ct = (upstream.headers.get("content-type") or "").lower()
        # "mpegurl" matches both application/x-mpegurl and
        # application/vnd.apple.mpegurl.
        is_m3u8 = path_suggests_m3u8 or "mpegurl" in ct
        if is_m3u8:
            # Manifest content: buffer fully, rewrite, return as m3u8.
            body = await upstream.aread()
            await upstream.aclose()
            await client.aclose()
            try:
                rewritten = _rewrite_m3u8(body.decode("utf-8", errors="replace"),
                                          base_url=str(upstream.url), referer=referer)
            except Exception as e:
                log.warning("m3u8 rewrite failed for %s: %s", target, e)
                raise HTTPException(status_code=502, detail="manifest rewrite failed") from e
            return Response(
                content=rewritten,
                media_type="application/vnd.apple.mpegurl",
                headers={"Cache-Control": "no-store"},
            )

        out_headers = {
            k: v for k, v in upstream.headers.items() if k.lower() not in HOP_BY_HOP
        }

        async def streamer():
            bytes_out = 0
            try:
                # Stream DECODED bytes: Content-Encoding is stripped from
                # out_headers, so relaying the raw (possibly gzip/br) body
                # would hand the player bytes it cannot interpret. For
                # identity-encoded bodies this is the same as the raw stream.
                async for chunk in upstream.aiter_bytes():
                    bytes_out += len(chunk)
                    yield chunk
            finally:
                await upstream.aclose()
                await client.aclose()
                _record_proxy_bytes(target, bytes_out)

        return StreamingResponse(
            streamer(),
            status_code=upstream.status_code,
            headers=out_headers,
            media_type=upstream.headers.get("content-type", "application/octet-stream"),
        )
    except HTTPException:
        await client.aclose()
        raise
    except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout) as e:
        # CDN connect failure / timeout is transient: log at INFO (not an ERROR
        # for Sentry) and answer 503 + Retry-After so the client can retry.
        await client.aclose()
        log.info("proxy connect/timeout for %s: %s", target, e)
        return Response(
            content=f"upstream unreachable: {type(e).__name__}",
            status_code=503,
            headers={"Retry-After": "5"},
            media_type="text/plain",
        )
    except Exception as e:
        await client.aclose()
        log.warning("proxy failed for %s: %s", target, e)
        raise HTTPException(status_code=502, detail=f"proxy error: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def _upstream_error_response(
    status: int,
    upstream_headers: dict,
    target: str,
) -> Response:
    """Translate an upstream HTTP error status into the proxy's own response.

    Mapping, per status:

    - 429: answered with 429, ``Cache-Control: no-store`` and ``Retry-After``
      (the upstream value when present, otherwise "10") so the client can back
      off. Logged at INFO.
    - 404 / 410: passed through with the same status. Logged at INFO.
    - 5xx: answered with 502 plus ``Retry-After: 5``. Logged at INFO.
    - anything else (e.g. 403): raises ``HTTPException(502)``.

    Args:
        status: Status code received from the upstream server.
        upstream_headers: Upstream response headers as a plain dict; only
            ``retry-after`` / ``Retry-After`` is read.
        target: Upstream URL, used for log messages only.

    Returns:
        A plain-text ``Response`` for 429, 404, 410 and 5xx.

    Raises:
        HTTPException: 502 for every other status.
    """
    plain_text = "text/plain"
    retry_after = upstream_headers.get("retry-after") or upstream_headers.get("Retry-After")

    if status == 429:
        log.info("proxy upstream 429 for %s (Retry-After=%s)", target, retry_after)
        return Response(
            content="upstream rate limited",
            status_code=429,
            headers={
                "Cache-Control": "no-store",
                # Fall back to a fixed 10s back-off when upstream gave no hint.
                "Retry-After": str(retry_after) if retry_after else "10",
            },
            media_type=plain_text,
        )

    if status == 404 or status == 410:
        log.info("proxy upstream %d for %s", status, target)
        return Response(
            content=f"upstream {status}",
            status_code=status,
            media_type=plain_text,
        )

    if status // 100 == 5:
        # Upstream-side failure: report 502 to the client, keep the log at INFO.
        log.info("proxy upstream %d for %s", status, target)
        return Response(
            content=f"upstream {status}",
            status_code=502,
            headers={"Retry-After": "5"},
            media_type=plain_text,
        )

    # Remaining statuses (403 etc.) are raised so the error handler sees them.
    raise HTTPException(status_code=502, detail=f"upstream {status}")
|
||||||
597
app/api/taxonomies.py
Normal file
597
app/api/taxonomies.py
Normal file
|
|
@ -0,0 +1,597 @@
|
||||||
|
"""GET /tags, /performers, /studios — listy taxonomies do filtrów na mobile.
|
||||||
|
|
||||||
|
Każdy endpoint wspiera:
|
||||||
|
- q: substring search po name_normalized (trgm fallback ilike)
|
||||||
|
- order: 'name' (alfabetycznie) | 'popular' lub 'scene_count' (po liczbie scen desc)
|
||||||
|
- page/per_page
|
||||||
|
|
||||||
|
Zwraca też scene_count żeby UI pokazywał "(123)" przy każdym tagu/performerze/studio.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
from sqlalchemy import and_, exists, func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.movie import Movie, MovieTag
|
||||||
|
from app.models.movie_playback_source import MoviePlaybackSource
|
||||||
|
from app.models.performer import Performer
|
||||||
|
from app.models.playback_source import PlaybackSource
|
||||||
|
from app.models.scene import ScenePerformer, SceneTag
|
||||||
|
from app.models.studio import Studio
|
||||||
|
from app.models.tag import Tag
|
||||||
|
|
||||||
|
# Router for the taxonomy endpoints; every route runs the require_api_key dependency.
router = APIRouter(tags=["taxonomies"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Schemas ----------------------------------------------------------
|
||||||
|
|
||||||
|
# One tag plus its usage count, as returned in the items of GET /tags.
class TagCount(BaseModel):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
    # Number of scenes (or movies, when for_movies is set) counted for the tag.
    scene_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
# Paginated response envelope of GET /tags.
class TagListOut(BaseModel):
    items: list[TagCount]
    # Total number of matching rows before pagination.
    total: int
    page: int
    per_page: int
|
||||||
|
|
||||||
|
|
||||||
|
# One performer plus scene count, as returned in the items of GET /performers.
class PerformerCount(BaseModel):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    canonical_name: str
    slug: str
    # Filled from the performer's gender enum value; None when unset.
    gender: str | None = None
    # Number of the performer's scenes that have a live playback source.
    scene_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
# Paginated response envelope of GET /performers.
class PerformerListOut(BaseModel):
    items: list[PerformerCount]
    # Total number of matching rows before pagination.
    total: int
    page: int
    per_page: int
|
||||||
|
|
||||||
|
|
||||||
|
# One studio plus its content count, as returned in the items of GET /studios.
class StudioCount(BaseModel):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: uuid.UUID
    name: str
    slug: str
    network: str | None = None
    # Number of scenes (or movies, when for_movies is set) counted for the studio.
    scene_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
# Paginated response envelope of GET /studios.
class StudioListOut(BaseModel):
    items: list[StudioCount]
    # Total number of matching rows before pagination.
    total: int
    page: int
    per_page: int
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Endpoints --------------------------------------------------------
|
||||||
|
|
||||||
|
@router.get("/tags", response_model=TagListOut)
def list_tags(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None),
    order: str = Query(default="popular", description="popular|name"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
    for_movies: bool = Query(
        default=False,
        description=(
            "True: zlicza wystąpienia tagu w movies (z live MoviePlaybackSource) "
            "zamiast w scenes. UI używa do filtrowania movie genres."
        ),
    ),
    only_with_content: bool = Query(
        default=False,
        description=(
            "True: ukrywa tagi z 0 wystąpieniami w wybranym typie (scenes/movies)."
            " Filtruje krótkie listy filtrów żeby nie pokazywać tagów-sierot."
        ),
    ),
) -> TagListOut:
    """List tags with a usage count, optionally filtered, ordered and paginated.

    Args:
        session: Database session.
        q: Optional substring matched case-insensitively against the tag name.
            ``%`` and ``_`` in the input are matched literally.
        order: ``popular`` / ``scene_count`` (count descending, then name) or
            ``name`` (alphabetical).
        page: 1-based page number.
        per_page: Page size.
        for_movies: Count movies with a live ``MoviePlaybackSource`` instead of
            scenes with a live ``PlaybackSource``.
        only_with_content: Drop tags that have no counted rows.

    Returns:
        The requested page of tags plus the total number of matching tags.

    Raises:
        HTTPException: 400 when ``order`` is not a recognised value.
    """
    if order not in ("popular", "scene_count", "name"):
        raise HTTPException(status_code=400, detail="order must be 'popular' or 'name'")

    if for_movies:
        # Count only movies that still have at least one non-dead playback source.
        _movie_live = exists().where(
            and_(
                MoviePlaybackSource.movie_id == MovieTag.movie_id,
                MoviePlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(MovieTag.tag_id, func.count(MovieTag.movie_id).label("c"))
            .where(_movie_live)
            .group_by(MovieTag.tag_id)
            .subquery()
        )
    else:
        # Count only scenes that still have at least one non-dead playback source.
        _live_playback = exists().where(
            and_(
                PlaybackSource.scene_id == SceneTag.scene_id,
                PlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(SceneTag.tag_id, func.count(SceneTag.scene_id).label("c"))
            .where(_live_playback)
            .group_by(SceneTag.tag_id)
            .subquery()
        )

    # Outer join keeps tags without any counted rows; coalesce turns NULL into 0.
    base = (
        select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.tag_id == Tag.id)
    )
    if q:
        # Escape LIKE metacharacters so user input is matched literally
        # instead of acting as a wildcard.
        needle = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Tag.name.ilike(f"%{needle}%", escape="\\"))
    if only_with_content:
        # A NULL tag_id from the outer-joined subquery marks a tag with no content.
        base = base.where(count_sub.c.tag_id.is_not(None))

    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()

    # Tag.id is the final tiebreaker so OFFSET/LIMIT paging is deterministic.
    if order in ("popular", "scene_count"):
        ordered = base.order_by(
            func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc(), Tag.id.asc()
        )
    else:
        ordered = base.order_by(Tag.name.asc(), Tag.id.asc())

    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()

    items = [
        TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
        for t, c in rows
    ]
    return TagListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/performers", response_model=PerformerListOut)
def list_performers(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None, description="substring po name_normalized"),
    order: str = Query(default="scene_count", description="scene_count|name"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
) -> PerformerListOut:
    """List performers with a live-scene count, filtered, ordered and paginated.

    Args:
        session: Database session.
        q: Optional substring; lower-cased and matched against
            ``Performer.name_normalized``. ``%`` and ``_`` are matched literally.
        order: ``scene_count`` / ``popular`` (count descending, then name) or
            ``name`` (alphabetical by canonical name).
        page: 1-based page number.
        per_page: Page size.

    Returns:
        The requested page of performers plus the total number of matches.

    Raises:
        HTTPException: 400 when ``order`` is not a recognised value.
    """
    if order not in ("scene_count", "popular", "name"):
        raise HTTPException(status_code=400, detail="order must be 'scene_count' or 'name'")

    # Count only scenes that still have at least one non-dead playback source.
    _perf_live_playback = exists().where(
        and_(
            PlaybackSource.scene_id == ScenePerformer.scene_id,
            PlaybackSource.dead_at.is_(None),
        )
    )
    count_sub = (
        select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id).label("c"))
        .where(_perf_live_playback)
        .group_by(ScenePerformer.performer_id)
        .subquery()
    )
    # Outer join keeps performers without counted scenes; coalesce maps NULL to 0.
    base = (
        select(Performer, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.performer_id == Performer.id)
    )
    if q:
        # Escape LIKE metacharacters so user input is matched literally.
        needle = q.lower().replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Performer.name_normalized.ilike(f"%{needle}%", escape="\\"))

    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()

    # Performer.id is the final tiebreaker: names may repeat, and OFFSET/LIMIT
    # paging needs a total order to avoid skipped or duplicated rows.
    if order in ("scene_count", "popular"):
        ordered = base.order_by(
            func.coalesce(count_sub.c.c, 0).desc(),
            Performer.canonical_name.asc(),
            Performer.id.asc(),
        )
    else:
        ordered = base.order_by(Performer.canonical_name.asc(), Performer.id.asc())

    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()

    items = [
        PerformerCount(
            id=p.id,
            canonical_name=p.canonical_name,
            slug=p.slug,
            gender=p.gender.value if p.gender else None,
            scene_count=int(c),
        )
        for p, c in rows
    ]
    return PerformerListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/studios", response_model=StudioListOut)
def list_studios(
    session: Annotated[Session, Depends(get_session)],
    q: str | None = Query(default=None),
    order: str = Query(default="name", description="name|scene_count"),
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=50, ge=1, le=500),
    for_movies: bool = Query(
        default=False,
        description="True: zlicza tylko studia mające ≥1 movie z live playback.",
    ),
    only_with_content: bool = Query(
        default=False,
        description="True: ukrywa studia z 0 wystąpieniami w wybranym typie.",
    ),
) -> StudioListOut:
    """List studios with a content count, filtered, ordered and paginated.

    Args:
        session: Database session.
        q: Optional substring matched case-insensitively against the studio
            name. ``%`` and ``_`` in the input are matched literally.
        order: ``name`` (by ``name_normalized``) or ``scene_count`` / ``popular``
            (count descending, then name).
        page: 1-based page number.
        per_page: Page size.
        for_movies: Count movies with a live ``MoviePlaybackSource`` instead of
            scenes with a live ``PlaybackSource``.
        only_with_content: Drop studios that have no counted rows.

    Returns:
        The requested page of studios plus the total number of matches.

    Raises:
        HTTPException: 400 when ``order`` is not a recognised value.
    """
    from app.models.scene import Scene  # local import, as in the original block

    if order not in ("name", "scene_count", "popular"):
        raise HTTPException(status_code=400, detail="order must be 'name' or 'scene_count'")

    if for_movies:
        # Count only movies that still have at least one non-dead playback source.
        _movie_live = exists().where(
            and_(
                MoviePlaybackSource.movie_id == Movie.id,
                MoviePlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(Movie.studio_id, func.count(Movie.id).label("c"))
            .where(Movie.studio_id.is_not(None))
            .where(_movie_live)
            .group_by(Movie.studio_id)
            .subquery()
        )
    else:
        # Count only scenes that still have at least one non-dead playback source.
        _studio_live_playback = exists().where(
            and_(
                PlaybackSource.scene_id == Scene.id,
                PlaybackSource.dead_at.is_(None),
            )
        )
        count_sub = (
            select(Scene.studio_id, func.count(Scene.id).label("c"))
            .where(Scene.studio_id.is_not(None))
            .where(_studio_live_playback)
            .group_by(Scene.studio_id)
            .subquery()
        )

    # Outer join keeps studios without counted rows; coalesce maps NULL to 0.
    base = (
        select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count"))
        .outerjoin(count_sub, count_sub.c.studio_id == Studio.id)
    )
    if q:
        # Escape LIKE metacharacters so user input is matched literally.
        needle = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        base = base.where(Studio.name.ilike(f"%{needle}%", escape="\\"))
    if only_with_content:
        # A NULL studio_id from the outer-joined subquery marks a studio with no content.
        base = base.where(count_sub.c.studio_id.is_not(None))

    total = session.execute(
        select(func.count()).select_from(base.subquery())
    ).scalar_one()

    # Studio.id is the final tiebreaker so OFFSET/LIMIT paging is deterministic.
    if order in ("scene_count", "popular"):
        ordered = base.order_by(
            func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc(), Studio.id.asc()
        )
    else:
        ordered = base.order_by(Studio.name_normalized.asc(), Studio.id.asc())

    rows = session.execute(
        ordered.offset((page - 1) * per_page).limit(per_page)
    ).all()

    items = [
        StudioCount(
            id=s.id,
            name=s.name,
            slug=s.slug,
            network=s.network,
            scene_count=int(c),
        )
        for s, c in rows
    ]
    return StudioListOut(items=items, total=total, page=page, per_page=per_page)
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Performer refresh on-demand --------------------------------------
|
||||||
|
|
||||||
|
# Response of POST /performers/{performer_id}/refresh.
class PerformerRefreshOut(BaseModel):
    performer_id: uuid.UUID
    canonical_name: str
    # Per-source counter dicts, taken from the search run's per_source mapping.
    counters: dict[str, dict[str, int]]
    # Sum of the "new" counter across all sources.
    new_scenes: int
    # ISO-8601 timestamp of the refresh, or None when unset.
    last_searched_at: str | None
|
||||||
|
|
||||||
|
|
||||||
|
# Response of POST /performers/{performer_id}/rescrape.
class PerformerRescrapeOut(BaseModel):
    performer_id: uuid.UUID
    canonical_name: str
    # Number of scene ids selected for this run (bounded by the per-request limit).
    scenes_total: int
    # Number of selected scenes that were actually visited before any cap hit.
    scenes_processed: int
    thumbs_added: int
    tags_added: int
    # Scenes where no thumbnail could be found, plus scenes whose processing raised.
    failures: int
    # True when the run stopped early; cap_reason then explains why.
    capped: bool
    cap_reason: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Hard caps so a rescrape request cannot hang long enough for nginx (60s read
# timeout) to answer 504 while partial commits have already happened. Larger
# rescrapes can be triggered repeatedly; runs are idempotent thanks to the
# has_thumb / tag-count checks.
_RESCRAPE_WALL_SEC = 55.0  # nginx read timeout is 60s — 5s margin for building the response
_RESCRAPE_MAX_SCENES = 50
# Re-fetch tags for scenes with fewer than N tags. Some scenes legitimately have
# only 1-2 tags; checking them is harmless, and repeated calls stay idempotent
# because of INSERT ... ON CONFLICT DO NOTHING.
_TAG_RESCRAPE_THRESHOLD = 3
# Preferred tube sites for tag extraction, in priority order.
_TAG_PRIORITY = [
    "xhamstercom", "porntrexcom", "epornercom", "youporncom",
    "xvideoscom", "xnxxcom", "redtubecom", "pornhatcom",
]
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/performers/{performer_id}/rescrape", response_model=PerformerRescrapeOut)
def rescrape_performer_scenes(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> PerformerRescrapeOut:
    """Re-scrape thumbnails and tags from tube pages for a performer's scenes (bulk).

    Bug-report 2026-05-16 (6fcaa5f4): per-scene enrichment works on demand, but
    for a whole list the user would have to open every scene individually.

    The run is capped at ``_RESCRAPE_MAX_SCENES`` scenes and stops once
    ``_RESCRAPE_WALL_SEC`` seconds of wall-clock time have elapsed, so the
    reverse proxy's read timeout is not hit after partial commits. Larger
    back-fills need several calls.

    Idempotent: a scene that already has a thumbnail is not re-fetched for
    thumbnails, and a scene with at least ``_TAG_RESCRAPE_THRESHOLD`` tags is
    not re-fetched for tags.

    Args:
        performer_id: Performer whose scenes should be re-scraped.
        session: Database session; each scene is committed separately.

    Returns:
        Counters describing what was processed and added, and whether the
        wall-clock cap stopped the run early.

    Raises:
        HTTPException: 404 when the performer does not exist.
    """
    import logging as _logging
    import time as _time

    import httpx as _httpx
    from app.extractors._fetch import browser_get
    from app.extractors._models import TubePageError
    from app.extractors.tag_extract import EXTRACTORS as TAG_EXTRACTORS, extract_tags
    from app.extractors.thumb_extract import extract_thumbnail_url
    from app.models.playback_source import PlaybackSource
    from app.models.scene import Scene, SceneTag
    from app.normalize.scenes import NormalizedTag
    from app.normalize.text import slugify
    from app.resolve.tag_resolver import resolve_tag
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    # This module defines no module-level logger, so the previously bare `log`
    # references raised NameError on the first failed fetch. Bind one locally.
    log = _logging.getLogger(__name__)

    perf = session.get(Performer, performer_id)
    if perf is None:
        raise HTTPException(status_code=404, detail="performer not found")

    # 1) ID-only query — scenes with at least one alive tube playback source.
    scene_ids = session.execute(
        select(Scene.id)
        .join(ScenePerformer, ScenePerformer.scene_id == Scene.id)
        .where(ScenePerformer.performer_id == performer_id)
        .where(
            exists().where(
                PlaybackSource.scene_id == Scene.id,
                PlaybackSource.dead_at.is_(None),
                PlaybackSource.origin.like("tube:%"),
            )
        )
        .limit(_RESCRAPE_MAX_SCENES)
    ).scalars().all()
    scenes_total = len(scene_ids)

    if not scene_ids:
        return PerformerRescrapeOut(
            performer_id=performer_id,
            canonical_name=perf.canonical_name,
            scenes_total=0, scenes_processed=0,
            thumbs_added=0, tags_added=0, failures=0,
            capped=False,
        )

    # 2) Batch fetch: all alive tube playback sources for those scenes in one query.
    pb_rows = session.execute(
        select(PlaybackSource)
        .where(PlaybackSource.scene_id.in_(scene_ids))
        .where(PlaybackSource.dead_at.is_(None))
        .where(PlaybackSource.origin.like("tube:%"))
    ).scalars().all()
    sources_by_scene: dict = {}
    for s in pb_rows:
        sources_by_scene.setdefault(s.scene_id, []).append(s)

    # 3) Batch fetch tag counts per scene (one query instead of N).
    tag_counts = dict(session.execute(
        select(SceneTag.scene_id, func.count())
        .where(SceneTag.scene_id.in_(scene_ids))
        .group_by(SceneTag.scene_id)
    ).all())

    thumbs_added = 0
    tags_added = 0
    failures = 0
    scenes_processed = 0
    capped = False
    cap_reason: str | None = None
    started = _time.monotonic()
    # Narrow exception set — only expected network/parse failures are swallowed
    # around the page fetches.
    NET_EXC = (TubePageError, _httpx.HTTPError, OSError, ValueError)

    for scene_id in scene_ids:
        if _time.monotonic() - started > _RESCRAPE_WALL_SEC:
            capped = True
            cap_reason = f"wall-clock {_RESCRAPE_WALL_SEC}s reached"
            break

        sources = sources_by_scene.get(scene_id, [])
        if not sources:
            continue

        scenes_processed += 1
        has_thumb = any(s.thumbnail_url for s in sources)
        existing_tag_count = tag_counts.get(scene_id, 0)

        # SAVEPOINT for failure isolation: a single scene that fails (e.g. a
        # constraint violation on the SceneTag insert) is rolled back on its
        # own instead of poisoning the session for every following scene.
        sp = session.begin_nested()
        try:
            if not has_thumb:
                thumb_added_here = False
                for src in sources:
                    try:
                        r = browser_get(src.page_url, timeout=10.0, follow_redirects=True)
                    except NET_EXC as e:
                        log.debug("rescrape thumb fetch fail %s: %s", src.page_url, e)
                        continue
                    if r.status_code >= 400:
                        continue
                    thumb = extract_thumbnail_url(r.text)
                    if thumb:
                        # Update only the source the thumbnail came from, so a
                        # thumbnail from one site is never attributed to a
                        # sibling source on another site. One per scene is enough.
                        session.execute(
                            PlaybackSource.__table__.update()
                            .where(PlaybackSource.id == src.id)
                            .where(PlaybackSource.thumbnail_url.is_(None))
                            .values(thumbnail_url=thumb)
                        )
                        thumbs_added += 1
                        thumb_added_here = True
                        break
                if not thumb_added_here:
                    failures += 1

            if existing_tag_count < _TAG_RESCRAPE_THRESHOLD:
                # Prefer a source from the priority list ...
                chosen = None
                for tag in _TAG_PRIORITY:
                    for src in sources:
                        if src.origin == f"tube:{tag}":
                            chosen = src
                            break
                    if chosen:
                        break
                # ... otherwise fall back to any source with a known tag extractor.
                if chosen is None:
                    for src in sources:
                        sitetag_part = src.origin.split(":", 1)[1]
                        if sitetag_part in TAG_EXTRACTORS:
                            chosen = src
                            break
                if chosen is not None:
                    sitetag_part = chosen.origin.split(":", 1)[1]
                    try:
                        r = browser_get(chosen.page_url, timeout=10.0, follow_redirects=True)
                        if r.status_code < 400:
                            tag_names = extract_tags(sitetag_part, r.text)
                        else:
                            tag_names = []
                    except NET_EXC as e:
                        log.debug("rescrape tags fetch fail %s: %s", chosen.page_url, e)
                        tag_names = []
                    # Several scraped names may resolve to the same tag; insert each once.
                    seen_tag_ids: set = set()
                    for name in tag_names:
                        norm = NormalizedTag(name=name, slug=slugify(name), external_id=None)
                        tag = resolve_tag(session, norm=norm)
                        if tag is None or tag.id in seen_tag_ids:
                            continue
                        seen_tag_ids.add(tag.id)
                        stmt = (
                            pg_insert(SceneTag.__table__)
                            .values(scene_id=scene_id, tag_id=tag.id)
                            .on_conflict_do_nothing(index_elements=["scene_id", "tag_id"])
                        )
                        result = session.execute(stmt)
                        if result.rowcount:
                            tags_added += 1
            sp.commit()
            session.commit()
        except Exception as e:
            sp.rollback()
            log.warning("rescrape scene %s failed: %s", scene_id, e)
            failures += 1

    return PerformerRescrapeOut(
        performer_id=performer_id,
        canonical_name=perf.canonical_name,
        scenes_total=scenes_total,
        scenes_processed=scenes_processed,
        thumbs_added=thumbs_added,
        tags_added=tags_added,
        failures=failures,
        capped=capped,
        cap_reason=cap_reason,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/performers/{performer_id}/refresh", response_model=PerformerRefreshOut)
def refresh_performer(
    performer_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> PerformerRefreshOut:
    """Run an on-demand search for a single performer.

    Synchronous: the request blocks until ``run_performer_driven`` returns, so
    the client is expected to show a spinner.

    Rate guard: when the performer was searched less than 60 seconds ago the
    request is rejected with HTTP 429 and a ``Retry-After`` header giving the
    remaining cool-down in seconds.

    Args:
        performer_id: Performer to search for.
        session: Database session.

    Returns:
        Per-source counters of the run, the summed number of new scenes and
        the updated ``last_searched_at`` timestamp.

    Raises:
        HTTPException: 404 when the performer does not exist, 429 when the
            performer was refreshed within the last 60 seconds.
    """
    from datetime import UTC as _UTC, datetime as _dt, timedelta as _td

    perf = session.get(Performer, performer_id)
    if perf is None:
        raise HTTPException(status_code=404, detail="performer not found")

    if perf.last_searched_at is not None:
        # NOTE(review): assumes last_searched_at is timezone-aware — confirm.
        elapsed = _dt.now(_UTC) - perf.last_searched_at
        if elapsed < _td(seconds=60):
            waited = int(elapsed.total_seconds())
            raise HTTPException(
                status_code=429,
                detail=f"recently searched {waited}s ago, try in a bit",
                # Tell the client how long to back off instead of making it guess.
                headers={"Retry-After": str(max(1, 60 - waited))},
            )

    # Lazy import — kept inside the function, as in the original, because the
    # scheduler module is described as heavy to import.
    from app.scheduler.performer_driven import run_performer_driven

    # NOTE: this call blocks the API request thread for the whole search.
    # Acceptable for a self-hosted single-user setup; move to a task queue if needed.
    counters_obj = run_performer_driven(
        performer_ids=[performer_id],
        top_n=0,
        per_performer_limit=200,
    )

    # Record the run on the performer row.
    perf.last_searched_at = _dt.now(_UTC)
    perf.search_run_count = (perf.search_run_count or 0) + 1
    session.commit()

    new_total = sum(s.get("new", 0) for s in counters_obj.per_source.values())
    return PerformerRefreshOut(
        performer_id=performer_id,
        canonical_name=perf.canonical_name,
        counters=counters_obj.per_source,
        new_scenes=new_total,
        last_searched_at=perf.last_searched_at.isoformat() if perf.last_searched_at else None,
    )
|
||||||
159
app/api/watch.py
Normal file
159
app/api/watch.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
"""Watch history + continue watching.
|
||||||
|
|
||||||
|
Single-user. Mobile pingu POST /scenes/{id}/progress przy:
|
||||||
|
- Klik Watch (position_sec=0) — wciąga scenę do recent watch
|
||||||
|
- Powrót z MX z ACTION_RESULT (gdy włączone EXTRA_RETURN_RESULT) — z faktyczną pozycją
|
||||||
|
|
||||||
|
Continue watching rail na home: GET /watch/recent?limit=10 → top scen po last_played_at,
|
||||||
|
filtruje dead-finished (>=95% lub flag finished). Mobile pokazuje progress bar
|
||||||
|
(position_sec / duration_sec).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy import desc, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.api.scenes import _build_scene_out
|
||||||
|
from app.api.schemas import SceneOut
|
||||||
|
from app.auth import require_api_key
|
||||||
|
from app.db import get_session
|
||||||
|
from app.models.play_progress import ScenePlayProgress
|
||||||
|
from app.models.scene import Scene
|
||||||
|
|
||||||
|
# Router for the watch-history endpoints; every route runs the require_api_key dependency.
router = APIRouter(tags=["watch"], dependencies=[Depends(require_api_key)])
|
||||||
|
|
||||||
|
|
||||||
|
# Request body of POST /scenes/{scene_id}/progress.
class ProgressIn(BaseModel):
    # Playback position in seconds; negative values are clamped to 0 by the handler.
    position_sec: int = 0
    # Total duration in seconds; when omitted the stored value is kept on update.
    duration_sec: int | None = None
    # Explicit "finished" flag; the handler also sets it at >= 95% of duration_sec.
    finished: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
# Stored progress row returned by POST /scenes/{scene_id}/progress.
class ProgressOut(BaseModel):
    scene_id: uuid.UUID
    position_sec: int
    duration_sec: int | None
    finished: bool
    last_played_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/scenes/{scene_id}/progress", response_model=ProgressOut)
def upsert_progress(
    scene_id: uuid.UUID,
    body: ProgressIn,
    session: Annotated[Session, Depends(get_session)],
) -> ProgressOut:
    """Create or update the playback progress of a scene.

    Args:
        scene_id: Scene the progress belongs to.
        body: Reported position, optional duration and optional finished flag.
        session: Database session.

    Returns:
        The progress row as stored by the upsert.

    Raises:
        HTTPException: 404 when the scene does not exist.
    """
    if session.get(Scene, scene_id) is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # PostgreSQL upsert — avoids the race of a get → add → commit sequence when
    # the client sends progress concurrently (which raised IntegrityError on
    # the primary key).
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    now = datetime.now(UTC)
    position_sec = max(0, body.position_sec)
    # Finished either explicitly or once 95% of a known, positive duration is reached.
    finished = body.finished or (
        bool(body.duration_sec)
        and body.duration_sec > 0
        and position_sec >= int(body.duration_sec * 0.95)
    )
    stmt = (
        pg_insert(ScenePlayProgress)
        .values(
            scene_id=scene_id,
            position_sec=position_sec,
            duration_sec=body.duration_sec,
            finished=finished,
            last_played_at=now,
        )
        .on_conflict_do_update(
            index_elements=["scene_id"],
            set_={
                "position_sec": position_sec,
                # duration_sec: keep the stored value when the body omits it.
                "duration_sec": (
                    body.duration_sec
                    if body.duration_sec is not None
                    else ScenePlayProgress.duration_sec
                ),
                "finished": finished,
                "last_played_at": now,
            },
        )
        # RETURNING hands back the stored row from the same statement. The
        # previous re-read after commit relied on `assert` (stripped under -O)
        # and could miss the row if it was deleted concurrently.
        .returning(
            ScenePlayProgress.position_sec,
            ScenePlayProgress.duration_sec,
            ScenePlayProgress.finished,
            ScenePlayProgress.last_played_at,
        )
    )
    saved = session.execute(stmt).one()
    session.commit()
    return ProgressOut(
        scene_id=scene_id,
        position_sec=saved.position_sec,
        duration_sec=saved.duration_sec,
        finished=saved.finished,
        last_played_at=saved.last_played_at,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/scenes/{scene_id}/progress",
    status_code=status.HTTP_204_NO_CONTENT,
)
def remove_progress(
    scene_id: uuid.UUID,
    session: Annotated[Session, Depends(get_session)],
) -> None:
    """Delete the stored playback progress of a scene, if there is any.

    A missing progress row is not an error; the endpoint answers 204 either way.
    """
    progress = session.get(ScenePlayProgress, scene_id)
    if progress is not None:
        session.delete(progress)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# One entry of the "recently watched" list: the scene payload plus the
# playback-progress columns copied from its ScenePlayProgress row.
class WatchEntry(BaseModel):
    scene: SceneOut
    # Playback position / total duration, in seconds; duration may be unknown.
    position_sec: int
    duration_sec: int | None
    finished: bool
    last_played_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
# Response envelope of GET /watch/recent.
class WatchListOut(BaseModel):
    items: list[WatchEntry]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/watch/recent", response_model=WatchListOut)
def list_recent(
    session: Annotated[Session, Depends(get_session)],
    limit: int = Query(default=10, ge=1, le=50),
    include_finished: bool = Query(default=False),
) -> WatchListOut:
    """Return the top-N scenes ordered by ``last_played_at`` descending.

    Finished scenes are skipped by default (the user does not want to see
    already-completed scenes in the "continue watching" rail); pass
    ``include_finished=true`` to keep them.
    """
    query = select(ScenePlayProgress, Scene).join(
        Scene, Scene.id == ScenePlayProgress.scene_id
    )
    if not include_finished:
        query = query.where(ScenePlayProgress.finished.is_(False))
    query = query.order_by(desc(ScenePlayProgress.last_played_at)).limit(limit)

    entries = [
        WatchEntry(
            scene=_build_scene_out(session, scene),
            position_sec=progress.position_sec,
            duration_sec=progress.duration_sec,
            finished=progress.finished,
            last_played_at=progress.last_played_at,
        )
        for progress, scene in session.execute(query).all()
    ]
    return WatchListOut(items=entries)
|
||||||
46
app/auth.py
Normal file
46
app/auth.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
"""API key authentication.
|
||||||
|
|
||||||
|
Klucz przyjmowany z header `X-API-Key` lub `Authorization: Bearer <key>`.
|
||||||
|
Gdy `settings.api_keys` jest puste — auth jest wyłączony (dev mode).
|
||||||
|
|
||||||
|
Dodatkowo (anti-tamper): gdy `ALLOWED_APP_SIG_HASH` jest ustawione, każdy request
|
||||||
|
musi zawierać `X-App-Signature` z SHA256 (hex) signing certu APK. Mismatch → 403.
|
||||||
|
Re-packaging APK innym keystorem (debug → release) wykryty natychmiast.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import Header, HTTPException, status
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
|
||||||
|
|
||||||
|
def require_api_key(
    x_api_key: str | None = Header(default=None, alias="X-API-Key"),
    authorization: str | None = Header(default=None),
    x_app_signature: str | None = Header(default=None, alias="X-App-Signature"),
) -> None:
    """FastAPI dependency enforcing the app-signature and API-key checks.

    Args:
        x_api_key: Value of the ``X-API-Key`` header, if present.
        authorization: Value of the ``Authorization`` header; only the
            ``Bearer <key>`` form is recognised, and only when ``X-API-Key``
            is absent/empty.
        x_app_signature: Value of the ``X-App-Signature`` header.

    Raises:
        HTTPException: 403 when the signature check is enabled and the
            normalised signature is missing or not whitelisted; 401 when API
            key auth is enabled and no valid key was presented.
    """
    # Local import keeps this function self-contained (stdlib only).
    import hmac

    settings = get_settings()

    if settings.app_sig_check_enabled:
        # Same normalisation as Settings.allowed_app_sig_hashes: trimmed,
        # lowercase hex, colon separators removed.
        sig = (x_app_signature or "").strip().lower().replace(":", "")
        if not sig or sig not in settings.allowed_app_sig_hashes:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="invalid or missing app signature",
            )

    if not settings.auth_enabled:
        return  # local/dev — everything is open

    candidate: str | None = None
    if x_api_key:
        candidate = x_api_key.strip()
    elif authorization and authorization.lower().startswith("bearer "):
        candidate = authorization[7:].strip()

    authorized = False
    if candidate:
        # Compare as bytes: compare_digest rejects non-ASCII str arguments,
        # and header values are attacker-controlled.
        presented = candidate.encode("utf-8")
        for key in settings.api_keys:
            # Constant-time comparison; deliberately no early exit so the
            # loop does the same work whichever key (if any) matches.
            authorized |= hmac.compare_digest(presented, key.encode("utf-8"))

    if not authorized:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="invalid or missing API key",
            headers={"WWW-Authenticate": "Bearer"},
        )
|
||||||
116
app/config.py
Normal file
116
app/config.py
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    # Application configuration, populated from environment variables and the
    # `.env` file. Unknown variables are ignored; names are case-insensitive.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore", case_sensitive=False)

    database_url: str = Field(
        default="postgresql+psycopg://goon:goon@localhost:5432/goon",
        validation_alias="DATABASE_URL",
    )

    tpdb_api_token: str | None = Field(default=None, validation_alias="TPDB_API_TOKEN")
    tpdb_base_url: str = Field(
        default="https://api.theporndb.net",
        validation_alias="TPDB_BASE_URL",
    )

    stashdb_api_key: str | None = Field(default=None, validation_alias="STASHDB_API_KEY")
    stashdb_graphql_url: str = Field(
        default="https://stashdb.org/graphql",
        validation_alias="STASHDB_GRAPHQL_URL",
    )

    log_level: str = Field(default="INFO", validation_alias="LOG_LEVEL")

    # Sentry observability — an empty DSN makes init a no-op (dev/local). The
    # cloud free tier (5k errors/month) is enough for a single-user app.
    sentry_dsn: str | None = Field(default=None, validation_alias="SENTRY_DSN")
    sentry_environment: str = Field(default="dev", validation_alias="SENTRY_ENVIRONMENT")
    sentry_traces_sample_rate: float = Field(
        default=0.1,
        validation_alias="SENTRY_TRACES_SAMPLE_RATE",
    )

    # Comma-separated list of API keys. Empty = auth disabled (dev/local only).
    api_keys_raw: str = Field(default="", validation_alias="API_KEYS")
    """Lista API keys oddzielona przecinkami. Pusta = auth wyłączony (tylko dev/local)."""

    # Whitelist of SHA256 (hex) APK signing-certificate hashes, comma-separated.
    # Every mobile request sends `X-App-Signature`; empty = check disabled.
    # Re-packaging the APK with another keystore changes the hash → 403.
    allowed_app_sig_hashes_raw: str = Field(default="", validation_alias="ALLOWED_APP_SIG_HASH")
    """Whitelist SHA256 (hex) podpisów APK akceptowane przez backend. Każdy request mobile
    wysyła `X-App-Signature` z hashem signing certu (PackageManager.GET_SIGNING_CERTIFICATES).
    Pusta = check wyłączony (dev/wstępny rollout). Lista = comma-separated lowercase hex.
    Re-packaging APK innym keystorem zmienia hash → 403."""

    auto_merge_threshold: float = 0.92
    review_threshold: float = 0.75
    fingerprint_hamming_max: int = 5
    title_token_set_min: int = 88
    date_window_days: int = 7

    # APScheduler (M5). Per the original note, a value of 0/None disables the job.
    sched_tpdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_TPDB_HOURS")
    sched_stashdb_hours: int = Field(default=6, validation_alias="GOON_SCHED_STASHDB_HOURS")
    sched_performer_driven_hours: int = Field(
        default=12,
        validation_alias="GOON_SCHED_PERFORMER_DRIVEN_HOURS",
    )
    sched_performer_driven_top_n: int = Field(
        default=20,
        validation_alias="GOON_SCHED_PERFORMER_DRIVEN_TOP_N",
    )
    # Continuous worker. interval=15s + max_instances=1 + coalesce=True ⇒ effective rate
    # = max(15, real_tick_duration). A real tick takes ~50-80s at full coverage.
    # Set to 0 to disable.
    sched_performer_continuous_seconds: int = Field(
        default=15,
        validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_SECONDS",
    )
    sched_performer_continuous_refresh_days: int = Field(
        default=30,
        validation_alias="GOON_SCHED_PERFORMER_CONTINUOUS_REFRESH_DAYS",
    )
    # Movie ingest — paradisehill (primary) + dooplay mirrors (mangoporn/streamporn/
    # pandamovies). Each connector records its own `Source` and runs a delta from its
    # last successful run. Set to 0 to disable. Default 24h: movie sites grow more
    # slowly than tubes (~5-30 new items a day), so sweeping more often is pointless.
    sched_movie_ingest_hours: int = Field(
        default=24,
        validation_alias="GOON_SCHED_MOVIE_INGEST_HOURS",
    )
    # Browse-latest scheduler: freshporno/porn00/pornxp newest scenes, once a day.
    sched_browse_latest_hours: int = Field(
        default=24,
        validation_alias="GOON_SCHED_BROWSE_LATEST_HOURS",
    )
    sched_browse_latest_max_pages: int = Field(
        default=5,
        validation_alias="GOON_SCHED_BROWSE_LATEST_MAX_PAGES",
    )

    # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
    # in the Hetzner Cloud panel). Without a token the monitor is disabled (warning
    # in the log). Free traffic per server: CX22=20TB, CPX21=20TB etc. Overage = €1/TB.
    hetzner_api_token: str | None = Field(default=None, validation_alias="HETZNER_API_TOKEN")
    hetzner_server_id: int | None = Field(default=None, validation_alias="HETZNER_SERVER_ID")
    # Alert thresholds (% of included_traffic) — Sentry severity levels.
    hetzner_alert_info_pct: int = Field(default=50, validation_alias="HETZNER_ALERT_INFO_PCT")
    hetzner_alert_warning_pct: int = Field(default=80, validation_alias="HETZNER_ALERT_WARNING_PCT")
    hetzner_alert_error_pct: int = Field(default=95, validation_alias="HETZNER_ALERT_ERROR_PCT")

    @property
    def api_keys(self) -> set[str]:
        """Return the configured API keys: trimmed, with blank entries dropped."""
        trimmed = (part.strip() for part in self.api_keys_raw.split(","))
        return {key for key in trimmed if key}

    @property
    def auth_enabled(self) -> bool:
        """Return True when at least one API key is configured."""
        return len(self.api_keys) > 0

    @property
    def allowed_app_sig_hashes(self) -> set[str]:
        """Return the whitelisted signature hashes, lowercased and without colons."""
        hashes: set[str] = set()
        for entry in self.allowed_app_sig_hashes_raw.split(","):
            trimmed = entry.strip()
            if trimmed:
                hashes.add(trimmed.lower().replace(":", ""))
        return hashes

    @property
    def app_sig_check_enabled(self) -> bool:
        """Return True when at least one signature hash is whitelisted."""
        return len(self.allowed_app_sig_hashes) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_settings() -> Settings:
    """Return the `Settings` instance, built once and then served from the cache."""
    settings = Settings()
    return settings
|
||||||
48
app/connectors/__init__.py
Normal file
48
app/connectors/__init__.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
"""Connector registry helpers.
|
||||||
|
|
||||||
|
Lazy factories — importy connectorów wykonują się dopiero w `get_movie_connectors()`
|
||||||
|
żeby uniknąć circular imports (models/db). Każdy entry: `(name, class)` w porządku
|
||||||
|
ingestu (primary FIRST, mirrory potem — `resolve_movie` wtedy ma do czego dokleić
|
||||||
|
mirror playback sources).
|
||||||
|
|
||||||
|
## Jak dodać nowe movie site
|
||||||
|
|
||||||
|
1. Napisz subclass `DooplayConnector` w `app/connectors/dooplay.py` (jeśli site używa
|
||||||
|
dooplay/PsyPlay WP theme) — wystarczy `name` + `base_url`. Jeśli inny theme,
|
||||||
|
napisz osobny connector implementujący `BaseMovieConnector.fetch_movies()`.
|
||||||
|
2. Dodaj entry do listy zwracanej przez `get_movie_connectors()` poniżej.
|
||||||
|
3. Backend job `_job_movie_ingest` w `app/scheduler/jobs.py` automatycznie weźmie
|
||||||
|
nowy connector przy następnym tick (24h domyślnie).
|
||||||
|
4. Do ad-hoc backfillu: `python -m app.scheduler.worker --once --strategy=movies
|
||||||
|
--performers=<nowa_nazwa>`.
|
||||||
|
|
||||||
|
## Czemu paradisehill first
|
||||||
|
|
||||||
|
Paradisehill jest jedynym sourcem z chapter markerami i pełnym metadata (director,
|
||||||
|
rating, country) → idealnie kanoniczny. Dooplay mirrory rzadko mają chaptery i
|
||||||
|
release_year zwykle pusty. Resolver `resolve_movie` po title-similarity matchuje
|
||||||
|
mirror → primary paradisehill, dodając tylko playback sources (mangoporn:luluvid,
|
||||||
|
:voe, …) które rozpakowują się na bezpośredni stream URL przez
|
||||||
|
`extract_stream_from_hoster`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def get_movie_connectors() -> list[tuple[str, type]]:
    """Return the ``(name, ConnectorCls)`` tuples in ingest order.

    The connector modules are imported lazily, inside the function, to avoid
    a circular import: the connectors themselves pull in db/models.
    """
    from app.connectors.paradisehill import ParadisehillConnector
    from app.connectors.dooplay import (
        MangopornConnector,
        PandamoviesConnector,
        StreampornConnector,
    )

    # Primary source first, mirrors afterwards.
    connectors: list[tuple[str, type]] = [
        ("paradisehill", ParadisehillConnector),
        ("streamporn", StreampornConnector),
        ("pandamovies", PandamoviesConnector),
        ("mangoporn", MangopornConnector),
    ]
    return connectors
|
||||||
187
app/connectors/base.py
Normal file
187
app/connectors/base.py
Normal file
|
|
@ -0,0 +1,187 @@
|
||||||
|
"""Kontrakt connectora źródła + neutralne DTO surowych rekordów.
|
||||||
|
|
||||||
|
Connector odpowiada za: paginację, retry, autoryzację, deltę. Zwraca strumień RawScene
|
||||||
|
(z ewentualnymi pre-rozwiniętymi performerami/studiem/tagami w polach inline). Cała
|
||||||
|
mechanika DB i normalizacji żyje wyżej w pipeline'ie ingest.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import abc
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import date, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
|
||||||
|
# Raw tag as reported by a source; unknown extra keys are kept (extra="allow").
class RawTag(BaseModel):
    model_config = ConfigDict(extra="allow")

    external_id: str | None = None
    name: str
    slug: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Raw studio as reported by a source; unknown extra keys are kept (extra="allow").
class RawStudio(BaseModel):
    model_config = ConfigDict(extra="allow")

    external_id: str | None = None
    name: str
    slug: str | None = None
    # Optional parent-studio / network information.
    parent_external_id: str | None = None
    parent_name: str | None = None
    network: str | None = None
    homepage_url: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Raw performer as reported by a source; unknown extra keys are kept (extra="allow").
class RawPerformer(BaseModel):
    model_config = ConfigDict(extra="allow")

    external_id: str | None = None
    name: str
    aliases: list[str] = Field(default_factory=list)
    gender: str | None = None
    birth_date: date | None = None
    country: str | None = None
    # Name used in this particular scene (e.g. "Mia M.").
    as_alias_in_scene: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# A single content fingerprint attached to a raw scene.
class RawFingerprint(BaseModel):
    # Fingerprint algorithm: phash | oshash | md5
    kind: str
    value: str
|
||||||
|
|
||||||
|
|
||||||
|
class RawPlaybackSource(BaseModel):
    """Link for playing a scene from a specific tube/aggregator."""

    model_config = ConfigDict(extra="allow")

    # Short source name, e.g. 'tube:hqpornercom', 'mangoporn:doodstream'.
    origin: str
    """Krótka nazwa źródła, np. 'tube:hqpornercom', 'mangoporn:doodstream'."""

    # URL of the tube page that hosts the player (deep link).
    page_url: str
    """URL strony tube'a z player'em (deep link)."""

    embed_url: str | None = None
    stream_url: str | None = None
    quality: str | None = None
    duration_sec: int | None = None
    thumbnail_url: str | None = None
    animated_thumbnail_url: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Raw scene as yielded by a connector, with performers/studio/tags inlined.
class RawScene(BaseModel):
    model_config = ConfigDict(extra="allow")

    external_id: str
    title: str
    description: str | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    code: str | None = None
    director: str | None = None
    url: str | None = None

    studio: RawStudio | None = None
    performers: list[RawPerformer] = Field(default_factory=list)
    tags: list[RawTag] = Field(default_factory=list)
    fingerprints: list[RawFingerprint] = Field(default_factory=list)
    playback_sources: list[RawPlaybackSource] = Field(default_factory=list)

    # source_name → external_id mapping declared by this source. Used for path 2
    # in the resolver (cross-source UUID match). Keys match `Source.name` in the
    # DB (e.g. 'tpdb', 'stashdb').
    cross_source_refs: dict[str, str] = Field(default_factory=dict)
    """Mapowanie source_name → external_id deklarowane przez to źródło. Używane do path 2
    w resolverze (cross-source UUID match). Klucz zgadza się z `Source.name` w DB
    (np. 'tpdb', 'stashdb')."""

    # Original API payload — goes into external_records.raw.
    raw: dict[str, Any] = Field(default_factory=dict)
    """Oryginalny payload z API — leci do external_records.raw."""
|
||||||
|
|
||||||
|
|
||||||
|
class BaseConnector(abc.ABC):
    """Base class every scene source inherits from. `kind` maps 1:1 onto SourceKind in the DB."""

    kind: SourceKind
    name: str

    @abc.abstractmethod
    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes one at a time.

        `since` is an optional delta filter; without it the connector falls
        back to a full fetch.
        """
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Movies — odrębna encja od scen, ale ten sam wzorzec connectorów
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class RawMovieChapter(BaseModel):
    """A single movie chapter (movies are sometimes split into "Part 1/2/3" etc.).

    Chapter identifiers are not canonicalised across sources — they are local to
    the movie and indexed by `chapter_index`. A chapter may link to a separate
    scene (if that scene is known from TPDB/StashDB) — the normaliser higher up
    takes care of that."""

    model_config = ConfigDict(extra="allow")

    chapter_index: int
    title: str | None = None
    # Chapter boundaries, in seconds.
    start_sec: int | None = None
    end_sec: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class RawMovie(BaseModel):
    """Raw movie from a connector — the RawScene counterpart for movies.

    Performers / studio / tags reuse RawPerformer / RawStudio / RawTag (the same
    types in both pipelines). Playback sources are the list of playback mirrors
    (paradisehill primary, possibly other tubes).
    """

    model_config = ConfigDict(extra="allow")

    external_id: str
    title: str
    description: str | None = None
    release_year: int | None = None
    release_date: date | None = None
    duration_sec: int | None = None
    director: str | None = None
    country: str | None = None
    rating: float | None = None
    poster_url: str | None = None
    backdrop_url: str | None = None
    url: str | None = None

    studio: RawStudio | None = None
    performers: list[RawPerformer] = Field(default_factory=list)
    tags: list[RawTag] = Field(default_factory=list)
    chapters: list[RawMovieChapter] = Field(default_factory=list)
    playback_sources: list[RawPlaybackSource] = Field(default_factory=list)

    cross_source_refs: dict[str, str] = Field(default_factory=dict)

    raw: dict[str, Any] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseMovieConnector(abc.ABC):
    """Connector for a movie source (paradisehill, psyplay, wp_movies).

    Symmetric to BaseConnector, but yields RawMovie. Each source knows its own
    pagination and ID format — the converter higher up (the resolver) handles
    dedup across sources.
    """

    kind: SourceKind
    name: str

    @abc.abstractmethod
    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Yield movies one at a time. `since` is optional; fallback is a full crawl."""
        raise NotImplementedError
|
||||||
166
app/connectors/direct_scrapers/__init__.py
Normal file
166
app/connectors/direct_scrapers/__init__.py
Normal file
|
|
@ -0,0 +1,166 @@
|
||||||
|
"""Direct tube scrapers.
|
||||||
|
|
||||||
|
Każdy scraper hit'uje tube bezpośrednio HTTPm — różne tube'y to różne rate limit
|
||||||
|
budgets, więc mogą iść równolegle. Wszystkie feedują sceny do tej samej
|
||||||
|
`Source(name='pornapp')` (legacy nazwa — kept for DB compat) z external_id
|
||||||
|
`f"{sitetag}:{url}"`. Resolver mergeuje idempotentnie po tym kluczu.
|
||||||
|
|
||||||
|
Search-based ścieżka (per performer name); category browse'ng przez `categoriesUrl`
|
||||||
|
overrides w pornapp connector był specyficzny dla porn-app API i zostanie usunięty.
|
||||||
|
|
||||||
|
UWAGA — speculative scrapers: większość aggregator + special tubes (xmoviesforyou,
|
||||||
|
watchporn, siska, porn4days, porndish, xxxfreewatch, latestleaks, mypornerleak,
|
||||||
|
porndittcom, perverzija, fpoxxx, ...) ma URL templates + regex'y oparte na typowych
|
||||||
|
WordPress conventions. Wymagają post-deploy verification — gdy któryś nie zwraca
|
||||||
|
wyników, sprawdź real search HTML + popraw template/regex w odpowiednim pliku.
|
||||||
|
"""
|
||||||
|
from app.connectors.direct_scrapers._browse_base import BaseBrowseScraper
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.connectors.direct_scrapers.eporner import EpornerScraper
|
||||||
|
from app.connectors.direct_scrapers.fpoxxx import FpoxxxScraper
|
||||||
|
from app.connectors.direct_scrapers.hdporn92 import HDPorn92Scraper # noqa: F401 — kept for backref; disabled
|
||||||
|
from app.connectors.direct_scrapers.hqporner import HQPornerScraper
|
||||||
|
from app.connectors.direct_scrapers.latestleaks import LatestLeaksScraper
|
||||||
|
from app.connectors.direct_scrapers.latestpornvideo import LatestPornVideoScraper
|
||||||
|
from app.connectors.direct_scrapers.mypornerleak import MyPornerLeakScraper
|
||||||
|
from app.connectors.direct_scrapers.perverzija import PerverzijaScraper
|
||||||
|
from app.connectors.direct_scrapers.porn4days import Porn4DaysScraper
|
||||||
|
from app.connectors.direct_scrapers.pornditt import PornDittScraper
|
||||||
|
from app.connectors.direct_scrapers.porndish import PornDishScraper
|
||||||
|
from app.connectors.direct_scrapers.pornhat import PornHatScraper # noqa: F401 — kept for backref; ingest disabled
|
||||||
|
from app.connectors.direct_scrapers.pornhub import PornHubScraper
|
||||||
|
from app.connectors.direct_scrapers.porntrex import PornTrexScraper
|
||||||
|
from app.connectors.direct_scrapers.redtube import RedTubeScraper
|
||||||
|
from app.connectors.direct_scrapers.siska import SiskaScraper
|
||||||
|
from app.connectors.direct_scrapers.sxyland import SxyLandScraper
|
||||||
|
from app.connectors.direct_scrapers.sxyprn import SxyPrnScraper
|
||||||
|
from app.connectors.direct_scrapers.watchporn import WatchPornScraper
|
||||||
|
from app.connectors.direct_scrapers.xhamster import XHamsterScraper
|
||||||
|
from app.connectors.direct_scrapers.xmoviesforyou import XMoviesForYouScraper
|
||||||
|
from app.connectors.direct_scrapers.xnxx import XnxxScraper
|
||||||
|
from app.connectors.direct_scrapers.xvideos import XVideosScraper
|
||||||
|
from app.connectors.direct_scrapers.xxxfreewatch import XxxFreeWatchScraper # noqa: F401 — kept for backref; delisted
|
||||||
|
from app.connectors.direct_scrapers.youporn import YouPornScraper
|
||||||
|
from app.connectors.direct_scrapers.zerodayxx import ZeroDayXXScraper
|
||||||
|
|
||||||
|
# Search-mode scrapers currently enabled for ingest. Disabled scrapers are kept
# as comments together with the reason they were switched off.
ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
    # Existing 4 (verified, in production)
    HQPornerScraper,
    # HDPorn92Scraper — disabled 2026-05-18. Scene pages are an SEO shell: ZERO player
    # iframe (only happyleafmotion ads), JS hijacks every click → `go.rmishe.com/smartpop/...`
    # popunder redirect. Mobile WebView page-as-hoster shows the ad redirect instead of
    # the video. 33,598 playback_sources mass-marked dead, 27,374 solo-orphan scenes deleted.
    SxyLandScraper,
    # ZeroDayXXScraper — disabled 2026-05-12 (source quality report): 25,596 scenes, 0.1%
    # canonical match. Titles are slug words joined by spaces, without a `[Studio]` or
    # `Studio - Perf - Title` prefix (parse rate 3%) → the resolver has no signal to match
    # on. Wraps watchporn but inherits the stripped metadata. Solo orphans removed
    # (~21k scenes) — the scraper file + extractor stay (existing playback_sources
    # still resolve).
    # Mainstream (URL templates well-known)
    # PornHubScraper — disabled 2026-05-12 (source analysis): 23,750 scenes scraped,
    # only 105 (0.4%) match TPDB/StashDB. PH mostly hosts its own shortened clips +
    # amateur uploads — they never match canonical studio content. The file stays
    # (the `pornhubcom` extractor uses it in playback resolve for existing
    # playback_sources).
    # RedTubeScraper — disabled 2026-05-12 (source analysis): 20,127 scenes, 82 matches
    # (0.4%). Same reasons as PH (shortened clips + amateur uploads).
    XVideosScraper,
    XnxxScraper,
    XHamsterScraper,
    YouPornScraper,
    PornTrexScraper,
    EpornerScraper,
    # Aggregators (WordPress-like ?s= search; speculative — verify post-deploy)
    # XMoviesForYouScraper — disabled 2026-05-12 (post audit fix). 100% of scenes serve
    # streamtape (DEAD_HOSTER_RE — malware drive-by .reg) + optionally playmogo/mixdrop.
    # Mixdrop rebranded to m1xdrop.bz, yt-dlp is out of date, packer/JS extract = fail.
    # Playmogo = DoodStream CAPTCHA. Porn-app itself ignores xmoviesforyou (no handler
    # in jadx). 1,321 solo-orphan scenes.
    # WatchPornScraper — disabled 2026-05-12 (user bug report). All iframes are
    # DoodStream variants (playmogo/d0000d/dooood/mivalyo) behind a CAPTCHA gate. WebView
    # on mobile = black screen (the player JS does not initialise through Turnstile).
    # 16% of scenes are solo (no backup tube), 84% multi-source — the user can use
    # another tube. yt-dlp does not support DoodStream ("Piracy"); own resolver TBD
    # if worth it.
    # SiskaScraper — disabled 2026-05-16 (filemoon shutdown). Every siska scene embeds
    # a filemoon iframe; since ~2026-05 filemoon.to/sx/nl serve a placeholder
    # "Byse Frontend" SPA without player JS. 14,839 playback_sources mass-marked dead.
    # The scraper file + extractor stay (mobile tries to resolve → DEAD_HOSTER_RE
    # filemoon blacklist → None → 503 — fine, those scenes are dead_at-filtered too).
    # Porn4DaysScraper — disabled 2026-05-12 (post audit fix). 100% of scenes are on
    # streamtape only (DEAD_HOSTER_RE blacklist — malware drive-by .reg downloads).
    # SERVER1_URL = streamtape, no SERVER2/SERVER3 backup. Porn-app itself ignores
    # porn4days. 10,346 solo-orphan scenes.
    PornDishScraper,
    # XxxFreeWatchScraper — disabled 2026-05-18. 790 scenes, 0% canonical match, 100%
    # solo-orphan. Cloudflare 403 from the VPS IP; mobile WebView works in theory, but
    # 0/790 scenes had any match to TPDB/StashDB. Pure orphan factory. Solo scenes
    # deleted, scraper disabled.
    LatestPornVideoScraper,
    # LatestLeaksScraper — disabled 2026-05-12 (source quality report): 16,438 scenes,
    # 0.0% canonical match. Slug-concatenated titles, no studio/duration/date signals.
    # Solo orphans removed (~15k scenes).
    MyPornerLeakScraper,
    # Added 2026-05-12 (theporndude survey): one of the 14 free tubes on the list that
    # returns consistent search results. KVS engine, slug-aware scene URLs. Mostly
    # orphan ingest (auto-screenshots, no canonical phash match — verified), but it
    # may catch scenes of popular performers that we do not have in TPDB yet.
    # PornHatScraper — disabled 2026-05-18. 9,799 scenes, 0.2% canonical match, 100%
    # solo-orphan. Pure orphan factory — auto-screenshot thumbs do not phash-match
    # canonical, slug titles do not match via rapidfuzz, no duration/date signals.
    # KEEP the `pornhatcom` extractor and the existing playback_sources alive — mobile
    # can still play them; only future ingest is disabled.
    # PornDittScraper — disabled 2026-05-12 (bug report 64356e9b). Every link produced
    # a new Scene row instead of matching an existing canonical one (TPDB/StashDB)
    # because pornditt has a weak signal: title + part of the performer, no
    # fingerprint/duration/date → composite_score always below the auto_merge
    # threshold (0.92). The scraper file + extractor stay (existing playback_sources
    # still resolve; _REGISTRY in app/extractors/__init__.py dispatches
    # `porndittcom` → _embed_iframe.extract). Re-enabling requires either an
    # "alternative-source mode" in the resolver (match-only, never create new)
    # or richer metadata extraction (duration + fingerprint).
    # Special
    SxyPrnScraper,
    PerverzijaScraper,
    FpoxxxScraper,
]
|
||||||
|
|
||||||
|
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
||||||
|
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
|
||||||
|
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
|
||||||
|
# raz dziennie, pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
|
||||||
|
# `app/scheduler/browse_latest.py`.
|
||||||
|
#
|
||||||
|
# **Pilot results (2026-05-12):**
|
||||||
|
# - ShyfapScraper: 0/23 match (0%) — robi własne thumbnails ≠ canonical
|
||||||
|
# (phash Hamming 12-16). Plus rebranduje tytuły. **Wyłączony.**
|
||||||
|
# - FreshpornoScraper: 39/59 match (66%) — hot-linkuje studio thumbnaile
|
||||||
|
# (phash Hamming 0). Oryginalne tytuły + channels=studio 1:1. **Aktywny.**
|
||||||
|
from app.connectors.direct_scrapers.freshporno import FreshpornoScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.porn00 import Porn00Scraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.pornxp import PornXPScraper # noqa: E402
|
||||||
|
from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F401
|
||||||
|
|
||||||
|
# Browse-mode scrapers currently enabled; pilot results are recorded next to each entry.
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
    FreshpornoScraper,
    # PornXPScraper — pilot 2026-05-17 (20 scenes): studio 100%, performer 95%,
    # release_date 100%, duration 100%, stream_url 100%, phash 100%. Best signals
    # among the browse-mode scrapers. Direct mp4 stream (sv.porn-xp.com) at
    # 360/720 quality. Release year from `Released: <year>` on the detail page.
    PornXPScraper,
    # Porn00Scraper — pilot 2026-05-17 (16 scenes): no studio (0%) + no release
    # date (0%) BUT performer 100%, duration 100%, stream_url 100% (KVS video_alt_url
    # 720p). The title keeps the studio prefix ("Studio Title - Scene Name") → title
    # fuzzy match (rapidfuzz token_set_ratio) may catch the canonical scene. Monitor.
    Porn00Scraper,
    # ShyfapScraper — disabled 2026-05-12 (pilot fail, 0% match — orphan factory).
    # Follow-up: add these tubes and check the phash distance:
    # - fullmovies.xxx (channel/network/pornstars/categories, no duration)
    # - 4k69.com + hdporn.gg (freshporno clones — probably the same phash hit rate)
]

# Public API of the package.
__all__ = [
    "BaseDirectTubeScraper",
    "BaseBrowseScraper",
    "ALL_DIRECT_SCRAPERS",
    "ALL_BROWSE_SCRAPERS",
]
|
||||||
195
app/connectors/direct_scrapers/_browse_base.py
Normal file
195
app/connectors/direct_scrapers/_browse_base.py
Normal file
|
|
@ -0,0 +1,195 @@
|
||||||
|
"""BaseBrowseScraper — latest-vids browse mode (vs search-by-performer).
|
||||||
|
|
||||||
|
Wzorzec: tube'y typu shyfap/freshporno/porn00/fullmovies/pornxp mają bogatą
|
||||||
|
metadata (title, studio, performers, tags, duration, release_date, description)
|
||||||
|
na detail page'u — wystarczy do canonical fuzzy match w resolverze. Browse mode
|
||||||
|
iteruje "latest" page (sorted by upload date) i fetchuje detail per scene.
|
||||||
|
|
||||||
|
Różnica vs `BaseSearchScraper`:
|
||||||
|
- **search**: tube wyszukuje sceny po performer name (dla performer-driven
|
||||||
|
backfill). Wymaga znanego performera.
|
||||||
|
- **browse**: tube listuje newest scenes (latest-vids endpoint). Nie wymaga
|
||||||
|
żadnego query — chodzi o świeże sceny independent of performer state.
|
||||||
|
|
||||||
|
Browse jest komplementarny do search:
|
||||||
|
- search łapie sceny dla **znanych performerów** (TPDB/StashDB → tube)
|
||||||
|
- browse łapie **świeże sceny** których performer może być new dla nas
|
||||||
|
(nowicjuszka w branży nie jeszcze w TPDB → mamy ją z browse → później
|
||||||
|
canonical TPDB ingest mergeuje)
|
||||||
|
|
||||||
|
Subclass dostarcza HTML parsing (listing → scene URLs + detail → RawScene).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import abc
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
    """Base class for "latest videos" browse-mode scrapers.

    Subclasses supply listing/detail parsing; this class drives the flow::

        for page in 1..max_pages:
            GET _listing_url(page)
            extract scene URLs
            for each URL:
                GET the scene detail page
                parse it into a RawScene
                yield it
    """

    # HTTP timeout in seconds, passed to every `browser_get` call.
    _timeout: float = 30.0

    @abc.abstractmethod
    def _listing_url(self, page: int) -> str:
        """Return the URL of the 'latest videos' listing page (page 1 = newest)."""

    @abc.abstractmethod
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs found in the listing HTML, newest first."""

    @abc.abstractmethod
    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a RawScene with its metadata.

        Return None when the scene is unavailable or cannot be parsed; the
        caller then skips this URL instead of aborting the whole browse run.
        """

    def _fetch_html(self, url: str) -> str:
        """Fetch ``url`` and return the response body.

        Raises:
            RuntimeError: when the response exposes a ``status_code`` other
                than 200. Without this check an error or anti-bot challenge
                page would be handed to the parsers as if it were real content
                (``BaseSearchScraper.search`` applies the same 200-only rule).
            Exception: whatever ``browser_get`` itself raises.
        """
        res = browser_get(url, timeout=self._timeout)
        # `browser_get` may return a response-like object or plain text; only
        # response-like objects carry a status to validate.
        status = getattr(res, "status_code", None)
        if status is not None and status != 200:
            raise RuntimeError(f"HTTP {status}")
        return res.text if hasattr(res, "text") else res

    def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]:
        """Yield scenes newest-first from listing pages 1..max_pages.

        Per the original author's notes, the default ``max_pages=5`` gives
        roughly 100 scenes per tube per run (about 20 scenes per page), the job
        is scheduled daily, and de-duplication by ``external_id`` happens
        downstream in the resolver, so re-yielding the same scenes on
        consecutive days is safe.

        A failed or empty listing page stops the run; a failed detail fetch or
        parse only skips that one scene.
        """
        for page in range(1, max_pages + 1):
            url = self._listing_url(page)
            try:
                html = self._fetch_html(url)
            except Exception as e:
                log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, e)
                break

            urls = self._extract_scene_urls(html)
            if not urls:
                log.info("%s browse: empty listing page %d, stopping", self.sitetag, page)
                break

            log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(urls))
            for scene_url in urls:
                try:
                    detail_html = self._fetch_html(scene_url)
                except Exception as e:
                    log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, e)
                    continue

                try:
                    raw = self._parse_detail(scene_url, detail_html)
                except Exception as e:
                    log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, e)
                    continue

                if raw is not None:
                    yield raw

    # `BaseDirectTubeScraper.search` is abstract, so a concrete stub is needed
    # for browse-only tubes to be instantiable. Performer-driven search is not
    # supported here; the real work goes through `latest_scenes()`. Tubes that
    # want both modes can override `search()` themselves.
    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Return an empty iterator (browse-only scrapers do not search)."""
        return iter(())
|
||||||
|
|
||||||
|
|
||||||
|
# Compiled <meta> lookup patterns, keyed by "prop:<property>" or "name:<name>";
# filled lazily by meta_content().
_META_RE_CACHE: dict[str, re.Pattern[str]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
# Browser-like User-Agent sent with thumbnail downloads.
_PHASH_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)


def compute_thumbnail_phash(thumbnail_url: str, *, referer: str | None = None, timeout: float = 15.0) -> str | None:
    """Download a thumbnail and return its perceptual hash as a hex string.

    Uses ``imagehash.phash`` with its default hash size (per the original
    notes: 8x8 → 64 bits → 16 hex characters, the format stored in
    ``SceneFingerprint.value``).

    ``imagehash`` and ``PIL`` are imported lazily so this module stays
    importable when they are missing; in that case, as on any download or
    decode failure, the function returns None.

    Args:
        thumbnail_url: Image URL to download.
        referer: Optional ``Referer`` header value.
        timeout: HTTP timeout in seconds.
    """
    try:
        from PIL import Image

        import imagehash
    except ImportError:
        log.warning("imagehash/Pillow nie zainstalowane — phash skipped")
        return None

    request_headers = {"User-Agent": _PHASH_UA}
    if referer:
        request_headers["Referer"] = referer

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            response = client.get(thumbnail_url, headers=request_headers)
            if response.status_code == 200 and response.content:
                picture = Image.open(io.BytesIO(response.content))
                # convert("RGB") normalises the image mode before hashing; per
                # the original note it also takes the first frame of
                # multi-frame (animated) images.
                return str(imagehash.phash(picture.convert("RGB")))
            return None
    except Exception as exc:
        log.info("phash compute failed for %s: %s", thumbnail_url, exc)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def meta_content(html: str, *, property: str | None = None, name: str | None = None) -> str | None:
|
||||||
|
"""Wyciąga zawartość <meta property=X content=Y> lub <meta name=X content=Y>.
|
||||||
|
|
||||||
|
Standardowy helper dla scraperów które używają OpenGraph / ya:ovs / itp.
|
||||||
|
Cache compiled regex w module-scope dict (te same selectory powtarzają się).
|
||||||
|
|
||||||
|
NB: separate patterns dla `"..."` i `'...'` content quote — wcześniej jeden
|
||||||
|
`[^"\']*` regex tnął title po wewnętrznym apostrofie (np. `<meta content="She's So Insatiable">`
|
||||||
|
→ `She`, bug-report 2026-05-20). Teraz matchujemy dokładnie ten sam quote co opening.
|
||||||
|
"""
|
||||||
|
key = f"prop:{property}" if property else f"name:{name}"
|
||||||
|
if key not in _META_RE_CACHE:
|
||||||
|
attr = "property" if property else "name"
|
||||||
|
val = re.escape(property or name or "")
|
||||||
|
# double-quoted content (HTML standard) — preferred
|
||||||
|
# Pattern: <meta property="X" content="...inner..." > — inner allows apostrophes
|
||||||
|
_META_RE_CACHE[key] = re.compile(
|
||||||
|
rf'<meta[^>]+{attr}=["\']{val}["\'][^>]*?content="([^"]*)"'
|
||||||
|
rf'|<meta[^>]+{attr}=["\']{val}["\'][^>]*?content=\'([^\']*)\'',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
m = _META_RE_CACHE[key].search(html)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
val = m.group(1) if m.group(1) is not None else m.group(2)
|
||||||
|
return val.strip() if val else None
|
||||||
238
app/connectors/direct_scrapers/_search_base.py
Normal file
238
app/connectors/direct_scrapers/_search_base.py
Normal file
|
|
@ -0,0 +1,238 @@
|
||||||
|
"""BaseSearchScraper — shared search-page HTML scraping logika.
|
||||||
|
|
||||||
|
Pattern used by all tube discovery scrapers:
1. Build the search URL from `_search_url_template` (formatted with query + page).
2. Fetch the HTML (curl_cffi).
3. Match `_scene_url_re` (named group `url`, or group(1), is the scene URL;
   the optional named group `slug` is used to derive the title).
4. Filter results by query tokens (the slug must contain EVERY query token of
   at least `_query_token_min_len` characters) — the tubes' fuzzy search often
   returns unrelated results.
5. Yield RawScene with `external_id=f"{sitetag}:{scene_url}"`.

Subclass overrides:
- `sitetag: str` — e.g. "pornhubcom"
- `_search_url_template: str` — with `{query}` and `{page}` placeholders
- `_scene_url_re: re.Pattern[str]` — regex with named group `url` (scene URL)
- `_slug_from_match(match, scene_url) -> str` — optional override; the slug is what
  the query tokens are tested against (default: `slug` group, else last URL path segment)
- `_title_from_slug(slug) -> str` — optional override (default: slug with `-`/`_` turned into spaces)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
|
||||||
|
# Image source extraction: matches src, data-src, data-original, data-lazy-src
# and data-lazy (lazy-load library variants). An image file extension
# (jpg/jpeg/png/webp/gif) is required to limit false positives such as sprite
# icons and spinners. Only absolute (`http(s)://`) or protocol-relative (`//`)
# URLs are captured.
_IMG_SRC_RE = re.compile(
    r'<img[^>]+(?:src|data-src|data-original|data-lazy-src|data-lazy)=["\']'
    r'((?://|https?://)[^"\']+\.(?:jpg|jpeg|png|webp|gif)[^"\']*)',
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSearchScraper(BaseDirectTubeScraper):
    """Search-page scraper: subclasses supply a URL template and a scene regex.

    The fetch/filter/yield flow in `search()` is shared. Per the original
    notes, the default request settings are enough for most tubes; tubes that
    need something special override the relevant hook or the whole fetch.
    """

    #: Search page URL format with `{query}` (output of `_format_query_for_url`)
    #: and `{page}` (int) placeholders.
    _search_url_template: str = ""

    #: Regex matching scene URLs in the search HTML. Named group `url` (or
    #: group 1) is the scene URL; the optional group `slug` feeds title derivation.
    _scene_url_re: re.Pattern[str] = re.compile(r"$^")  # placeholder — subclasses override

    #: Minimum length of a query token used for result filtering (shorter
    #: tokens are ignored so they do not match unrelated slugs).
    _query_token_min_len: int = 3

    #: Search HTTP timeout in seconds.
    _timeout: float = 30.0

    #: Slugs to reject: navigation/footer links that match the scene regex but
    #: are not scenes. Useful for WordPress-like tubes where the scene URL
    #: pattern (`<host>/<slug>/`) collides with `/categories/`, `/actors/`, etc.
    _nav_slug_blacklist: frozenset[str] = frozenset({
        "actors", "actor", "actress", "categories", "category", "tags", "tag",
        "feed", "dmca", "contact-us", "contact", "comments", "wp-content",
        "wp-admin", "wp-includes", "wp-login.php", "page", "?filter", "?s",
        "about", "about-us", "privacy", "privacy-policy", "tos", "terms",
        "2257", "18-u-s-c-2257", "sitemap", "sitemap.xml",
    })

    #: Number of characters on each side of a scene URL match that are searched
    #: for an `<img>` to use as the thumbnail.
    _thumbnail_window: int = 800

    def _scene_url_from_match(self, m: re.Match[str]) -> str:
        """Return the scene URL: named group `url` if the regex has one, else group 1."""
        try:
            return m.group("url")
        except IndexError:
            # `Match.group` raises IndexError for an unknown group name.
            return m.group(1)

    def _slug_from_match(self, m: re.Match[str], scene_url: str) -> str:
        """Return the slug used for query-token filtering and title derivation.

        Prefers a non-empty named group `slug`; otherwise falls back to the
        last path segment of ``scene_url``.
        """
        if "slug" in m.groupdict():
            slug = m.group("slug")
            if slug:
                return slug
        path = urllib.parse.urlparse(scene_url).path.rstrip("/")
        return path.split("/")[-1] if path else ""

    def _title_from_slug(self, slug: str) -> str:
        """Derive a human-readable title by turning `_` and `-` into spaces."""
        return slug.replace("_", " ").replace("-", " ").strip()

    def _format_query_for_url(self, query: str) -> str:
        """Format the query for the URL template (default: `quote_plus`, spaces → `+`).

        Subclasses override this when a tube needs another format, e.g. a
        slug with spaces turned into `-`.
        """
        return urllib.parse.quote_plus(query.strip())

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Optional hook: fetch the scene detail page and extract extra metadata.

        Called once PER SCENE from `search()`, so an override adds one HTTP
        request per match. Overrides may raise freely; `search()` catches the
        exception and continues without metadata.

        Returns:
            ``(studio, performers, tags)``, or None to skip (the default).
        """
        return None

    @staticmethod
    def _absolute_url(candidate: str, page_url: str) -> str:
        """Make a protocol-relative or root-relative URL absolute.

        `//host/x` gets an `https:` scheme; `/x` gets the scheme and host of
        ``page_url``; anything else is returned unchanged.
        """
        if candidate.startswith("//"):
            return "https:" + candidate
        if candidate.startswith("/"):
            origin = urllib.parse.urlparse(page_url)
            return f"{origin.scheme}://{origin.netloc}{candidate}"
        return candidate

    def _nearest_thumbnail(self, page_html: str, scene_match: re.Match[str], page_url: str) -> str | None:
        """Return the image URL closest to the scene link, or None.

        Only `<img>` tags within `_thumbnail_window` characters of the match
        are considered. The closest one is chosen rather than the first one in
        the window: the window starts well before the link, so the first image
        usually belongs to the previous result card.
        """
        lo = max(0, scene_match.start() - self._thumbnail_window)
        hi = min(len(page_html), scene_match.end() + self._thumbnail_window)
        best_src: str | None = None
        best_gap: int | None = None
        for img in _IMG_SRC_RE.finditer(page_html, lo, hi):
            if img.start() >= scene_match.end():
                gap = img.start() - scene_match.end()
            elif img.end() <= scene_match.start():
                gap = scene_match.start() - img.end()
            else:
                gap = 0  # overlapping spans
            if best_gap is None or gap < best_gap:
                best_src, best_gap = img.group(1), gap
        if best_src is None:
            return None
        return self._absolute_url(best_src.strip(), page_url)

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield one RawScene per scene link on the search-result page.

        Args:
            query: Search text (usually a performer name). It is also attached
                to every yielded scene as a performer.
            page: Search page number substituted into the URL template.
            limit: Maximum number of scenes to yield; None means no limit and
                a value <= 0 yields nothing.

        Raises:
            NotImplementedError: when the subclass did not set
                `_search_url_template` (raised on first iteration, as this is
                a generator).

        A failed fetch or a non-200 response ends the search without results.
        """
        if not self._search_url_template:
            raise NotImplementedError(f"{type(self).__name__}._search_url_template not set")
        if limit is not None and limit <= 0:
            return

        q = self._format_query_for_url(query)
        url = self._search_url_template.format(query=q, page=page)

        try:
            r = browser_get(url, timeout=self._timeout)
        except Exception as e:
            log.warning("%s search fetch failed: %s", self.sitetag, e)
            return
        if r.status_code != 200:
            log.debug("%s search %s status=%d", self.sitetag, url, r.status_code)
            return

        page_html = r.text
        query_tokens = {
            tok for tok in query.lower().split() if len(tok) >= self._query_token_min_len
        }

        seen: set[str] = set()
        yielded = 0
        for m in self._scene_url_re.finditer(page_html):
            scene_url = self._absolute_url(self._scene_url_from_match(m).strip(), url)
            if scene_url in seen:
                continue
            seen.add(scene_url)

            slug = self._slug_from_match(m, scene_url)
            slug_lower = slug.lower()
            if slug_lower in self._nav_slug_blacklist:
                continue
            # Strict filter: EVERY query token must appear in the slug. With
            # `any()` a query like "ava koxxx" accepted every "ava-*" slug and
            # mislabelled other performers' scenes with the queried name.
            if query_tokens and not all(tok in slug_lower for tok in query_tokens):
                continue

            title = self._title_from_slug(slug)
            thumb_url = self._nearest_thumbnail(page_html, m, url)

            # Optional metadata fetch (studio / extra performers / tags). The
            # default hook returns None, so most tubes skip this step.
            studio: RawStudio | None = None
            extra_performers: list[RawPerformer] = []
            tags: list[RawTag] = []
            try:
                meta = self._fetch_scene_metadata(scene_url)
            except Exception as e:
                log.debug("%s metadata fetch failed for %s: %s", self.sitetag, scene_url, e)
                meta = None
            if meta is not None:
                studio, extra_performers, tags = meta

            # The queried performer is always present (it drives the scrape);
            # performers from the detail page are appended after it. Per the
            # original note, de-duplication happens later in the resolver.
            all_performers = [RawPerformer(name=query.strip()), *extra_performers]

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb_url,
                    )
                ],
                performers=all_performers,
                studio=studio,
                tags=tags,
                raw={
                    "source": f"direct_scraper:{self.sitetag}",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                    "search_url": url,
                    "thumbnail_url": thumb_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
27
app/connectors/direct_scrapers/base.py
Normal file
27
app/connectors/direct_scrapers/base.py
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
"""BaseDirectTubeScraper — kontrakt dla bezpośrednich scraperów tube'ów."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import abc
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawScene
|
||||||
|
|
||||||
|
|
||||||
|
class BaseDirectTubeScraper(abc.ABC):
    """Contract implemented by every direct tube scraper.

    Per the original notes, all scrapers feed `Source(name='pornapp')` so they
    share the resolver logic and the idempotent merge keyed by external_id.
    """

    # Stable identifier of the tube, used in external ids of the form
    # f"{sitetag}:{url}". Matches the porn-app sitetag (hqpornercom,
    # sxylandcom, ...).
    sitetag: str

    @abc.abstractmethod
    def search(self, query: str, *, page: int = 1, limit: int | None = None) -> Iterator[RawScene]:
        """Search the tube for ``query`` (usually a performer name), yielding one RawScene per result."""
        raise NotImplementedError
|
||||||
18
app/connectors/direct_scrapers/eporner.py
Normal file
18
app/connectors/direct_scrapers/eporner.py
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
"""eporner.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.eporner.com/search/<q>/<page>/` (1-indexed pages).
|
||||||
|
Scene URL: `https://www.eporner.com/hd-porn/<id>/<slug>/` lub `/video-<id>/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class EpornerScraper(BaseSearchScraper):
    """Search scraper for eporner.com; configures only the site tag, URL template and scene regex."""

    sitetag = "epornercom"
    # Path-style search URL: /search/<query>/<page>/.
    _search_url_template = "https://www.eporner.com/search/{query}/{page}/"
    # Captures the relative scene path as `url` and its last path segment as
    # `slug`; one optional alphanumeric segment (presumably the video id) may
    # sit between the `/hd-porn/` or `/video-<id>/` prefix and the slug.
    _scene_url_re = re.compile(
        r'href="(?P<url>/(?:hd-porn|video-[a-z0-9]+)/(?:[a-zA-Z0-9]+/)?(?P<slug>[a-zA-Z0-9_\-]+))/?"',
    )
|
||||||
22
app/connectors/direct_scrapers/fpoxxx.py
Normal file
22
app/connectors/direct_scrapers/fpoxxx.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
"""fpoxxx — direct HTML scrape search results.
|
||||||
|
|
||||||
|
UWAGA: dokładna domena fpoxxx (sitetag w bazie) niekoniecznie zawiera "com" ani
|
||||||
|
"net" — porn-app DEFAULT_SITETAGS używa "fpoxxx" jako sitetag. Best-guess: fpo.xxx.
|
||||||
|
|
||||||
|
Search: `https://fpo.xxx/page/<n>/?s=<q>` (WordPress).
|
||||||
|
Scene URL: `https://fpo.xxx/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class FpoxxxScraper(BaseSearchScraper):
    """Search scraper for fpo.xxx; configures only the site tag, URL template and scene regex."""

    sitetag = "fpoxxx"
    # WordPress-style search URL: /page/<n>/?s=<query>.
    _search_url_template = "https://fpo.xxx/page/{page}/?s={query}"
    # Captures the absolute scene URL (without the trailing slash) as `url` and
    # the single path segment as `slug`. Navigation links share this shape and
    # are expected to be filtered out by the base class' slug blacklist.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://fpo\.xxx/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        re.IGNORECASE,
    )
|
||||||
177
app/connectors/direct_scrapers/freshporno.py
Normal file
177
app/connectors/direct_scrapers/freshporno.py
Normal file
|
|
@ -0,0 +1,177 @@
|
||||||
|
"""freshporno.org — latest-vids browse scraper.
|
||||||
|
|
||||||
|
Pilot #2 (po shyfap fail). Hipoteza: freshporno zachowuje oryginalne studio titles
|
||||||
|
("Straighten Her Out" zamiast custom rebranding jak shyfap) → title fuzzy match
|
||||||
|
do canonical zadziała. Bonus: channel = studio 1:1 (Pure Taboo, Brazzers, etc.).
|
||||||
|
|
||||||
|
URL patterns:
|
||||||
|
- Listing: `/` (page 1), `/2/`, `/3/`, ... (last `/391/` w czasie pisania)
|
||||||
|
- Scene: `/videos/<slug>/`
|
||||||
|
- Channels: `/channels/<slug>/` (= studio)
|
||||||
|
- Models: `/models/<slug>/` (= performer)
|
||||||
|
- Tags: `/tags/<slug>/` (= category)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime, timedelta
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Site origin used to build listing URLs and the thumbnail Referer header.
_BASE = "https://freshporno.org"
# Absolute scene links (`/videos/<slug>/`); group 1 is the full URL.
_SCENE_URL_RE = re.compile(r'href="(https://freshporno\.org/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Channel (= studio) links; group 1 is the slug, group 2 the anchor text.
_CHANNEL_LINK_RE = re.compile(
    r'href="https://freshporno\.org/channels/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model (= performer) links; group 1 is the slug, group 2 the anchor text.
_MODEL_LINK_RE = re.compile(
    r'href="https://freshporno\.org/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag links; group 1 is the slug, group 2 the anchor text.
_TAG_LINK_RE = re.compile(
    r'href="https://freshporno\.org/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
|
||||||
|
# Duration from <time datetime="PT46M01S"> (ISO 8601 duration, time part only).
# Groups 1-3 are the optional hour, minute and second components.
_TIME_DURATION_RE = re.compile(r'<time[^>]+datetime="PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?"', re.IGNORECASE)


def _parse_iso_duration_to_sec(html: str) -> int | None:
    """Return the duration in seconds from the first usable ``<time datetime="PT...">`` tag.

    A tag whose duration has no hour/minute/second component at all
    (``datetime="PT"``) is ignored instead of being reported as 0 seconds.

    Returns:
        Total seconds, or None when the HTML has no usable duration tag.
    """
    for m in _TIME_DURATION_RE.finditer(html):
        parts = m.groups()
        if not any(parts):
            # Every component is optional in the pattern, so a bare "PT" still
            # matches; it carries no information.
            continue
        hours, minutes, seconds = (int(part or 0) for part in parts)
        return hours * 3600 + minutes * 60 + seconds
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class FreshpornoScraper(BaseBrowseScraper):
    """Browse-mode scraper for freshporno.org (listing pages `/`, `/2/`, `/3/`, ...)."""

    sitetag = "freshpornoorg"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL: the site root for page 1, `/<page>/` afterwards."""
        suffix = "" if page <= 1 else f"{page}/"
        return f"{_BASE}/{suffix}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs from the listing in page order, without duplicates."""
        # dict.fromkeys keeps the first occurrence of each URL and preserves order.
        return list(dict.fromkeys(hit.group(1) for hit in _SCENE_URL_RE.finditer(listing_html)))

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a RawScene from a scene detail page; None when no title can be found."""
        # Title: og:title first, <title> as a fallback.
        title = meta_content(detail_html, property="og:title")
        if not title:
            title_tag = re.search(r"<title>([^<]+)</title>", detail_html, re.IGNORECASE)
            title = title_tag.group(1).strip() if title_tag else None
        if not title:
            return None

        description = meta_content(detail_html, property="og:description")
        if not description:
            description = meta_content(detail_html, name="description")

        # Duration: <meta property="video:duration"> in seconds when it is a
        # plain number, otherwise the ISO 8601 <time datetime="PT..."> value.
        raw_duration = meta_content(detail_html, property="video:duration")
        duration_sec: int | None
        if raw_duration and raw_duration.isdigit():
            duration_sec = int(raw_duration)
        else:
            duration_sec = _parse_iso_duration_to_sec(detail_html)

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Channel = studio: the first channel link whose anchor text is neither
        # empty nor the generic "Channels" navigation label.
        studio: RawStudio | None = next(
            (
                RawStudio(
                    external_id=f"freshpornoorg:channel:{hit.group(1)}",
                    name=hit.group(2).strip(),
                    slug=hit.group(1),
                )
                for hit in _CHANNEL_LINK_RE.finditer(detail_html)
                if hit.group(2).strip().lower() not in ("channels", "")
            ),
            None,
        )

        # Performers: every `/models/<slug>/` link, first anchor text per slug wins.
        model_names: dict[str, str] = {}
        for hit in _MODEL_LINK_RE.finditer(detail_html):
            model_names.setdefault(hit.group(1), hit.group(2).strip())
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"freshpornoorg:model:{model_slug}", name=model_name)
            for model_slug, model_name in model_names.items()
        ]

        # Tags: slugs longer than 60 characters are skipped. Per the original
        # note, the site sometimes emits composite `/tags/a-b-c-...` URLs that
        # combine many tags and are not a single tag.
        tag_names: dict[str, str] = {}
        for hit in _TAG_LINK_RE.finditer(detail_html):
            tag_slug = hit.group(1)
            if len(tag_slug) <= 60:
                tag_names.setdefault(tag_slug, hit.group(2).strip())
        tags: list[RawTag] = [
            RawTag(external_id=f"freshpornoorg:tag:{tag_slug}", name=tag_name, slug=tag_slug)
            for tag_slug, tag_name in tag_names.items()
        ]

        # Perceptual hash of the thumbnail (one extra HTTP download); omitted
        # when there is no thumbnail or hashing fails.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            phash = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if phash:
                fingerprints.append(RawFingerprint(kind="phash", value=phash))

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=duration_sec,
            thumbnail_url=thumbnail_url,
        )

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[playback],
        )
|
||||||
129
app/connectors/direct_scrapers/fullmovies.py
Normal file
129
app/connectors/direct_scrapers/fullmovies.py
Normal file
|
|
@ -0,0 +1,129 @@
|
||||||
|
"""fullmovies.xxx — latest-vids browse scraper.
|
||||||
|
|
||||||
|
Identyczny engine co hdporn.gg (KVS sponsor_groups stack): `/videos/<slug>/`,
|
||||||
|
`/networks/<slug>/`, `/models/<slug>/`, `/tags/<slug>/`. og:image to `img.fullmovies.xxx/...`
|
||||||
|
— **prawdopodobnie auto-screenshot** (jak hdporn.gg → 8% match). Probe potwierdzi.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Site origin: prefix for listing URLs and the Referer sent when fetching thumbnails.
_BASE = "https://www.fullmovies.xxx"
# Absolute scene links `/videos/<slug>/`; group 1 is the full URL including the trailing slash.
_SCENE_URL_RE = re.compile(r'href="(https://www\.fullmovies\.xxx/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Network links: group 1 = slug, group 2 = anchor text up to the next `<`.
_NETWORK_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model links: group 1 = slug, group 2 = anchor text up to the next `<`.
_MODEL_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag links: group 1 = slug, group 2 = anchor text up to the next `<`.
_TAG_LINK_RE = re.compile(
    r'href="https://www\.fullmovies\.xxx/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
|
||||||
|
|
||||||
|
|
||||||
|
class FullmoviesScraper(BaseBrowseScraper):
    """Browse scraper for the fullmovies.xxx "latest updates" feed.

    Supplies the three hooks used by the browse base class: the listing URL for a
    page, the scene URLs found on a listing page, and the parsed ``RawScene`` for a
    scene detail page.
    """

    sitetag = "fullmoviesxxx"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; pages <= 1 map to the unpaginated feed."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs linked from ``listing_html``, unique, in first-seen order."""
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    @staticmethod
    def _unique_named_links(
        pattern: re.Pattern[str],
        html: str,
        skip_names: tuple[str, ...] = (),
    ) -> list[tuple[str, str]]:
        """Return unique ``(slug, name)`` pairs matched by ``pattern``, in document order.

        ``pattern`` must expose the slug as group 1 and the anchor text as group 2.
        Anchors whose stripped text is empty, or whose lowercased text is listed in
        ``skip_names`` (navigation labels), are ignored *without* marking the slug as
        seen, so a later properly named anchor for the same slug is still picked up.
        """
        found: dict[str, str] = {}
        for m in pattern.finditer(html):
            slug, name = m.group(1), m.group(2).strip()
            if not name or name.lower() in skip_names or slug in found:
                continue
            found[slug] = name
        return list(found.items())

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Returns ``None`` when the page has no ``og:title``. Studio, performers and
        tags come from the network/model/tag links in the page; the thumbnail phash
        is added only when ``compute_thumbnail_phash`` returns a value.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # Strip the site's boilerplate around the actual scene title.
        title = re.sub(
            r":\s*Free HD Porn\s*$|^Watch\s+|\s+Full XXX\s*$", "", title, flags=re.IGNORECASE
        ).strip()

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)

        # The first network link that is not the "Networks" nav label is taken as the studio.
        studio: RawStudio | None = None
        networks = self._unique_named_links(
            _NETWORK_LINK_RE, detail_html, skip_names=("networks",)
        )
        if networks:
            slug, name = networks[0]
            studio = RawStudio(
                external_id=f"{self.sitetag}:network:{slug}",
                name=name,
                slug=slug,
            )

        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"{self.sitetag}:model:{slug}", name=name)
            for slug, name in self._unique_named_links(
                _MODEL_LINK_RE, detail_html, skip_names=("pornstars", "models")
            )
        ]

        tags: list[RawTag] = [
            RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            for slug, name in self._unique_named_links(_TAG_LINK_RE, detail_html)
        ]

        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
87
app/connectors/direct_scrapers/hdporn92.py
Normal file
87
app/connectors/direct_scrapers/hdporn92.py
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
"""HDPorn92Scraper — direct HTML scrape hdporn92.com search.
|
||||||
|
|
||||||
|
Search: `https://hdporn92.com/page/<n>/?s=<query>`. Scene URL format:
|
||||||
|
`https://hdporn92.com/<slug>/` (jeden segment ścieżki). Trzeba odsiać
|
||||||
|
nawigację (`/categories/`, `/actors/`, `/feed/`, `/dmca/`, `/contact-us/`,
|
||||||
|
external links badoinkvr/etc.).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Absolute single-segment links on hdporn92.com: group 1 = URL without the trailing
# slash, group 2 = the slug. Case-sensitive (no IGNORECASE flag).
_SCENE_URL_RE = re.compile(r'href="(https://hdporn92\.com/([a-z0-9][a-z0-9-]+))/?"')

# First path segments that are site navigation rather than scenes.
# NOTE(review): entries containing `?` or `.` ("?filter", "?s", "wp-login.php") cannot
# match the slug character class in _SCENE_URL_RE, so they are effectively inert.
_NAV_SLUGS = {
    "actors", "categories", "tags", "feed", "dmca", "contact-us",
    "comments", "wp-content", "wp-admin", "wp-includes", "wp-login.php",
    "page", "?filter", "?s",
}
|
||||||
|
|
||||||
|
|
||||||
|
class HDPorn92Scraper(BaseDirectTubeScraper):
    """Search-mode scraper for hdporn92.com (`/page/<n>/?s=<query>` result pages)."""

    sitetag = "hdporn92com"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one search-result page for ``query``.

        Args:
            query: Free-text search; also attached to every scene as a performer hint.
            page: Result page number placed into the search URL.
            limit: Maximum number of scenes to yield; ``None`` means no cap. A
                non-positive limit yields nothing and performs no request.

        Yields:
            ``RawScene`` objects whose title is derived from the URL slug.

        Fetch errors and non-200 responses are logged and end the iteration
        instead of raising.
        """
        # Nothing requested: return before any network I/O. Previously the limit was
        # only checked after a yield, so limit=0 still fetched and yielded one scene.
        if limit is not None and limit <= 0:
            return

        q = urllib.parse.quote_plus(query.strip())
        url = f"https://hdporn92.com/page/{page}/?s={q}"
        try:
            r = browser_get(url, timeout=60)
        except Exception as e:  # best-effort scraper: a failed fetch means "no results"
            log.warning("hdporn92 search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.debug("hdporn92 search %s status=%d", url, r.status_code)
            return

        # Relevance filter: the slug must contain at least one query word of 3+ chars.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen: set[str] = set()
        yielded = 0
        for m in _SCENE_URL_RE.finditer(r.text):
            scene_url = m.group(1) + "/"
            slug = m.group(2)
            if slug in _NAV_SLUGS or scene_url in seen:
                continue
            seen.add(scene_url)

            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            yield RawScene(
                external_id=f"hdporn92com:{scene_url}",
                title=slug.replace("-", " ").strip(),
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(origin="tube:hdporn92com", page_url=scene_url)
                ],
                # The search is run per performer name, so the query is the performer hint.
                performers=[RawPerformer(name=query.strip())],
                raw={
                    "source": "direct_scraper:hdporn92",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
142
app/connectors/direct_scrapers/hdporngg.py
Normal file
142
app/connectors/direct_scrapers/hdporngg.py
Normal file
|
|
@ -0,0 +1,142 @@
|
||||||
|
"""hdporn.gg — latest-vids browse scraper.
|
||||||
|
|
||||||
|
Engine podobny do freshporno: `/videos/<slug>/` URL, `/networks/<slug>/` = studio,
|
||||||
|
`/models/<slug>/` = performer, `/tags/<slug>/` = tag.
|
||||||
|
|
||||||
|
Quirk: og:image to internal CDN `img.hdporn.gg/...` — przed merging do prod
|
||||||
|
sprawdzamy phash distance (gate-keeper: jeśli Hamming >5 dla >70% scen → orphan
|
||||||
|
factory, wyłącz; analogia do shyfap).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Site origin: prefix for listing URLs and the Referer sent when fetching thumbnails.
_BASE = "https://www.hdporn.gg"
# Absolute scene links `/videos/<slug>/`; group 1 is the full URL including the trailing slash.
_SCENE_URL_RE = re.compile(r'href="(https://www\.hdporn\.gg/videos/[a-z0-9\-]+/)"', re.IGNORECASE)
# Network links: group 1 = slug, group 2 = anchor text up to the next `<`.
_NETWORK_LINK_RE = re.compile(
    r'href="https://www\.hdporn\.gg/networks/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Model links: group 1 = slug, group 2 = anchor text up to the next `<`.
_MODEL_LINK_RE = re.compile(
    r'href="https://www\.hdporn\.gg/models/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# Tag links: group 1 = slug, group 2 = anchor text up to the next `<`.
_TAG_LINK_RE = re.compile(
    r'href="https://www\.hdporn\.gg/tags/([a-z0-9\-]+)/"[^>]*>([^<]+)', re.IGNORECASE
)
|
||||||
|
|
||||||
|
|
||||||
|
class HDPornGGScraper(BaseBrowseScraper):
    """Browse scraper for the hdporn.gg "latest updates" feed.

    Supplies the three hooks used by the browse base class: the listing URL for a
    page, the scene URLs found on a listing page, and the parsed ``RawScene`` for a
    scene detail page.
    """

    sitetag = "hdporngg"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; pages <= 1 map to the unpaginated feed."""
        if page <= 1:
            return f"{_BASE}/latest-updates/"
        return f"{_BASE}/latest-updates/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs linked from ``listing_html``, unique, in first-seen order."""
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)))

    @staticmethod
    def _unique_named_links(
        pattern: re.Pattern[str],
        html: str,
        skip_names: tuple[str, ...] = (),
    ) -> list[tuple[str, str]]:
        """Return unique ``(slug, name)`` pairs matched by ``pattern``, in document order.

        ``pattern`` must expose the slug as group 1 and the anchor text as group 2.
        Anchors whose stripped text is empty, or whose lowercased text is listed in
        ``skip_names`` (navigation labels), are ignored *without* marking the slug as
        seen, so a later properly named anchor for the same slug is still picked up.
        """
        found: dict[str, str] = {}
        for m in pattern.finditer(html):
            slug, name = m.group(1), m.group(2).strip()
            if not name or name.lower() in skip_names or slug in found:
                continue
            found[slug] = name
        return list(found.items())

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a scene detail page.

        Returns ``None`` when the page has no ``og:title``. Studio, performers and
        tags come from the network/model/tag links in the page; the thumbnail phash
        is added only when ``compute_thumbnail_phash`` returns a value.
        """
        title = meta_content(detail_html, property="og:title")
        if not title:
            return None
        # og:title typically carries a ": Free HD Porn" suffix — strip it.
        title = re.sub(r":\s*Free HD Porn\s*$", "", title, flags=re.IGNORECASE).strip()
        # A "Brazzers - " style studio prefix is often present too; it is kept on
        # purpose, because a studio name in the title is a strong fuzzy-match signal.

        description = meta_content(detail_html, property="og:description")
        thumbnail_url = meta_content(detail_html, property="og:image")

        duration_sec: int | None = None
        dur_meta = meta_content(detail_html, property="video:duration")
        if dur_meta and dur_meta.isdigit():
            duration_sec = int(dur_meta)

        # Studio comes from /networks/ links; the "Networks" nav anchor is skipped and
        # the first remaining link is taken as the scene's studio.
        # NOTE(review): the original author notes the nav sidebar also lists networks
        # and that the scene's own link carries class="btn_sponsor_group" — confirm
        # that the first match is always the scene's link rather than a sidebar entry.
        studio: RawStudio | None = None
        networks = self._unique_named_links(
            _NETWORK_LINK_RE, detail_html, skip_names=("networks",)
        )
        if networks:
            slug, name = networks[0]
            studio = RawStudio(
                external_id=f"{self.sitetag}:network:{slug}",
                name=name,
                slug=slug,
            )

        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"{self.sitetag}:model:{slug}", name=name)
            for slug, name in self._unique_named_links(
                _MODEL_LINK_RE, detail_html, skip_names=("pornstars", "models")
            )
        ]

        tags: list[RawTag] = [
            RawTag(external_id=f"{self.sitetag}:tag:{slug}", name=name, slug=slug)
            for slug, name in self._unique_named_links(_TAG_LINK_RE, detail_html)
        ]

        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
94
app/connectors/direct_scrapers/hqporner.py
Normal file
94
app/connectors/direct_scrapers/hqporner.py
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
"""HQPornerScraper — direct HTML scrape hqporner search page.
|
||||||
|
|
||||||
|
Search URL: `https://hqporner.com/?q=<query>&p=<page>`. Static HTML zwraca ~50
|
||||||
|
linków `/hdporn/<id>-<slug>.html` per strona. Tytuł deducimy ze slug'a (porn-app
|
||||||
|
data API zwraca dokładniejszy ale wymaga round-trip — dla MVP slug-derived OK,
|
||||||
|
resolver i tak je sciagnie z TPDB merge).
|
||||||
|
|
||||||
|
Search fuzzy: hqporner zwraca "Lola Noir" gdy szukamy "Noir" itp. Dlatego
|
||||||
|
filtrujemy wyniki po tym czy slug zawiera query (lub jego token) — analogicznie
|
||||||
|
jak `fetch_scenes_for_search` w pornapp connectorze.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)


# Scene links of the form `/hdporn/<id>-<slug>.html`: group 1 = numeric id,
# group 2 = slug (any run of characters other than `"` and `.`).
_SCENE_HREF_RE = re.compile(r'/hdporn/(\d+)-([^"\.]+)\.html')
|
||||||
|
|
||||||
|
|
||||||
|
class HQPornerScraper(BaseDirectTubeScraper):
    """Search-mode scraper for hqporner.com (`/?q=<query>&p=<page>` result pages)."""

    sitetag = "hqpornercom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one search-result page for ``query``.

        Args:
            query: Free-text search; also attached to every scene as a performer hint.
            page: Result page number placed into the search URL.
            limit: Maximum number of scenes to yield; ``None`` means no cap. A
                non-positive limit yields nothing and performs no request.

        Yields:
            ``RawScene`` objects whose title is derived from the URL slug.

        Fetch errors and non-200 responses are logged and end the iteration
        instead of raising.
        """
        # Nothing requested: return before any network I/O. Previously the limit was
        # only checked after a yield, so limit=0 still fetched and yielded one scene.
        if limit is not None and limit <= 0:
            return

        q = urllib.parse.quote_plus(query.strip())
        url = f"https://hqporner.com/?q={q}&p={page}"
        try:
            r = browser_get(url, timeout=30)
        except Exception as e:  # best-effort scraper: a failed fetch means "no results"
            log.warning("hqporner search fetch failed: %s", e)
            return
        if r.status_code != 200:
            log.debug("hqporner search %s status=%d", url, r.status_code)
            return

        # Filter: the slug must contain at least one query word (case-insensitive,
        # 3+ chars). Drops unrelated results when the site's fuzzy search is noisy.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen_urls: set[str] = set()
        yielded = 0
        for m in _SCENE_HREF_RE.finditer(r.text):
            scene_id, slug_part = m.group(1), m.group(2)
            scene_url = f"https://hqporner.com/hdporn/{scene_id}-{slug_part}.html"
            if scene_url in seen_urls:
                continue
            seen_urls.add(scene_url)

            # Title-token filter.
            slug_lower = slug_part.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            yield RawScene(
                external_id=f"hqpornercom:{scene_url}",
                title=slug_part.replace("_", " ").replace("-", " ").strip(),
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin="tube:hqpornercom",
                        page_url=scene_url,
                    )
                ],
                # Force the performer hint to the query: a per-performer search means
                # the scene is almost certainly about them. The resolver is expected
                # to create the ScenePerformer link.
                performers=[RawPerformer(name=query.strip())],
                raw={
                    "source": "direct_scraper:hqporner",
                    "query": query,
                    "page": page,
                    "scene_id": scene_id,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
19
app/connectors/direct_scrapers/latestleaks.py
Normal file
19
app/connectors/direct_scrapers/latestleaks.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""latestleaks.co — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://latestleaks.co/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://latestleaks.co/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class LatestLeaksScraper(BaseSearchScraper):
    """Search-scraper configuration for latestleaks.co.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "latestleaksco"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://latestleaks.co/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://latestleaks\.co/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
19
app/connectors/direct_scrapers/latestpornvideo.py
Normal file
19
app/connectors/direct_scrapers/latestpornvideo.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""latestpornvideo.com — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://latestpornvideo.com/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://latestpornvideo.com/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class LatestPornVideoScraper(BaseSearchScraper):
    """Search-scraper configuration for latestpornvideo.com.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "latestpornvideocom"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://latestpornvideo.com/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://latestpornvideo\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
19
app/connectors/direct_scrapers/mypornerleak.py
Normal file
19
app/connectors/direct_scrapers/mypornerleak.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""mypornerleak.com — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://mypornerleak.com/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://mypornerleak.com/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class MyPornerLeakScraper(BaseSearchScraper):
    """Search-scraper configuration for mypornerleak.com.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "mypornerleakcom"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://mypornerleak.com/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://mypornerleak\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
21
app/connectors/direct_scrapers/perverzija.py
Normal file
21
app/connectors/direct_scrapers/perverzija.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
"""perverzija.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.perverzija.com/page/<n>/?s=<q>` (WordPress + Cloudflare).
|
||||||
|
Scene URL: `https://www.perverzija.com/<slug>/`.
|
||||||
|
|
||||||
|
CF-protected: `browser_get` (curl_cffi) bypassuje JA3 fingerprint blocks.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class PerverzijaScraper(BaseSearchScraper):
    """Search-scraper configuration for www.perverzija.com.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "perverzijacom"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://www.perverzija.com/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.perverzija\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
215
app/connectors/direct_scrapers/porn00.py
Normal file
215
app/connectors/direct_scrapers/porn00.py
Normal file
|
|
@ -0,0 +1,215 @@
|
||||||
|
"""porn00.org — latest-vids browse scraper.
|
||||||
|
|
||||||
|
URL patterns:
|
||||||
|
- Listing: `/latest-vids/` (page 1), `/latest-vids/2/`, ...
|
||||||
|
- Scene: `/video/<slug>/`
|
||||||
|
- Performer: `/<slug>/` (np. `/august-skye/`) — w sekcji "Pornstars:" na detail
|
||||||
|
- Categories: `/category-name/<slug>/`
|
||||||
|
|
||||||
|
Sygnały dostępne:
|
||||||
|
- Title (listing card + h1 + og:title)
|
||||||
|
- Performer(s) (z sekcji "Pornstars:" na detail page — pojedynczy slug per link)
|
||||||
|
- Categories (z sekcji "Categories:" — `/category-name/<slug>/`)
|
||||||
|
- Duration (listing card `<div class="duration">MM:SS</div>`)
|
||||||
|
- Direct mp4 (KVS engine — `video_url: 'https://www.porn00.org/get_file/.../<id>.mp4'`)
|
||||||
|
- Thumbnail (own CDN `/contents/videos_screenshots/.../1.jpg`)
|
||||||
|
|
||||||
|
BRAK:
|
||||||
|
- Studio
|
||||||
|
- Release year / data
|
||||||
|
- Description
|
||||||
|
|
||||||
|
Tytuł format: `"PerformerName - Scene Title"` (eg "August Skye - Helping Him...").
|
||||||
|
Performer name w prefixie tytułu zwykle pokrywa się z first `/pornstars/` link.
|
||||||
|
|
||||||
|
Expected pilot wynik: niski canonical match rate (~5-10%) bo brak studio/year. Direct
|
||||||
|
mp4 to bonus playback source dla scen które matchują canonical z innych źródeł.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

# Site origin: prefix for listing URLs and the Referer sent when fetching thumbnails.
_BASE = "https://www.porn00.org"

# Listing card pattern (from a Chrome DevTools snapshot, 2026-05-17):
#   <div class="item">
#     <a href="https://www.porn00.org/video/<slug>/" title="...">
#       <img class="thumb lazy-load" src="...contents/videos_screenshots/<bucket>/<id>/320x180/1.jpg" data-cnt="5">
#     </a>
#     <strong class="title">Title</strong>
#     <div class="duration">34:34</div>
#   </div>
# Named groups: url, title, thumb, dur.
# NOTE(review): with DOTALL and lazy `.*?`, a card that lacks one of the later
# elements could let the match run on into the following card — confirm against real pages.
_LISTING_CARD_RE = re.compile(
    r'<div class="item\s*">'
    r'.*?<a href="(?P<url>https://www\.porn00\.org/video/[^"]+/)"\s+title="(?P<title>[^"]+)"'
    r'.*?<img class="thumb[^"]*"\s+src="(?P<thumb>[^"]+)"'
    r'.*?<div class="duration">(?P<dur>[^<]+)</div>',
    re.IGNORECASE | re.DOTALL,
)

# Performer link pattern (porn00 convention): `/star-name/<slug>/`
# (analogous to `/category-name/`, `/tags-name/`). Group 1 = slug, group 2 = link text.
_PERFORMER_LINK_RE = re.compile(
    r'<a\s+href="https://www\.porn00\.org/star-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)

# Categories: <a href="https://www.porn00.org/category-name/<slug>/">Name</a>
# Group 1 = slug, group 2 = link text.
_CATEGORY_LINK_RE = re.compile(
    r'<a\s+href="https://www\.porn00\.org/category-name/([a-z0-9\-]+)/"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)

# Direct mp4 stream from the KVS flashvars: `video_url: 'https://.../43144.mp4/?v-acctoken=...'`.
# The URL may carry anything after `.mp4` (`/?v-acctoken=...`, `?q=720p`, etc.), so
# everything up to the nearest `'` or `"` is captured.
_VIDEO_URL_RE = re.compile(
    r"""video_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
)
# 720p variant (KVS often serves 360p by default plus 720p in `video_alt_url`).
_VIDEO_ALT_URL_RE = re.compile(
    r"""video_alt_url:\s*['"]([^'"]+\.mp4[^'"]*)['"]""", re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_mmss(s: str) -> int | None:
|
||||||
|
"""`34:34` → 2074, `1:20:37` → 4837."""
|
||||||
|
parts = s.strip().split(":")
|
||||||
|
try:
|
||||||
|
if len(parts) == 2:
|
||||||
|
return int(parts[0]) * 60 + int(parts[1])
|
||||||
|
if len(parts) == 3:
|
||||||
|
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class Porn00Scraper(BaseBrowseScraper):
    """Browse scraper for the porn00.org "latest vids" feed.

    Listing cards carry title, thumbnail and duration. These are cached per listing
    page and merged with what the detail page provides (og:title, og:image,
    performer and category links, direct mp4 URL).
    """

    sitetag = "porn00org"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata (title, thumb, duration_sec) keyed by scene URL. The
        # detail page has no duration meta (no og:duration), so the listing is the
        # source of truth for it.
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for ``page``; pages <= 1 map to the unpaginated feed."""
        page_suffix = "" if page <= 1 else f"{page}/"
        return f"{_BASE}/latest-vids/{page_suffix}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return unique scene URLs from a listing page and rebuild the card cache.

        The cache is replaced on every call, so it only ever describes the most
        recently parsed listing page.
        """
        self._listing_cache = {}
        card_cache = self._listing_cache
        for card in _LISTING_CARD_RE.finditer(listing_html):
            scene_url = card.group("url")
            if scene_url in card_cache:
                continue
            card_cache[scene_url] = {
                "title": card.group("title").strip(),
                "thumb": card.group("thumb"),
                "duration_sec": _parse_mmss(card.group("dur") or ""),
            }
        # Dict keys keep insertion order, i.e. first-seen order of the cards.
        return list(card_cache)

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a ``RawScene`` from a detail page plus the cached listing card.

        Returns ``None`` when neither og:title nor a cached listing title exists.
        """
        card = self._listing_cache.get(scene_url, {})

        # Title: og:title preferred (cleaner), falling back to the listing card.
        title = meta_content(detail_html, property="og:title") or card.get("title")
        if not title:
            return None

        duration_sec = card.get("duration_sec")
        # Thumbnail: prefer og:image from the detail page (full-size preview), falling
        # back to the 320x180 listing thumbnail.
        thumb = meta_content(detail_html, property="og:image") or card.get("thumb")

        # Performers — porn00 uses `/star-name/<slug>/` (like `/tags-name/` and
        # `/category-name/`); every link of that shape is treated as a performer.
        performer_names: dict[str, str] = {}
        for link in _PERFORMER_LINK_RE.finditer(detail_html):
            perf_slug = link.group(1).lower()
            if perf_slug not in performer_names and 2 <= len(perf_slug) <= 60:
                performer_names[perf_slug] = link.group(2).strip()
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"{self.sitetag}:performer:{perf_slug}", name=perf_name)
            for perf_slug, perf_name in performer_names.items()
        ]

        # Categories → tags.
        tag_names: dict[str, str] = {}
        for link in _CATEGORY_LINK_RE.finditer(detail_html):
            tag_slug = link.group(1).lower()
            if tag_slug not in tag_names:
                tag_names[tag_slug] = link.group(2).strip()
        tags: list[RawTag] = [
            RawTag(external_id=f"{self.sitetag}:tag:{tag_slug}", name=tag_name, slug=tag_slug)
            for tag_slug, tag_name in tag_names.items()
        ]

        # Direct mp4 from the KVS flashvars — 720p (video_alt_url) is preferred over
        # 360p (video_url).
        stream_match = _VIDEO_ALT_URL_RE.search(detail_html) or _VIDEO_URL_RE.search(detail_html)
        stream_url: str | None = stream_match.group(1) if stream_match else None

        # Phash — porn00 makes its own screenshots (`/contents/videos_screenshots/`),
        # so a canonical phash match will likely fail. It is attempted anyway.
        phash = compute_thumbnail_phash(thumb, referer=_BASE + "/") if thumb else None
        fingerprints: list[RawFingerprint] = (
            [RawFingerprint(kind="phash", value=phash)] if phash else []
        )

        playback = RawPlaybackSource(
            origin=f"tube:{self.sitetag}",
            page_url=scene_url,
            duration_sec=duration_sec,
            thumbnail_url=thumb,
            stream_url=stream_url,
        )

        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            duration_sec=duration_sec,
            url=scene_url,
            studio=None,  # porn00 exposes no studio signal
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=[playback],
        )
|
||||||
19
app/connectors/direct_scrapers/porn4days.py
Normal file
19
app/connectors/direct_scrapers/porn4days.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""porn4days.pw — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://porn4days.pw/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://porn4days.pw/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class Porn4DaysScraper(BaseSearchScraper):
    """Search-scraper configuration for porn4days.pw.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "porn4dayspw"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://porn4days.pw/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porn4days\.pw/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
19
app/connectors/direct_scrapers/porndish.py
Normal file
19
app/connectors/direct_scrapers/porndish.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""porndish.com — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://porndish.com/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://porndish.com/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class PornDishScraper(BaseSearchScraper):
    """Search-scraper configuration for porndish.com.

    Purely declarative: only class attributes are set and no methods are overridden.
    The attributes are presumably consumed by ``BaseSearchScraper`` (not visible here).
    """

    sitetag = "porndishcom"

    # Placeholders: {page} = result page number, {query} = formatted search query.
    _search_url_template = "https://porndish.com/page/{page}/?s={query}"

    # `url` captures the scene link without its trailing slash; `slug` is the single
    # path segment after the host.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://porndish\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
26
app/connectors/direct_scrapers/pornditt.py
Normal file
26
app/connectors/direct_scrapers/pornditt.py
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
"""pornditt.com — direct HTML scrape.
|
||||||
|
|
||||||
|
KVS-style site (kt_player engine). Search URL: `/search/<slug>/?from=<page>` z slug-style
|
||||||
|
zapytaniem (spacje → `-`). Sceny renderują się na subdomenie `v.pornditt.com/videos/<id>/<slug>/`,
|
||||||
|
więc regex matchuje oba (z i bez `v.` prefix).
|
||||||
|
|
||||||
|
Sitetag `porndittcom` (legacy z porn-app DEFAULT_SITETAGS — suffix-stripped name).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class PornDittScraper(BaseSearchScraper):
    """Search-mode scraper for pornditt.com (KVS-style site).

    Search pages live at `/search/<slug>/?from=<page>`; scene pages are matched
    both on the bare domain and on the `v.` subdomain.
    """

    sitetag = "porndittcom"
    _search_url_template = "https://pornditt.com/search/{query}/?from={page}"
    # `url` excludes the trailing slash; `sid` is the numeric video id.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://(?:v\.)?pornditt\.com/videos/(?P<sid>\d+)/(?P<slug>[a-z0-9\-]+))/"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Turn a free-text query into a KVS search slug.

        The query is lowercased and every run of characters outside `a-z0-9`
        becomes a single `-`, with no leading or trailing hyphen. Per the
        original author's note, a URL-encoded (`+`) query does not work here.
        """
        words = re.split(r"[^a-z0-9]+", query.lower())
        # Splitting leaves empty strings at the edges when the query starts or
        # ends with a separator; dropping them is what strips the hyphens.
        return "-".join(word for word in words if word)
|
||||||
99
app/connectors/direct_scrapers/pornhat.py
Normal file
99
app/connectors/direct_scrapers/pornhat.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
"""pornhat.com — search-mode scraper (performer-driven backfill).
|
||||||
|
|
||||||
|
KVS engine. Search URL: `/search/<query>/` z `+` jako space separator. Scene URLs
|
||||||
|
to `/video/<slug>/` (slug bez ID prefix, w przeciwieństwie do 3Movs/OK.xxx). Slug
|
||||||
|
zawiera tokens query gdy match jest relevant, więc filtruje się automatycznie.
|
||||||
|
|
||||||
|
Auto-screenshot thumbnaile (`static.pornhat.com/contents/videos_screenshots/.../1.jpg`)
|
||||||
|
— do canonical match przez phash NIE nadają się (sprawdzone w probe 2026-05-12, 8%).
|
||||||
|
Ale wartość scrapera: discovering nowych scen performera których inne tube'y/canonical
|
||||||
|
nie mają. Mostly orphan ingest, ale dla popular performers może łapać studio scenes
|
||||||
|
których nie mamy w TPDB jeszcze.
|
||||||
|
|
||||||
|
Metadata enrich: scene page ma `class="info-video js-ajax-{dvd,model,tag}"` div'y
|
||||||
|
z `data-setup='{"title": ..., "url": ..., "dir": ...}'` JSON. Parsujemy w
|
||||||
|
`_fetch_scene_metadata()` żeby insertować studio (dvd), dodatkowych performerów
|
||||||
|
(models), i tagi do każdej sceny.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawStudio, RawTag
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)


# Matches `class="info-video js-ajax-<kind>"` followed, inside the same tag, by
# `data-setup='<json>'`. The JSON sits in a single-quoted HTML attribute and
# uses double quotes for its own string values, so `[^']+` captures it whole.
# The named group `kind` (dvd | model | tag) tells the caller what was matched;
# the named group `json` holds the raw payload.
_AJAX_DATA_RE = re.compile(
    r"class=\"info-video js-ajax-(?P<kind>dvd|model|tag)[^\"]*\"[^>]*data-setup='(?P<json>[^']+)'",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
class PornHatScraper(BaseSearchScraper):
    """Search-mode scraper for pornhat.com with per-scene metadata enrichment."""

    sitetag = "pornhatcom"
    # KVS-style pagination: /search/<query>/<page>/ (page 1 also works with an
    # explicit `/1/`).
    _search_url_template = "https://www.pornhat.com/search/{query}/{page}/"
    # PornHat search HTML uses relative hrefs (`/video/<slug>/`). Per the
    # original author's note, BaseSearchScraper converts relative → absolute
    # using the netloc of the search URL.
    _scene_url_re = re.compile(
        r'href="(?P<url>(?:https://www\.pornhat\.com)?/video/(?P<slug>[a-z0-9\-]+)/)"',
        re.IGNORECASE,
    )

    def _format_query_for_url(self, query: str) -> str:
        """Return the query as a lowercase, hyphen-separated search slug.

        Every run of whitespace collapses into a single `-`, so a query with
        doubled spaces no longer produces `--` in the URL. Per the original
        author's note, `+` as a separator works on this site as well.
        """
        return re.sub(r"\s+", "-", query.strip().lower())

    def _fetch_scene_metadata(
        self, scene_url: str
    ) -> tuple[RawStudio | None, list[RawPerformer], list[RawTag]] | None:
        """Fetch a scene page and parse its `js-ajax-{dvd,model,tag}` blocks.

        Returns:
            `(studio, performers, tags)` on success, or `None` when the page
            could not be fetched or did not answer with HTTP 200. The studio is
            `None` when the page has no `dvd` block.
        """
        try:
            r = browser_get(scene_url, timeout=self._timeout)
        except Exception as e:
            # Best-effort enrichment: a failed detail fetch must not abort the
            # search run, so it is only logged.
            log.debug("pornhat detail fetch failed %s: %s", scene_url, e)
            return None
        if r.status_code != 200:
            return None

        def _text(value: object) -> str:
            # Payload fields are expected to be strings; anything else counts
            # as missing instead of raising AttributeError on `.strip()`.
            return value.strip() if isinstance(value, str) else ""

        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_performers: set[str] = set()
        seen_tags: set[str] = set()

        for m in _AJAX_DATA_RE.finditer(r.text):
            kind = m.group("kind").lower()
            try:
                data = json.loads(m.group("json"))
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, string, number) carries
            # none of the fields read below.
            if not isinstance(data, dict):
                continue
            name = _text(data.get("title"))
            if not name:
                continue
            slug = _text(data.get("dir")) or None
            key = slug or name.lower()

            if kind == "dvd":
                # `dvd` is the studio/series wrapper (e.g. "Adult Time"). The
                # first occurrence wins; more than one is rare.
                if studio is None:
                    studio = RawStudio(
                        external_id=f"pornhatcom:dvd:{key}",
                        name=name,
                        slug=slug,
                    )
            elif kind == "model":
                # The same model can be linked more than once on a page.
                folded = name.lower()
                if folded not in seen_performers:
                    seen_performers.add(folded)
                    performers.append(RawPerformer(name=name))
            elif kind == "tag":
                if key not in seen_tags:
                    seen_tags.add(key)
                    tags.append(
                        RawTag(
                            external_id=f"pornhatcom:tag:{key}",
                            name=name,
                            slug=slug,
                        )
                    )

        return studio, performers, tags
|
||||||
24
app/connectors/direct_scrapers/pornhub.py
Normal file
24
app/connectors/direct_scrapers/pornhub.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
"""PornHub.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.pornhub.com/video/search?search=<q>&page=<n>`
|
||||||
|
Scene URL: `https://www.pornhub.com/view_video.php?viewkey=<id>`
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class PornHubScraper(BaseSearchScraper):
    """Search-mode scraper for pornhub.com.

    Scene links are relative `/view_video.php?viewkey=<id>` hrefs; there is no
    human-readable slug in the URL.
    """

    sitetag = "pornhubcom"
    _search_url_template = "https://www.pornhub.com/video/search?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>/view_video\.php\?viewkey=[A-Za-z0-9]+)"',
    )

    def _slug_from_match(self, m, scene_url):
        """Return the viewkey of the matched link as a stand-in slug.

        Pornhub URLs carry no slug, so the viewkey (the text after the last
        `=`) is used instead. Per the original author's note, the real title is
        backfilled later at resolve time.
        """
        _, _, viewkey = m.group("url").rpartition("=")
        return viewkey
|
||||||
33
app/connectors/direct_scrapers/porntrex.py
Normal file
33
app/connectors/direct_scrapers/porntrex.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
"""PornTrex.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.porntrex.com/search/<q>/` (single page, brak ?page=).
|
||||||
|
Scene URL: `https://www.porntrex.com/video/<id>/<slug>/`
|
||||||
|
|
||||||
|
Porntrex pagination is inconsistent between views — for page > 1 we append `?from_videos=<page>`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class PornTrexScraper(BaseSearchScraper):
    """Search-mode scraper for porntrex.com.

    Page 1 uses the plain `/search/<query>/` URL; later pages append a
    `?from_videos=<page>` parameter.
    """

    sitetag = "porntrexcom"
    _search_url_template = "https://www.porntrex.com/search/{query}/"
    # `url` excludes the optional trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.porntrex\.com/video/\d+/(?P<slug>[a-z0-9_\-]+))/?"',
        re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Yield the base-class search results for `query`.

        For `page > 1` the instance's URL template is temporarily replaced by a
        variant ending in `/?from_videos=<page>` and restored afterwards.
        """
        if page <= 1:
            yield from super().search(query, page=page, limit=limit)
            return

        # NOTE: the template is swapped on the instance itself and stays
        # swapped for as long as this generator is suspended between yields.
        base_template = self._search_url_template
        self._search_url_template = f"{base_template.rstrip('/')}/?from_videos={page}"
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = base_template
|
||||||
304
app/connectors/direct_scrapers/pornxp.py
Normal file
304
app/connectors/direct_scrapers/pornxp.py
Normal file
|
|
@ -0,0 +1,304 @@
|
||||||
|
"""pornxp.ph — latest-vids browse scraper.
|
||||||
|
|
||||||
|
URL patterns:
|
||||||
|
- Listing: `https://pornxp.ph/` (page 1, 72 cards) lub `?p=N` (pagination).
|
||||||
|
URL-e w listing mają randomized suffix per request (`/videos/94528971225` vs
|
||||||
|
`/videos/94528971837`) — **`data-id` (np. `94528971`) jest stable** i tego
|
||||||
|
używamy dla external_id zamiast całego URL.
|
||||||
|
- Detail: `/videos/<id_with_suffix>`.
|
||||||
|
- Tags: `/tags/<URL-encoded-name>`. Trzy kategorie wnioskowane heurystyką
|
||||||
|
z `_classify_tag` (studio vs performer vs tag).
|
||||||
|
|
||||||
|
Rich signals (perfekt dla canonical match scoring):
|
||||||
|
- Title (`<div class="item_title">` w listing card + `<h1>` na detail)
|
||||||
|
- Studio (z `<div class="tags">` pierwszy tag z `.com`/`.co` LUB CamelCase concat)
|
||||||
|
- Performers (z tags w `<div class="tags">`, Capital + space + Capital)
|
||||||
|
- Release year (regex `Released:` na detail page bodyText)
|
||||||
|
- Duration (`<div class="item_dur">MM:SS</div>` listing card)
|
||||||
|
- Direct mp4 streams (`<source src="https://sv.porn-xp.com/.../720.mp4">`) — no hoster
|
||||||
|
- Animated preview (`data-preview="//t.porn-xp.com/.../<id>.mp4"`)
|
||||||
|
|
||||||
|
Thumbnail: `<img class="item_img" src="/<id>.jpg">` — relatywny, pornxp's own CDN.
|
||||||
|
Phash hit-rate niskie ale studio+performer+title fuzzy match wystarczy do canonical.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import date
|
||||||
|
from urllib.parse import unquote, urljoin
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)

_BASE = "https://pornxp.ph"

# Listing card — DOTALL because the HTML cards span multiple lines.
# Variant 1 (eager): `<img class="item_img" src="/<id>.jpg">`
# Variant 2 (lazy): `<img class="item_img lazy" src="/images/fluid_spinner.svg" data-src="/<id>.jpg">`
# Both variants are captured through `img_attrs`; `_extract_scene_urls` then
# prefers `data-src` over `src`.
_LISTING_CARD_RE = re.compile(
    r'<div class="item preview"\s+data-id="(?P<id>\d+)"'
    r'(?:\s+data-preview="(?P<preview>[^"]*)")?[^>]*>'
    r'\s*<a href="(?P<url>/videos/\d+)"[^>]*>'
    r'.*?<img class="item_img(?:\s+[\w\-]+)*"\s+(?P<img_attrs>[^>]+)>'
    r'.*?<div class="item_dur">(?P<dur>[^<]+)</div>'
    r'.*?<div class="item_title">(?P<title>[^<]+)</div>',
    re.IGNORECASE | re.DOTALL,
)
# NOTE: `\bsrc=` also matches inside `data-src=` (the hyphen is a word
# boundary), so callers must test `_IMG_DATASRC_RE` first.
_IMG_SRC_RE = re.compile(r'\bsrc="([^"]+)"', re.IGNORECASE)
_IMG_DATASRC_RE = re.compile(r'\bdata-src="([^"]+)"', re.IGNORECASE)

# Detail page — tags wrapper. Sometimes <div class="tags">, sometimes inline.
# The match stops at the nearest </div> because the tags of the scene itself
# sit in a single div.
_DETAIL_TAGS_BLOCK_RE = re.compile(
    r'<div class="tags">(?P<inner>.*?)</div>', re.IGNORECASE | re.DOTALL,
)
# Group 1: the URL-encoded tag name from the href; group 2: the anchor text.
_TAG_LINK_RE = re.compile(
    r'<a\s+href="/tags/([^"]+)"[^>]*>([^<]+)</a>', re.IGNORECASE,
)
_RELEASED_RE = re.compile(r'Released:\s*(\d{4})', re.IGNORECASE)
_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>', re.IGNORECASE)
# Direct mp4/m3u8 sources — 720 is preferred over 360. The URL is often
# protocol-relative (`<source src="//sv.porn-xp.com/.../720.mp4">`); the
# consumer normalizes it to `https://...`.
_SOURCE_RE = re.compile(
    r'<source\s+src="(?P<url>(?:https?:)?//[^"]+\.(?:mp4|m3u8))"',
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_mmss(s: str) -> int | None:
|
||||||
|
"""`16:12` → 972, `1:20:37` → 4837. None gdy format niepoprawny."""
|
||||||
|
parts = s.strip().split(":")
|
||||||
|
try:
|
||||||
|
if len(parts) == 2:
|
||||||
|
return int(parts[0]) * 60 + int(parts[1])
|
||||||
|
if len(parts) == 3:
|
||||||
|
return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_tag(name: str) -> str:
|
||||||
|
"""Zwraca 'studio' | 'performer' | 'tag'.
|
||||||
|
|
||||||
|
Heurystyka oparta na sample analysis pornxp.ph tagów:
|
||||||
|
- Studio: zawiera `.` (`TheTeenBay.co`, `Clips4sale.tv`) LUB CamelCase concat
|
||||||
|
bez spacji (`LegalPorno`, `DirtyWivesClub`, `AnalMom`, `Clips4sale`)
|
||||||
|
- Performer: dokładnie 2 słowa Capital + Capital (`Alix Lynx`, `Reagan Foxx`)
|
||||||
|
- Tag/category: pozostałe — lowercase single word LUB Cap single word
|
||||||
|
(`oral`, `Lesbians`, `Incest`, `BBC`)
|
||||||
|
|
||||||
|
Edge case: single-word studio jak "Brazzers", "Vixen" → klasyfikowane jako tag.
|
||||||
|
To akceptowalne — composite score scoring tags ma niższą wagę niż studio match,
|
||||||
|
więc fallback z 1+ performer match wystarczy.
|
||||||
|
"""
|
||||||
|
name = name.strip()
|
||||||
|
if not name:
|
||||||
|
return "tag"
|
||||||
|
if "." in name:
|
||||||
|
return "studio"
|
||||||
|
if " " in name:
|
||||||
|
parts = name.split()
|
||||||
|
if len(parts) == 2 and all(p[:1].isupper() for p in parts if p):
|
||||||
|
return "performer"
|
||||||
|
return "tag"
|
||||||
|
# No spaces:
|
||||||
|
# ALL-uppercase (BBC, POV, BDSM, MILF) → tag (skróty/akronimy)
|
||||||
|
if name.isupper():
|
||||||
|
return "tag"
|
||||||
|
# CamelCase mix (LegalPorno, AnalMom, DirtyWivesClub) → studio
|
||||||
|
if any(c.isupper() for c in name[1:]):
|
||||||
|
return "studio"
|
||||||
|
return "tag"
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(name: str) -> str:
|
||||||
|
"""`Alix Lynx` → `alix-lynx`. Lowercase, spaces→hyphens, alphanum only."""
|
||||||
|
return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")
|
||||||
|
|
||||||
|
|
||||||
|
class PornXPScraper(BaseBrowseScraper):
    """Browse-mode scraper for the pornxp.ph latest-videos listing.

    Listing cards supply the stable `data-id`, duration, thumbnail and preview;
    detail pages supply the `<h1>` title, tag links, release year and stream
    sources. `_extract_scene_urls` caches the card data so that `_parse_detail`
    can merge both views into a single `RawScene`.
    """

    sitetag = "pornxpph"

    def __init__(self) -> None:
        super().__init__()
        # Listing-card metadata keyed by absolute scene URL. Filled by
        # `_extract_scene_urls` and read by `_parse_detail`, because the detail
        # page has neither `<div class="item_dur">` nor a thumbnail URL. It is
        # replaced wholesale on every `_extract_scene_urls` call.
        self._listing_cache: dict[str, dict] = {}

    def _listing_url(self, page: int) -> str:
        """Return the listing URL: the homepage for page 1, `?p=N` otherwise."""
        if page <= 1:
            return f"{_BASE}/"
        return f"{_BASE}/?p={page}"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the scene URLs of a listing page in document order.

        Side effect: resets `self._listing_cache` and stores, per URL, the card
        metadata (`data_id`, `preview_mp4`, `thumb`, `duration_sec`, `title`).
        """
        self._listing_cache = {}
        out: list[str] = []
        for card in _LISTING_CARD_RE.finditer(listing_html):
            url = urljoin(_BASE, card.group("url"))
            if url in self._listing_cache:
                continue

            # Prefer `data-src` (the real image of a lazy-loaded card) over
            # `src`, which is only a spinner placeholder on lazy cards. Eager
            # cards have `src` alone.
            img_attrs = card.group("img_attrs") or ""
            thumb = None
            if (lazy := _IMG_DATASRC_RE.search(img_attrs)):
                thumb = lazy.group(1)
            elif (eager := _IMG_SRC_RE.search(img_attrs)):
                src = eager.group(1)
                # Skip the placeholder spinner when there is no `data-src`.
                if "spinner" not in src.lower():
                    thumb = src
            if thumb and not thumb.startswith("http"):
                thumb = urljoin(_BASE, thumb)

            preview = card.group("preview")
            if preview and preview.startswith("//"):
                preview = "https:" + preview

            self._listing_cache[url] = {
                "data_id": card.group("id"),
                "preview_mp4": preview,
                "thumb": thumb,
                "duration_sec": _parse_mmss(card.group("dur") or ""),
                "title": card.group("title").strip(),
            }
            out.append(url)
        return out

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a detail page plus the cached listing card.

        Returns `None` when neither the detail `<h1>` nor the listing card
        provides a title.
        """
        # Listing-card meta is preferred: the detail page has no duration/thumb.
        meta = self._listing_cache.get(scene_url, {})
        data_id = meta.get("data_id")
        if not data_id:
            # The URL is not in the current listing cache, so fall back to the
            # digits in the URL.
            # NOTE(review): this captures the whole digit run, including the
            # per-request random suffix, so the id differs from the `data-id`
            # a listing card would give for the same scene — confirm intent.
            id_match = re.search(r"/videos/(\d{6,12})", scene_url)
            data_id = id_match.group(1) if id_match else None

        # Title: the detail <h1> wins over the listing-card title.
        title = meta.get("title") or ""
        if (h1 := _H1_RE.search(detail_html)):
            title = h1.group(1).strip() or title
        if not title:
            return None

        duration_sec = meta.get("duration_sec")
        thumb = meta.get("thumb")

        # Release year (`Released: 2016`). RawScene takes a full `date`, so
        # Jan 1 is stored as a placeholder that still carries the year signal.
        release_date: date | None = None
        if (released := _RELEASED_RE.search(detail_html)):
            year = int(released.group(1))
            if 1970 <= year <= 2100:
                release_date = date(year, 1, 1)

        # Tags: only the first <div class="tags">...</div> block, i.e. this
        # scene's own tags rather than those of related videos.
        studio: RawStudio | None = None
        performers: list[RawPerformer] = []
        tags: list[RawTag] = []
        seen_perf_slugs: set[str] = set()
        seen_tag_slugs: set[str] = set()
        if (block := _DETAIL_TAGS_BLOCK_RE.search(detail_html)):
            for tag_m in _TAG_LINK_RE.finditer(block.group("inner")):
                # The anchor text is preferred; the URL-decoded href segment is
                # the fallback when the anchor text is blank.
                display = tag_m.group(2).strip() or unquote(tag_m.group(1)).strip()
                kind = _classify_tag(display)
                slug = _slugify(display)
                if not slug:
                    continue
                ext_id = f"{self.sitetag}:{kind}:{slug}"
                if kind == "studio":
                    if studio is None:  # the first studio-like tag wins
                        studio = RawStudio(external_id=ext_id, name=display, slug=slug)
                elif kind == "performer":
                    if slug not in seen_perf_slugs:
                        seen_perf_slugs.add(slug)
                        performers.append(RawPerformer(external_id=ext_id, name=display))
                elif slug not in seen_tag_slugs:
                    seen_tag_slugs.add(slug)
                    tags.append(RawTag(external_id=ext_id, name=display, slug=slug))

        # Playback: direct streams from `<source src="...">`. URLs are often
        # protocol-relative and are normalized to `https:`.
        all_sources = [
            "https:" + src if src.startswith("//") else src
            for src in (s.group("url") for s in _SOURCE_RE.finditer(detail_html))
        ]
        # Prefer the 720 variant. The token must not be part of a longer digit
        # run: a plain substring test would also hit numeric ids in the path
        # (e.g. `.../94720123/360.mp4`) and could pick a lower quality.
        stream_url: str | None = next(
            (u for u in all_sources if re.search(r"(?<!\d)720(?!\d)", u)),
            all_sources[0] if all_sources else None,
        )

        # Perceptual hash of the thumbnail. The original author expects a low
        # match rate here and relies mainly on studio/performer/year/title.
        fingerprints: list[RawFingerprint] = []
        if thumb:
            ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # The listing serves a random URL suffix per request for the same
        # scene, so the playback source uses `/videos/<data_id>` to avoid
        # duplicate rows on every scrape run.
        canonical_url = f"{_BASE}/videos/{data_id}" if data_id else scene_url
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=canonical_url,
                duration_sec=duration_sec,
                thumbnail_url=thumb,
                stream_url=stream_url,
            )
        ]

        return RawScene(
            external_id=f"{self.sitetag}:{data_id}" if data_id else f"{self.sitetag}:{scene_url}",
            title=title,
            release_date=release_date,
            duration_sec=duration_sec,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
22
app/connectors/direct_scrapers/redtube.py
Normal file
22
app/connectors/direct_scrapers/redtube.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
"""RedTube.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.redtube.com/?search=<q>&page=<n>`
|
||||||
|
Scene URL: `https://www.redtube.com/<id>` (slug nie ma w URL — viewkey-only).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class RedTubeScraper(BaseSearchScraper):
    """Search-mode scraper for redtube.com.

    Scene URLs are `https://www.redtube.com/<numeric id>`; the `slug` group is
    therefore the numeric id, not a readable slug.
    """

    sitetag = "redtubecom"
    _search_url_template = "https://www.redtube.com/?search={query}&page={page}"
    _scene_url_re = re.compile(
        r'href="(?P<url>https://www\.redtube\.com/(?P<slug>\d+))"',
    )

    def _title_from_slug(self, slug) -> str:
        """Return a `redtube:<slug>` placeholder title.

        A numeric id makes no sense as a title; per the original author's note
        the real title is backfilled later at resolve time.
        """
        return f"redtube:{slug}"
|
||||||
183
app/connectors/direct_scrapers/shyfap.py
Normal file
183
app/connectors/direct_scrapers/shyfap.py
Normal file
|
|
@ -0,0 +1,183 @@
|
||||||
|
"""shyfap.net — latest-vids browse scraper.
|
||||||
|
|
||||||
|
Browse-only (nie search-driven). Sitetag `shyfapnet`. Bogata metadata na detail
|
||||||
|
page'u (meta tags + body links): title, studio, performers, tags, duration,
|
||||||
|
description, upload_date, embed_url.
|
||||||
|
|
||||||
|
Pierwszy pilot scrapera browse-mode (2026-05-12) — weryfikacja czy detail-page
|
||||||
|
metadata wystarcza do canonical match >5%. Jeśli tak → rozszerzamy o porn00,
|
||||||
|
fullmovies, pornxp, freshporno, 4k69, hdporn.gg.
|
||||||
|
|
||||||
|
URL patterns:
|
||||||
|
- Listing: `/videos_1/` (page 1), `/videos_1/<n>/` (page 2+)
|
||||||
|
- Scene: `/video/<slug>_v<id>/`
|
||||||
|
- Embed: `/embed/<id>` (z og:video meta)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from app.connectors.base import RawFingerprint, RawPerformer, RawPlaybackSource, RawScene, RawStudio, RawTag
|
||||||
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
_BASE = "https://www.shyfap.net"
# Relative scene links on listing pages: `/video/<slug>_v<id>/`.
_SCENE_URL_RE = re.compile(r'href="(/video/[a-z0-9\-]+_v\d+/)"', re.IGNORECASE)
# The three link patterns below share one layout — group 1: slug,
# group 2: numeric id, group 3: the text right after the opening tag.
_STUDIO_LINK_RE = re.compile(
    r'href="/studio/([a-z0-9\-]+)_s(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_PORNSTAR_LINK_RE = re.compile(
    r'href="/pornstar/([a-z0-9\-]+)_p(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
_TAG_LINK_RE = re.compile(
    r'href="/tag/([a-z0-9\-]+)_t(\d+)/"[^>]*>([^<]+)', re.IGNORECASE
)
# /video/<slug>_v<id>/ — the id is taken from the URL rather than from the
# `ya:ovs:id` meta tag, to avoid meta-vs-URL drift. `_parse_detail` uses it to
# build the embed-URL fallback.
_INTERNAL_ID_RE = re.compile(r"_v(\d+)/?$", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
class ShyfapScraper(BaseBrowseScraper):
    """Browse-mode scraper for the shyfap.net latest-videos listing."""

    sitetag = "shyfapnet"

    def _listing_url(self, page: int) -> str:
        """Return the listing URL for `page`.

        Site quirk: the `_1` suffix is always present, so page 1 is
        `/videos_1/` and page N is `/videos_1/N/`.
        """
        if page <= 1:
            return f"{_BASE}/videos_1/"
        return f"{_BASE}/videos_1/{page}/"

    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return the absolute scene URLs of a listing page, de-duplicated in order."""
        # `dict.fromkeys` keeps the first occurrence of each relative href.
        unique_rels = dict.fromkeys(
            m.group(1) for m in _SCENE_URL_RE.finditer(listing_html)
        )
        return [urljoin(_BASE, rel) for rel in unique_rels]

    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Build a `RawScene` from a detail page.

        Returns `None` when no title can be found in either `og:title` or the
        `<title>` element.
        """

        def unique_links(pattern: re.Pattern[str]) -> list[tuple[str, str, str]]:
            # Collect `(slug, id, name)` once per id, in document order. Anchors
            # with whitespace-only text (e.g. a link wrapping an image) are
            # skipped *before* the id is marked as seen, so a later text link
            # to the same entity still supplies the name.
            seen_ids: set[str] = set()
            found: list[tuple[str, str, str]] = []
            for link in pattern.finditer(detail_html):
                slug, ident, name = link.group(1), link.group(2), link.group(3).strip()
                if not name or ident in seen_ids:
                    continue
                seen_ids.add(ident)
                found.append((slug, ident, name))
            return found

        # Title from og:title, falling back to the <title> element.
        title = meta_content(detail_html, property="og:title")
        if not title:
            m = re.search(r"<title>([^<|]+)(?:\s*[-|])", detail_html, re.IGNORECASE)
            if m:
                title = m.group(1).strip()
        if not title:
            return None

        description = meta_content(detail_html, property="og:description") or meta_content(
            detail_html, name="description"
        )

        # Duration: <meta property="video:duration" content="2436"> (seconds).
        # `isdecimal` (unlike `isdigit`) only accepts characters that `int()`
        # can parse, and the value is stripped so padding does not discard it.
        duration_sec: int | None = None
        dur_str = (meta_content(detail_html, property="video:duration") or "").strip()
        if dur_str.isdecimal():
            duration_sec = int(dur_str)

        # Upload date: <meta property="ya:ovs:upload_date" content="2021-12-07T09:07:11+03:00">.
        # This is the upload date on shyfap, NOT the scene's real release date.
        # Per the original author's note it is still better than nothing for
        # date-proximity matching in the resolver.
        release_date: date | None = None
        upload_str = meta_content(detail_html, property="ya:ovs:upload_date")
        if upload_str:
            try:
                release_date = datetime.fromisoformat(upload_str).date()
            except ValueError:
                # Unparseable timestamp: the scene is kept without a date.
                pass

        thumbnail_url = meta_content(detail_html, property="og:image")

        # Internal id from the URL, used for the embed-URL fallback below.
        internal_id: str | None = None
        m = _INTERNAL_ID_RE.search(scene_url)
        if m:
            internal_id = m.group(1)
        # Embed URL: og:video (usually /embed/<id>).
        embed_url = meta_content(detail_html, property="og:video")
        if not embed_url and internal_id:
            embed_url = f"{_BASE}/embed/{internal_id}"

        # Studio — the first named `/studio/<slug>_s<id>/` link on the page.
        studio: RawStudio | None = None
        studio_links = unique_links(_STUDIO_LINK_RE)
        if studio_links:
            slug, sid, name = studio_links[0]
            studio = RawStudio(
                external_id=f"shyfapnet:studio:{sid}",
                name=name,
                slug=slug,
            )

        # Performers — every `/pornstar/<slug>_p<id>/` link, once per id.
        performers: list[RawPerformer] = [
            RawPerformer(external_id=f"shyfapnet:performer:{pid}", name=name)
            for _slug, pid, name in unique_links(_PORNSTAR_LINK_RE)
        ]

        # Tags — every `/tag/<slug>_t<id>/` link, once per id.
        tags: list[RawTag] = [
            RawTag(external_id=f"shyfapnet:tag:{tid}", name=name, slug=slug)
            for slug, tid, name in unique_links(_TAG_LINK_RE)
        ]

        # Playback source: embed-only. Per the original author's note, direct
        # stream extraction is a follow-up that needs its own extractor entry.
        playback_sources = [
            RawPlaybackSource(
                origin=f"tube:{self.sitetag}",
                page_url=scene_url,
                embed_url=embed_url,
                duration_sec=duration_sec,
                thumbnail_url=thumbnail_url,
            )
        ]

        # Perceptual hash of the thumbnail, independent of any title rebranding
        # on the site.
        fingerprints: list[RawFingerprint] = []
        if thumbnail_url:
            ph = compute_thumbnail_phash(thumbnail_url, referer=_BASE + "/")
            if ph:
                fingerprints.append(RawFingerprint(kind="phash", value=ph))

        # NOTE(review): the external id is keyed on the full scene URL, not on
        # `internal_id`; kept as is so already-ingested ids stay valid.
        return RawScene(
            external_id=f"{self.sitetag}:{scene_url}",
            title=title,
            description=description,
            duration_sec=duration_sec,
            release_date=release_date,
            url=scene_url,
            studio=studio,
            performers=performers,
            tags=tags,
            fingerprints=fingerprints,
            playback_sources=playback_sources,
        )
|
||||||
19
app/connectors/direct_scrapers/siska.py
Normal file
19
app/connectors/direct_scrapers/siska.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""siska.video — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://siska.video/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://siska.video/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class SiskaScraper(BaseSearchScraper):
    """Site configuration for siska.video.

    Declares only class attributes; no behaviour is overridden here, so all
    request and parsing logic comes from ``BaseSearchScraper``.
    """

    sitetag = "siskavideo"

    # Search URL with ``{page}`` and ``{query}`` placeholders.
    _search_url_template = "https://siska.video/page/{page}/?s={query}"

    # Single-segment scene links. The ``url`` group excludes the trailing
    # slash; ``slug`` is the path segment itself.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://siska\.video/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
78
app/connectors/direct_scrapers/sxyland.py
Normal file
78
app/connectors/direct_scrapers/sxyland.py
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
"""SxyLandScraper — direct HTML scrape sxyland.com search.
|
||||||
|
|
||||||
|
Search: `https://sxyland.com/?s=<query>` zwraca wyniki w formacie
|
||||||
|
`https://sxyland.com/<numeric_id>/<slug>/`. Filtrujemy linki bez numeric ID
|
||||||
|
(legal pages typu /18-u-s-c-2257/).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Result links of the form https://sxyland.com/<numeric_id>/<slug>/.
# Group 1: URL without the trailing slash, group 2: numeric id, group 3: slug.
# Links without a numeric id segment do not match.
_SCENE_URL_RE = re.compile(
    r'href="(https://sxyland\.com/(\d+)/([^"/]+))/?"'
)
|
||||||
|
|
||||||
|
|
||||||
|
class SxyLandScraper(BaseDirectTubeScraper):
    """Direct HTML scraper for sxyland.com search results."""

    sitetag = "sxylandcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one sxyland.com search-results page.

        Scene links are extracted with ``_SCENE_URL_RE`` and de-duplicated.
        A link is kept only when its slug contains at least one query token
        of three or more characters; when the query has no such token, no
        filtering is applied. The title is derived from the slug and the
        stripped query is recorded as the only performer.

        Args:
            query: Free-text search phrase.
            page: Page number placed in the ``/page/<n>/`` URL segment.
            limit: Maximum number of scenes to yield. ``None`` means no cap;
                zero or a negative value yields nothing.

        Yields:
            One ``RawScene`` per accepted scene link.
        """
        # Without this guard a limit of 0 would still yield one scene, because
        # the cap is only checked after a yield. Bail out before any network I/O.
        if limit is not None and limit <= 0:
            return

        performer_name = query.strip()
        encoded_query = urllib.parse.quote_plus(performer_name)
        search_url = f"https://sxyland.com/page/{page}/?s={encoded_query}"
        try:
            response = browser_get(search_url, timeout=30)
        except Exception as e:
            # Deliberately broad: the search is best-effort and the exception
            # types raised by browser_get are not visible from this module.
            log.warning("sxyland search fetch failed: %s", e)
            return
        if response.status_code != 200:
            log.debug("sxyland search returned HTTP %s for %s", response.status_code, search_url)
            return

        # Tokens shorter than three characters are too noisy to filter on.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen: set[str] = set()
        yielded = 0
        for match in _SCENE_URL_RE.finditer(response.text):
            scene_url = match.group(1) + "/"
            if scene_url in seen:
                continue
            seen.add(scene_url)

            slug = match.group(3)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=slug.replace("-", " ").strip(),
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(origin=f"tube:{self.sitetag}", page_url=scene_url)
                ],
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:sxyland",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
24
app/connectors/direct_scrapers/sxyprn.py
Normal file
24
app/connectors/direct_scrapers/sxyprn.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
"""sxyprn.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Sxyprn search jest oparte na `?type=videos&query=<q>` GET endpoint który zwraca
|
||||||
|
HTML strony z linkami. Scene URL format: `https://sxyprn.com/post/<post_id>.html`.
|
||||||
|
|
||||||
|
Page'owanie sxyprn niespójne — często single-page results dla query (~24 wyników).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class SxyPrnScraper(BaseSearchScraper):
    """Site configuration for sxyprn.com search results."""

    sitetag = "sxyprncom"

    # Search URL with ``{query}`` and ``{page}`` placeholders.
    _search_url_template = "https://sxyprn.com/?type=videos&query={query}&page={page}"

    # Relative post links (``/post/<id>.html``). The ``url`` group excludes the
    # ``.html`` suffix. No flags are set, so matching is case-sensitive.
    _scene_url_re = re.compile(
        r'href="(?P<url>/post/(?P<slug>[a-z0-9]+))\.html"',
    )

    def _title_from_slug(self, slug: str) -> str:
        """Return a placeholder title built from the post id.

        The sxyprn post id is an unreadable hash, so no human-readable title
        can be derived from it; per the original author's note the real title
        is backfilled at resolve time.
        """
        placeholder = f"sxyprn:{slug}"
        return placeholder
|
||||||
19
app/connectors/direct_scrapers/watchporn.py
Normal file
19
app/connectors/direct_scrapers/watchporn.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""watchporn.to — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: `https://watchporn.to/page/<n>/?s=<q>` (WordPress).
|
||||||
|
Scene URL: `https://watchporn.to/videos/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class WatchPornScraper(BaseSearchScraper):
    """Site configuration for watchporn.to.

    Declares only class attributes; no behaviour is overridden here.
    """

    sitetag = "watchporn"

    # Search URL with ``{page}`` and ``{query}`` placeholders.
    _search_url_template = "https://watchporn.to/page/{page}/?s={query}"

    # Scene links under ``/videos/``. The ``url`` group excludes the trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://watchporn\.to/videos/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
19
app/connectors/direct_scrapers/xhamster.py
Normal file
19
app/connectors/direct_scrapers/xhamster.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""XHamster.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://xhamster.com/search/<q>?page=<n>`
|
||||||
|
Scene URL: `https://xhamster.com/videos/<slug>-<id>`
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class XHamsterScraper(BaseSearchScraper):
    """Site configuration for xhamster.com search results.

    Declares only class attributes; no behaviour is overridden here.
    """

    sitetag = "xhamstercom"

    # The query sits in the URL path; the page number is a query parameter.
    _search_url_template = "https://xhamster.com/search/{query}?page={page}"

    # Absolute scene links under ``/videos/``; ``slug`` is the final path segment.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xhamster\.com/videos/(?P<slug>[a-z0-9_\-]+))"',
        flags=re.IGNORECASE,
    )
|
||||||
19
app/connectors/direct_scrapers/xmoviesforyou.py
Normal file
19
app/connectors/direct_scrapers/xmoviesforyou.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
"""xmoviesforyou.com — direct HTML scrape.
|
||||||
|
|
||||||
|
Search: WordPress `?s=<q>` (lub `/page/<n>/?s=<q>` dla pagination).
|
||||||
|
Scene URL: `https://xmoviesforyou.com/<slug>/` (single segment).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class XMoviesForYouScraper(BaseSearchScraper):
    """Site configuration for xmoviesforyou.com.

    Declares only class attributes; no behaviour is overridden here.
    """

    sitetag = "xmoviesforyoucom"

    # Search URL with ``{page}`` and ``{query}`` placeholders.
    _search_url_template = "https://xmoviesforyou.com/page/{page}/?s={query}"

    # Single-segment scene links. The ``url`` group excludes the trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xmoviesforyou\.com/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
28
app/connectors/direct_scrapers/xnxx.py
Normal file
28
app/connectors/direct_scrapers/xnxx.py
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
"""XNXX.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.xnxx.com/search/<q>/<page-1>` (xnxx 0-indexed)
|
||||||
|
Scene URL: `https://www.xnxx.com/video-<id>/<slug>`
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class XnxxScraper(BaseSearchScraper):
    """Site configuration for xnxx.com, whose result pages are 0-indexed."""

    sitetag = "xnxxcom"

    # The trailing ``{page}`` segment must carry ``page - 1``; ``search()``
    # substitutes it before delegating to the base class.
    _search_url_template = "https://www.xnxx.com/search/{query}/{page}"

    # Relative scene links of the form ``/video-<id>/<slug>``.
    _scene_url_re = re.compile(
        r'href="(?P<url>/video-[a-z0-9]+/(?P<slug>[a-z0-9_\-]+))"',
        flags=re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Delegate to the base search with a 0-indexed page in the URL.

        The ``{page}`` placeholder of the template is replaced by ``page - 1``
        for the duration of the inherited call, and the original template is
        put back afterwards, also when the generator is closed early or raises.

        NOTE(review): the template is swapped on the instance, so it stays
        modified while this generator is suspended; overlapping searches on
        one instance would see each other's template.
        """
        saved_template = self._search_url_template
        zero_based_page = str(page - 1)
        self._search_url_template = saved_template.replace("{page}", zero_based_page)
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = saved_template
|
||||||
33
app/connectors/direct_scrapers/xvideos.py
Normal file
33
app/connectors/direct_scrapers/xvideos.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
"""XVideos.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.xvideos.com/?k=<q>&p=<page-1>` (xvideos używa 0-indexed pages)
|
||||||
|
Scene URL: `https://www.xvideos.com/video<digits>/<slug>`
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class XVideosScraper(BaseSearchScraper):
    """Site configuration for xvideos.com, whose result pages are 0-indexed."""

    sitetag = "xvideoscom"

    # ``p`` must carry ``page - 1``; ``search()`` substitutes the ``{page}``
    # placeholder before delegating to the base class.
    _search_url_template = "https://www.xvideos.com/?k={query}&p={page}"

    # Relative scene links of the form ``/video<id>/<slug>``.
    _scene_url_re = re.compile(
        r'href="(?P<url>/video[a-z0-9.\-]+/(?P<slug>[a-z0-9_\-]+))"',
        flags=re.IGNORECASE,
    )

    def search(self, query, *, page=1, limit=None):
        """Delegate to the base search with a 0-indexed page in the URL.

        XVideos uses 0-indexed pages, so ``page=1`` must become ``&p=0``.
        The inherited search reads ``self._search_url_template``, so the
        ``{page}`` placeholder is replaced by ``page - 1`` for the duration of
        the inherited call and the original template is put back afterwards.

        NOTE(review): the template is swapped on the instance, so it stays
        modified while this generator is suspended; overlapping searches on
        one instance would see each other's template.
        """
        saved_template = self._search_url_template
        zero_based_page = str(page - 1)
        self._search_url_template = saved_template.replace("{page}", zero_based_page)
        try:
            yield from super().search(query, page=page, limit=limit)
        finally:
            self._search_url_template = saved_template
|
||||||
21
app/connectors/direct_scrapers/xxxfreewatch.py
Normal file
21
app/connectors/direct_scrapers/xxxfreewatch.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
"""xxxfree.watch — direct HTML scrape.
|
||||||
|
|
||||||
|
Domain: `xxxfree.watch` (sitetag `xxxfreewatch` is legacy from porn-app DEFAULT_SITETAGS).
|
||||||
|
|
||||||
|
Search: `https://xxxfree.watch/page/<n>/?s=<q>`.
|
||||||
|
Scene URL: `https://xxxfree.watch/<slug>/`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class XxxFreeWatchScraper(BaseSearchScraper):
    """Site configuration for xxxfree.watch.

    Declares only class attributes; no behaviour is overridden here.
    """

    sitetag = "xxxfreewatch"

    # Search URL with ``{page}`` and ``{query}`` placeholders.
    _search_url_template = "https://xxxfree.watch/page/{page}/?s={query}"

    # Single-segment scene links. The ``url`` group excludes the trailing slash.
    _scene_url_re = re.compile(
        r'href="(?P<url>https://xxxfree\.watch/(?P<slug>[a-z0-9][a-z0-9\-]+))/"',
        flags=re.IGNORECASE,
    )
|
||||||
22
app/connectors/direct_scrapers/youporn.py
Normal file
22
app/connectors/direct_scrapers/youporn.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
"""YouPorn.com — direct HTML scrape search results.
|
||||||
|
|
||||||
|
Search: `https://www.youporn.com/search/?query=<q>&page=<n>`
|
||||||
|
Scene URL: `https://www.youporn.com/watch/<id>/<slug>/`
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
||||||
|
|
||||||
|
|
||||||
|
class YouPornScraper(BaseSearchScraper):
    """Site configuration for youporn.com search results."""

    sitetag = "youporncom"

    # Search URL with ``{query}`` and ``{page}`` placeholders.
    _search_url_template = "https://www.youporn.com/search/?query={query}&page={page}"

    # Relative scene links of the form ``/watch/<id>/<slug>`` with an optional
    # trailing slash; the numeric id and the slug are separate named groups.
    _scene_url_re = re.compile(
        r'href="(?P<url>/watch/(?P<id>\d+)/(?P<slug>[a-z0-9_\-]+))/?"',
        flags=re.IGNORECASE,
    )

    def _slug_from_match(self, m, scene_url):
        """Return the ``slug`` named group of the match; ``scene_url`` is unused."""
        slug = m.group("slug")
        return slug
|
||||||
119
app/connectors/direct_scrapers/zerodayxx.py
Normal file
119
app/connectors/direct_scrapers/zerodayxx.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
"""ZeroDayXXScraper — direct HTML scrape 0dayxx.com search.
|
||||||
|
|
||||||
|
Search: `https://0dayxx.com/page/<n>/?s=<query>`. Scene URL format:
|
||||||
|
`https://0dayxx.com/0day-porn-video/<slug>/` (lub czasem `/<category>/<slug>/`).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from app.connectors.base import RawPerformer, RawPlaybackSource, RawScene
|
||||||
|
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Scene links under one of the known 0dayxx category prefixes.
# Group 1: URL without the trailing slash, group 2: slug.
_SCENE_URL_RE = re.compile(
    r'href="(https://0dayxx\.com/(?:0day-porn-video|latest-porn-videos|porn-(?:bf|videos))/([^"/]+))/?"'
)

# Open Graph title; expects ``property`` before ``content`` inside the tag.
_OG_TITLE_RE = re.compile(
    r'<meta\s+property="og:title"\s+content="([^"]+)"',
    flags=re.IGNORECASE,
)

# Open Graph image; same attribute order as the title pattern.
_OG_IMAGE_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"',
    flags=re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_detail(scene_url: str) -> tuple[str | None, str | None]:
    """Fetch a 0dayxx detail page and extract ``(real_title, thumbnail_url)``.

    Per the original author's notes: 0dayxx is a wrapper (it embeds
    watchporn.to and others), so duration and tags are not on this page.
    The ``og:image`` is present, though, and gives a small (200x200)
    thumbnail. Without this fetch, 0dayxx scenes reached deduplication with
    a slug-derived title and no thumbnail, which caused missed matches or
    false-positive merges (reported 2026-05-09).

    Both values come from HTML attribute text, so character references
    (``&amp;``, ``&#8211;``, ...) are decoded before returning.

    Args:
        scene_url: Absolute URL of the scene detail page.

    Returns:
        ``(title, thumbnail_url)``. Either element is ``None`` when the
        request fails, the response status is not 200, the matching meta tag
        is missing, or the extracted value is empty.
    """
    from html import unescape  # function-scope import keeps the module's import block untouched

    try:
        r = browser_get(scene_url, timeout=20)
    except Exception as e:
        # Best-effort enrichment: the caller falls back to the slug-derived title.
        log.debug("0dayxx detail fetch failed for %s: %s", scene_url, e)
        return None, None
    if r.status_code != 200:
        return None, None

    page_html = r.text
    title: str | None = None
    thumb: str | None = None
    if (m := _OG_TITLE_RE.search(page_html)):
        # Strip a ` | 0dayxx.com Daily...` style suffix first (split on the raw
        # text so an escaped pipe inside the title is not treated as a separator),
        # then decode entities.
        title = unescape(m.group(1).split("|")[0]).strip() or None
    if (m := _OG_IMAGE_RE.search(page_html)):
        thumb = unescape(m.group(1)).strip() or None
    return title, thumb
|
||||||
|
|
||||||
|
|
||||||
|
class ZeroDayXXScraper(BaseDirectTubeScraper):
    """Direct HTML scraper for 0dayxx.com search results."""

    sitetag = "0dayxxcom"

    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes found on one 0dayxx.com search-results page.

        Scene links are extracted with ``_SCENE_URL_RE`` and de-duplicated.
        A link is kept only when its slug contains at least one query token
        of three or more characters; when the query has no such token, no
        filtering is applied. Each accepted link triggers one extra request
        through ``_fetch_detail`` for the real title and thumbnail. The title
        falls back to the slug when that fetch yields none. The stripped
        query is recorded as the only performer.

        Args:
            query: Free-text search phrase.
            page: Page number placed in the ``/page/<n>/`` URL segment.
            limit: Maximum number of scenes to yield. ``None`` means no cap;
                zero or a negative value yields nothing.

        Yields:
            One ``RawScene`` per accepted scene link.
        """
        # Without this guard a limit of 0 would still cost a search request
        # plus a detail request and yield one scene, because the cap is only
        # checked after a yield.
        if limit is not None and limit <= 0:
            return

        performer_name = query.strip()
        encoded_query = urllib.parse.quote_plus(performer_name)
        search_url = f"https://0dayxx.com/page/{page}/?s={encoded_query}"
        try:
            response = browser_get(search_url, timeout=30)
        except Exception as e:
            # Deliberately broad: the search is best-effort and the exception
            # types raised by browser_get are not visible from this module.
            log.warning("0dayxx search fetch failed: %s", e)
            return
        if response.status_code != 200:
            log.debug("0dayxx search returned HTTP %s for %s", response.status_code, search_url)
            return

        # Tokens shorter than three characters are too noisy to filter on.
        query_tokens = {tok for tok in query.lower().split() if len(tok) >= 3}

        seen: set[str] = set()
        yielded = 0
        for match in _SCENE_URL_RE.finditer(response.text):
            scene_url = match.group(1) + "/"
            if scene_url in seen:
                continue
            seen.add(scene_url)

            slug = match.group(2)
            slug_lower = slug.lower()
            if query_tokens and not any(tok in slug_lower for tok in query_tokens):
                continue

            # The detail fetch happens only for links that passed the filter.
            real_title, thumb = _fetch_detail(scene_url)
            title = real_title or slug.replace("-", " ").strip()

            yield RawScene(
                external_id=f"{self.sitetag}:{scene_url}",
                title=title,
                url=scene_url,
                playback_sources=[
                    RawPlaybackSource(
                        origin=f"tube:{self.sitetag}",
                        page_url=scene_url,
                        thumbnail_url=thumb,
                    )
                ],
                performers=[RawPerformer(name=performer_name)],
                raw={
                    "source": "direct_scraper:0dayxx",
                    "query": query,
                    "page": page,
                    "url": scene_url,
                },
            )
            yielded += 1
            if limit is not None and yielded >= limit:
                return
|
||||||
466
app/connectors/dooplay.py
Normal file
466
app/connectors/dooplay.py
Normal file
|
|
@ -0,0 +1,466 @@
|
||||||
|
"""dooplay (a.k.a. PsyPlay) WordPress theme scraper — generic dla mangoporn/streamporn/pandamovies.
|
||||||
|
|
||||||
|
Te 3 strony to dokładnie ten sam template (theme=dooplay + PsyPlay player plugin),
|
||||||
|
więc parametryzujemy connector po `(base_url, source_name)` i odpalamy 3 instancje.
|
||||||
|
|
||||||
|
Listing: `/movies/page/N/` zwraca <a href="/movies/<slug>/"> per item.
|
||||||
|
Detail: `/movies/<slug>/` ma rich meta:
|
||||||
|
- <h1> tytuł (w class="data" wrapper)
|
||||||
|
- <a href="/year/YYYY/" rel="tag"> rok produkcji
|
||||||
|
- <a href="/studios/<slug>/" rel="tag"> studio
|
||||||
|
- <span class='duration'>NN mins.</span> długość
|
||||||
|
- <a href="/pornstar/<slug>/"> cast (multi)
|
||||||
|
- <a href="/genre/<slug>/"> tagi (multi)
|
||||||
|
- <div itemprop="description"><p>...</p></div> opis
|
||||||
|
- <span class="dt_rating_vgs" itemprop="ratingValue">N</span> rating 0-10
|
||||||
|
- <li ... data-fl-source="<embed_url>"><a href="<embed_link>">Host</a></li> player options
|
||||||
|
|
||||||
|
Player ma multi-host options (DoodStream, LuluStream, RPMShare etc.) — każdy embed
|
||||||
|
URL idzie jako osobny `playback_source` z origin=`{site}:{host}` żeby później mobile
|
||||||
|
mógł wybrać czyim embedem chce odpalić scenę.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import date, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
BaseMovieConnector,
|
||||||
|
RawMovie,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.extractors import browser_get
|
||||||
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)


# ---- selectors (theme-agnostic: meant to work for any dooplay site) -------

# Listing item: two patterns depending on the site.
#   1. mangoporn: plain <a href="https://site/movies/<slug>/"> without a class
#      (the theme renders the SEO-friendly URL directly in the grid).
#   2. streamporn/pandamovies: <a class="ml-mask jt" href="<base>/<slug>/">
#      (slug without the /movies/ prefix).
# Both are caught through alternation; the URL lands in `url` or `url2`.
_LIST_ITEM_RE = re.compile(
    r'<a\s+href="(?P<url>https?://[^"]+)"[^>]*\bclass="ml-mask\b[^"]*"'
    r"|"
    r'<a\s+href="(?P<url2>https?://[^"]+/movies/[a-z0-9-]+/)"',
    flags=re.IGNORECASE,
)

# Tolerant title: mangoporn (dooplay) uses <h1> inside class="data", while
# streamporn/pandamovies (raw PsyPlay theme) use <h3 itemprop="name">.
# The title lands in group 1 (itemprop form) or group 2 (class="data" form).
_TITLE_RE = re.compile(
    r'<h[1-6][^>]*\sitemprop="name"[^>]*>([^<]+)</h[1-6]>'
    r'|class="data"[^>]*>\s*<h[1-6][^>]*>([^<]+)</h[1-6]>',
    flags=re.IGNORECASE | re.DOTALL,
)

# dooplay uses /year/, raw PsyPlay uses /release-year/. The same holds for the
# other slugs: themes inherit the base markup but customise the URL vocabulary.
_YEAR_RE = re.compile(
    r'/(?:year|release-year)/(\d{4})/"\s*rel="tag"',
    flags=re.IGNORECASE,
)

# Group 1: studio slug, group 2: studio display name.
_STUDIO_RE = re.compile(
    r'href="https?://[^/]+/(?:studios?|director)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    flags=re.IGNORECASE,
)

# Duration: <span class='duration'> (dooplay) or
# <p><strong>Duration:</strong> X hrs. Y mins.</p> (PsyPlay).
_DURATION_SPAN_RE = re.compile(
    r"<span\s+class=['\"]duration['\"][^>]*>([^<]+)</span>",
    flags=re.IGNORECASE,
)
_DURATION_TEXT_RE = re.compile(
    r"<strong>\s*Duration:\s*</strong>\s*([^<]+)<",
    flags=re.IGNORECASE,
)

# Release date: <span class='release_date'> (dooplay) or
# <p><strong>Released Date:</strong> X</p> (PsyPlay).
_RELEASE_DATE_SPAN_RE = re.compile(
    r"<span\s+class=['\"]release_date['\"]'?[^>]*>([^<]+)</span>",
    flags=re.IGNORECASE,
)
_RELEASE_DATE_TEXT_RE = re.compile(
    r"<strong>\s*Released?\s*Date:\s*</strong>\s*([^<]+)<",
    flags=re.IGNORECASE,
)

# Description body up to the first closing </div>.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description"[^>]*>(.*?)</div>',
    flags=re.IGNORECASE | re.DOTALL,
)

_RATING_RE = re.compile(
    r'itemprop="ratingValue"[^>]*>([\d.]+)</span>',
    flags=re.IGNORECASE,
)

# Cast: dooplay uses /pornstar/, PsyPlay uses /actor/.
# Group 1: performer slug, group 2: display name.
_PORNSTAR_RE = re.compile(
    r'href="https?://[^/]+/(?:pornstar|actor)/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    flags=re.IGNORECASE,
)

# Genre: /genre/ or /genres/ in both themes.
# Group 1: tag slug, group 2: display name.
_GENRE_TAG_RE = re.compile(
    r'href="https?://[^/]+/genres?/([a-z0-9-]+)/"\s+rel="tag"[^>]*>([^<]+)</a>',
    flags=re.IGNORECASE,
)
|
||||||
|
# Player options: data-fl-source is the hoster's original embed URL and
# data-fl-url is the page URL at the hoster. Old theme (mangoporn):
# `<li class="hosts-buttons-wpx">`. New theme (pandamovies since ~2026-04):
# `<div class="Rtable1-cell" data-fl-url=... data-fl-source=...>`, handled by a
# separate pattern.
#
# The three constraints on the opening <li> tag are expressed as lookaheads so
# the attributes may appear in any order. The previous form placed a greedy
# `[^>]*` in front of the optional data-fl-* groups; it consumed the whole tag,
# so `source` and `page` were always None and the embed URL was never captured.
# Group numbering is unchanged: 1=source, 2=page, 3=href, 4=link text.
_PLAYER_OPTION_RE = re.compile(
    r'<li(?=[^>]*\bclass="hosts-buttons-wpx")'
    r'(?=(?:[^>]*?data-fl-source="(?P<source>[^"]*)")?)'
    r'(?=(?:[^>]*?data-fl-url="(?P<page>[^"]*)")?)'
    r'[^>]*>\s*<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    re.IGNORECASE | re.DOTALL,
)
|
||||||
|
# New pandamovies markup:
# `<div class="Rtable1-cell" data-fl-* ...><a href=...>HostName</a></div>`.
# Attributes come in url -> source order and the source is often empty
# (`data-fl-source=""` for doodstream/mixdrop/easyvidplayer). The WHOLE opening
# tag is captured in group(1) so that data-fl-source is guaranteed to belong to
# THIS div. An earlier 600-character look-back window could pick up the
# previous cell (doodstream -> mixdrop cross-attribution, code-review #14).
_PLAYER_OPTION_DIV_RE = re.compile(
    r'(<div[^>]*\bclass="Rtable1-cell"[^>]*>)\s*'
    r'<a[^>]*href="(?P<href>[^"]+)"[^>]*'
    r'(?:[^<]*<img[^>]+>)?\s*([^<]+?)\s*</a>',
    flags=re.IGNORECASE | re.DOTALL,
)

# Extracts the data-fl-source value (possibly empty) from an opening tag.
_DATA_FL_SOURCE_RE = re.compile(
    r'data-fl-source="([^"]*)"',
    flags=re.IGNORECASE,
)

# Poster: per the original author, the JSON-LD `thumbnailUrl` is the most
# stable source (dooplay/PsyPlay themes with SEO carry a JSON-LD VideoObject
# schema). Fallback: the class="poster" img for old installs without a schema.
# Third fallback: the og:image meta tag.
_POSTER_JSONLD_RE = re.compile(
    r'"thumbnailUrl"\s*:\s*"([^"]+\.(?:jpg|jpeg|png|webp)[^"]*)"',
    flags=re.IGNORECASE,
)
_POSTER_RE = re.compile(
    r'class="poster"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
    flags=re.IGNORECASE,
)
_POSTER_OG_RE = re.compile(
    r'<meta\s+property="og:image"\s+content="([^"]+)"',
    flags=re.IGNORECASE,
)

# Minute count inside a duration text such as "32 mins.".
_DURATION_MINS_RE = re.compile(
    r"(\d+)\s*min",
    flags=re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
class DooplayConnector(BaseMovieConnector):
    """Generic dooplay scraper. Instantiated per-site via subclasses.

    Subclasses must define the class-level ``base_url`` and ``name``
    attributes; ``__init__`` raises ``RuntimeError`` when either is missing
    or empty.
    """

    kind = SourceKind.scraper
    base_url: str
    name: str

    def __init__(self, *, timeout: float = 30.0):
        """Validate the subclass configuration and remember the timeout.

        Args:
            timeout: Timeout passed to every ``browser_get`` call.

        Raises:
            RuntimeError: If ``base_url`` or ``name`` is missing or empty.
        """
        if not getattr(self, "base_url", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `base_url`")
        if not getattr(self, "name", None):
            raise RuntimeError(f"{type(self).__name__} requires class-level `name`")
        self._timeout = timeout

    def close(self) -> None:
        """Do nothing; ``__init__`` stores only the timeout value."""

    def _fetch(self, url: str) -> str:
        """Return the body of ``url`` fetched through ``browser_get``.

        Per the original author's note: psyplay sites sometimes block plain
        httpx (Python TLS fingerprint) with 500/403, and the browser
        impersonation done by ``browser_get`` works around that.

        Args:
            url: Absolute URL, or a path that is appended to ``base_url``.

        Returns:
            The response text.

        Raises:
            httpx.HTTPStatusError: If the response status is 400 or higher.
        """
        site_root = self.base_url.rstrip("/")
        if not url.startswith("http"):
            url = site_root + url
        headers = {
            "User-Agent": USER_AGENT,
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
            # Built from the stripped root so a trailing slash in base_url
            # does not produce a "//" Referer.
            "Referer": site_root + "/",
        }
        r = browser_get(url, headers=headers, timeout=self._timeout, follow_redirects=True)
        if r.status_code >= 400:
            # Raised as an httpx error so fetch_movies can handle it with its
            # `except httpx.HTTPError` clauses.
            raise httpx.HTTPStatusError(
                f"{r.status_code} for {url}",
                request=None,  # type: ignore[arg-type]
                response=httpx.Response(r.status_code, text=r.text[:200]),
            )
        return r.text

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Walk the listing pages and yield one parsed movie per detail page.

        Pagination stops when a listing request fails, a listing page has no
        items, or a listing page contains no URL that was not already seen.
        The last condition prevents an endless loop on sites that keep
        answering out-of-range pages with already-known links. A failing or
        unparsable detail page is skipped.

        Args:
            since: Accepted but not used by this implementation.
            limit: Maximum number of movies to yield. ``None`` means no cap;
                zero or a negative value yields nothing.

        Yields:
            ``RawMovie`` objects in listing order.
        """
        # Without this guard a limit of 0 would still fetch and yield one
        # movie, because the cap is only checked after a yield.
        if limit is not None and limit <= 0:
            return

        yielded = 0
        page = 1
        seen_urls: set[str] = set()
        while True:
            try:
                urls = list(self._fetch_listing(page))
            except httpx.HTTPError as e:
                log.warning("%s listing page=%d failed: %s", self.name, page, e)
                return
            if not urls:
                log.info("%s: empty page=%d, stop", self.name, page)
                return

            # dict.fromkeys de-duplicates within the page while keeping order.
            fresh_urls = [u for u in dict.fromkeys(urls) if u not in seen_urls]
            if not fresh_urls:
                log.info("%s: page=%d has no new items, stop", self.name, page)
                return

            for url in fresh_urls:
                seen_urls.add(url)
                try:
                    movie = self._fetch_detail(url)
                except httpx.HTTPError as e:
                    log.warning("%s detail %s failed: %s", self.name, url, e)
                    continue
                if movie is None:
                    continue
                yield movie
                yielded += 1
                if limit is not None and yielded >= limit:
                    return
            page += 1

    def _fetch_listing(self, page: int) -> Iterator[str]:
        """Yield the detail-page URLs found on listing page ``page``.

        Only URLs whose hostname equals the hostname of ``base_url`` are
        yielded; links to other hosts and unparsable URLs are dropped.
        """
        from urllib.parse import urlparse

        text = self._fetch(self._listing_path(page))
        site_host = urlparse(self.base_url).hostname
        for m in _LIST_ITEM_RE.finditer(text):
            url = m.group("url") or m.group("url2")
            if not url:
                continue
            try:
                link_host = urlparse(url).hostname
            except ValueError:
                # urlparse rejects malformed URLs with ValueError.
                continue
            if link_host != site_host:
                continue
            yield url

    def _listing_path(self, page: int) -> str:
        """Return the listing path: ``/movies/`` for page 1, else ``/movies/page/<n>/``."""
        return "/movies/" if page == 1 else f"/movies/page/{page}/"

    def _fetch_detail(self, url: str) -> RawMovie | None:
        """Fetch one detail page and hand it to ``_parse_dooplay_detail``.

        The slug is the last path segment of ``url`` (``"root"`` when the
        path is empty).
        """
        from urllib.parse import urlparse

        path = urlparse(url).path.rstrip("/")
        slug = path.split("/")[-1] or "root"
        text = self._fetch(url)
        return _parse_dooplay_detail(
            slug=slug, page_url=url, html=text,
            source_name=self.name, base_url=self.base_url,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_dooplay_detail(
|
||||||
|
*, slug: str, html: str, source_name: str, base_url: str, page_url: str | None = None
|
||||||
|
) -> RawMovie | None:
|
||||||
|
m_title = _TITLE_RE.search(html)
|
||||||
|
if not m_title:
|
||||||
|
log.warning("%s: no title in %s", source_name, slug)
|
||||||
|
return None
|
||||||
|
title = _decode_html((m_title.group(1) or m_title.group(2)).strip())
|
||||||
|
|
||||||
|
m_year = _YEAR_RE.search(html)
|
||||||
|
release_year = int(m_year.group(1)) if m_year else None
|
||||||
|
|
||||||
|
studio: RawStudio | None = None
|
||||||
|
m_studio = _STUDIO_RE.search(html)
|
||||||
|
if m_studio:
|
||||||
|
studio_slug = m_studio.group(1)
|
||||||
|
studio_name = _decode_html(m_studio.group(2).strip())
|
||||||
|
studio = RawStudio(
|
||||||
|
external_id=f"{source_name}:{studio_slug}",
|
||||||
|
name=studio_name,
|
||||||
|
slug=studio_slug,
|
||||||
|
)
|
||||||
|
|
||||||
|
duration_sec: int | None = None
|
||||||
|
m_dur = _DURATION_SPAN_RE.search(html) or _DURATION_TEXT_RE.search(html)
|
||||||
|
if m_dur:
|
||||||
|
text = m_dur.group(1)
|
||||||
|
# Może być "32 mins." (dooplay) albo "1 hrs. 12 mins." (PsyPlay)
|
||||||
|
m_h = re.search(r"(\d+)\s*hr", text, re.IGNORECASE)
|
||||||
|
m_m = re.search(r"(\d+)\s*min", text, re.IGNORECASE)
|
||||||
|
if m_h or m_m:
|
||||||
|
duration_sec = (int(m_h.group(1)) * 3600 if m_h else 0) + (int(m_m.group(1)) * 60 if m_m else 0)
|
||||||
|
|
||||||
|
release_date: date | None = None
|
||||||
|
m_rd = _RELEASE_DATE_SPAN_RE.search(html) or _RELEASE_DATE_TEXT_RE.search(html)
|
||||||
|
if m_rd:
|
||||||
|
text = m_rd.group(1).strip()
|
||||||
|
for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d"):
|
||||||
|
try:
|
||||||
|
release_date = datetime.strptime(text, fmt).date()
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
description: str | None = None
|
||||||
|
m_desc = _DESCRIPTION_RE.search(html)
|
||||||
|
if m_desc:
|
||||||
|
description = _decode_html(_strip_tags(m_desc.group(1))).strip() or None
|
||||||
|
|
||||||
|
rating: float | None = None
|
||||||
|
m_rating = _RATING_RE.search(html)
|
||||||
|
if m_rating:
|
||||||
|
try:
|
||||||
|
rating = float(m_rating.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
poster_url: str | None = None
|
||||||
|
for rgx in (_POSTER_JSONLD_RE, _POSTER_RE, _POSTER_OG_RE):
|
||||||
|
m = rgx.search(html)
|
||||||
|
if m:
|
||||||
|
candidate = m.group(1).strip()
|
||||||
|
if candidate and "blank.gif" not in candidate and "no-poster" not in candidate:
|
||||||
|
poster_url = candidate
|
||||||
|
break
|
||||||
|
|
||||||
|
# Performers — tylko sekcja "Pornstars" ma /pornstar/<slug>/ linki, dooplay
|
||||||
|
# filtruje cast w tej sekcji. Jaccard może łapać dubel ale dedup robimy w
|
||||||
|
# resolverze (po performer_id).
|
||||||
|
performers = [
|
||||||
|
RawPerformer(
|
||||||
|
external_id=f"{source_name}:{m.group(1)}",
|
||||||
|
name=_decode_html(m.group(2).strip()),
|
||||||
|
)
|
||||||
|
for m in _PORNSTAR_RE.finditer(html)
|
||||||
|
]
|
||||||
|
|
||||||
|
tags = [
|
||||||
|
RawTag(
|
||||||
|
external_id=f"{source_name}:{m.group(1)}",
|
||||||
|
name=_decode_html(m.group(2).strip()),
|
||||||
|
slug=m.group(1),
|
||||||
|
)
|
||||||
|
for m in _GENRE_TAG_RE.finditer(html)
|
||||||
|
]
|
||||||
|
|
||||||
|
if page_url is None:
|
||||||
|
page_url = f"{base_url}/movies/{slug}/"
|
||||||
|
|
||||||
|
# Playback sources: każdy host (Doodstream/Lulu/RPM/...) jako osobny entry.
|
||||||
|
# Dedup po href żeby ten sam host nie wpadł 2x. Raw landing page (origin=
|
||||||
|
# source_name, bez :host) appendujemy TYLKO gdy nie ma żadnych sub-hosters —
|
||||||
|
# inaczej myli usera (otwiera WebView z reklamami zamiast video; bug-report
|
||||||
|
# 2026-05-16: "mangoporn przekierowuje do strony, reklama full screen").
|
||||||
|
playback_sources: list[RawPlaybackSource] = []
|
||||||
|
seen_hrefs: set[str] = set()
|
||||||
|
|
||||||
|
# Hostery file-download (non-streamable) + malware. Mobile player nie potrafi
|
||||||
|
# ich odtworzyć — rapidgator/nitroflare/frdl serwują .zip/.rar/.mp4 do download
|
||||||
|
# (premium login required), streamtape ma malware drive-by .reg. Skipujemy
|
||||||
|
# przy ingest żeby nie zaśmiecać UI martwym contentem (bug-report 2026-05-18).
|
||||||
|
SKIP_HOSTERS = {"rapidgator", "nitroflare", "nitro", "frdl", "streamtape"}
|
||||||
|
|
||||||
|
def _emit_host_entry(href: str, source: str | None) -> None:
|
||||||
|
href = href.strip()
|
||||||
|
if not href or href in seen_hrefs:
|
||||||
|
return
|
||||||
|
seen_hrefs.add(href)
|
||||||
|
try:
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
host = urlparse(href).hostname or "unknown"
|
||||||
|
host_short = host.split(".")[-2] if host.count(".") >= 1 else host
|
||||||
|
except Exception:
|
||||||
|
host_short = "unknown"
|
||||||
|
if host_short.lower() in SKIP_HOSTERS:
|
||||||
|
return
|
||||||
|
playback_sources.append(
|
||||||
|
RawPlaybackSource(
|
||||||
|
origin=f"{source_name}:{host_short}",
|
||||||
|
page_url=href,
|
||||||
|
embed_url=source or href,
|
||||||
|
thumbnail_url=poster_url,
|
||||||
|
duration_sec=duration_sec,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Stary `<li class="hosts-buttons-wpx">` markup (mangoporn).
|
||||||
|
for m in _PLAYER_OPTION_RE.finditer(html):
|
||||||
|
_emit_host_entry(m.group("href") or "", (m.group("source") or "").strip() or None)
|
||||||
|
|
||||||
|
# Nowy `<div class="Rtable1-cell">` markup (pandamovies od ~2026-04 + nowe
|
||||||
|
# streamporn instances). data-fl-source jest opcjonalny — capturujemy CAŁY
|
||||||
|
# opening tag w group(1), data-fl-source extract z TEGO tagu (nie z window
|
||||||
|
# lookback po HTMLu, bo to mogło pickować poprzedni cell).
|
||||||
|
for m in _PLAYER_OPTION_DIV_RE.finditer(html):
|
||||||
|
href = m.group("href") or ""
|
||||||
|
opening_tag = m.group(1)
|
||||||
|
src_match = _DATA_FL_SOURCE_RE.search(opening_tag)
|
||||||
|
source = (src_match.group(1).strip() if src_match else "") or None
|
||||||
|
_emit_host_entry(href, source)
|
||||||
|
|
||||||
|
if not playback_sources:
|
||||||
|
# Brak sub-hosters znalezionych — fallback do landing page (mobile otworzy
|
||||||
|
# w WebView). Robimy to TYLKO gdy nie ma alternatyw, inaczej landing jest
|
||||||
|
# niepotrzebnym ad-pageiem.
|
||||||
|
playback_sources.append(
|
||||||
|
RawPlaybackSource(
|
||||||
|
origin=source_name,
|
||||||
|
page_url=page_url,
|
||||||
|
thumbnail_url=poster_url,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return RawMovie(
|
||||||
|
external_id=slug,
|
||||||
|
title=title,
|
||||||
|
description=description,
|
||||||
|
release_year=release_year,
|
||||||
|
release_date=release_date,
|
||||||
|
duration_sec=duration_sec,
|
||||||
|
rating=rating,
|
||||||
|
poster_url=poster_url,
|
||||||
|
url=page_url,
|
||||||
|
studio=studio,
|
||||||
|
performers=performers,
|
||||||
|
tags=tags,
|
||||||
|
playback_sources=playback_sources,
|
||||||
|
raw={"slug": slug, "html_len": len(html)},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---- per-site instances ----------------------------------------------------
|
||||||
|
|
||||||
|
class StreampornConnector(DooplayConnector):
    """DooplayConnector configured for the streamporn site."""

    # Source identifier for this site.
    name = "streamporn"
    # Root URL of the site (no trailing slash).
    base_url = "https://streamporn.nl"
|
||||||
|
|
||||||
|
|
||||||
|
class PandamoviesConnector(DooplayConnector):
    """DooplayConnector configured for the pandamovies site."""

    # Source identifier for this site.
    name = "pandamovies"
    # Root URL of the site (no trailing slash).
    base_url = "https://pandamovies.pw"
|
||||||
|
|
||||||
|
|
||||||
|
class MangopornConnector(DooplayConnector):
    """DooplayConnector configured for the mangoporn site."""

    # Source identifier for this site.
    name = "mangoporn"
    # Root URL of the site (no trailing slash).
    base_url = "https://mangoporn.net"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers (zduplikowane z paradisehill.py — celowo, żeby connectory były niezależne)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_TAG_RE = re.compile(r"<[^>]+>")
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_tags(s: str) -> str:
|
||||||
|
return _TAG_RE.sub("", s)
|
||||||
|
|
||||||
|
|
||||||
|
_HTML_ENTITIES = {
|
||||||
|
"&": "&", "<": "<", ">": ">", """: '"', "'": "'",
|
||||||
|
"'": "'", " ": " ", "’": "'", "‘": "'",
|
||||||
|
"”": '"', "“": '"', "…": "...", "—": "—", "–": "–",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_html(s: str) -> str:
|
||||||
|
for k, v in _HTML_ENTITIES.items():
|
||||||
|
s = s.replace(k, v)
|
||||||
|
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
|
||||||
|
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
|
||||||
|
return s
|
||||||
325
app/connectors/paradisehill.py
Normal file
325
app/connectors/paradisehill.py
Normal file
|
|
@ -0,0 +1,325 @@
|
||||||
|
"""Paradisehill connector — primary source dla movies (full-length adult films).
|
||||||
|
|
||||||
|
Site notes:
|
||||||
|
- Age-gate: wymagany cookie `is18=1` (POST /is18/ zwraca 400 z curla, ale samo dorzucenie
|
||||||
|
cookie do GET-a działa — site jest tolerancyjny).
|
||||||
|
- Listing: `/all/?sort=created_at&page=N` — paginacja po 28 filmów, mikro-data Schema.org Movie.
|
||||||
|
- Detail: `/<hex_id>/` — pełne meta + Video.js playlist (chaptery jako "Part 1/2/3").
|
||||||
|
|
||||||
|
Co ekstraktujemy:
|
||||||
|
- Schema.org microdata: name, description, director, datePublished (upload), image, thumbnailUrl
|
||||||
|
- Studio: link `/studio/<id>/{name}` (tylko link dostarcza nazwę i external_id)
|
||||||
|
- Genres: ze Schema.org `itemprop="genre"` (pierwszy = movie's main genre)
|
||||||
|
- Year: parsowany z description gdy obecny ("This 1999 film..."), bo `datePublished` to upload_date
|
||||||
|
- Chapters: liczba `<li>...Part N</li>` w playliście Video.js
|
||||||
|
- Playback: na MVP `page_url` only — Video.js playlist URL jest dynamicznie ładowany przez JS
|
||||||
|
i wymaga login session. Mobile może otworzyć page w WebView (degradacja lepsza niż brak).
|
||||||
|
|
||||||
|
External_id: hex slug z URL-a (np. `259448f6b75ee` z `/259448f6b75ee/`).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import UTC, date, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.connectors.base import (
|
||||||
|
BaseMovieConnector,
|
||||||
|
RawMovie,
|
||||||
|
RawMovieChapter,
|
||||||
|
RawPerformer,
|
||||||
|
RawPlaybackSource,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Site root; also used to absolutize image paths and to build page URLs.
BASE_URL = "https://paradisehill.cc"
# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
LISTING_PATH = "/all/"  # ?sort=created_at&page=N
SOURCE_NAME = "paradisehill"


# Microdata extraction — the Schema.org tags are stable and survive minor
# theme changes (the yii2 widget renders them invariantly).
_TITLE_RE = re.compile(
    r'<h1\s+class="title-inside"\s+itemprop="name">([^<]+)</h1>', re.IGNORECASE
)
_DIRECTOR_RE = re.compile(r'itemprop="director">([^<]+)</', re.IGNORECASE)
# Description text; may contain nested inline tags, which the caller strips.
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">([^<]+(?:<[^>]+>[^<]+)*)</span>', re.IGNORECASE | re.DOTALL
)
_DATE_PUBLISHED_RE = re.compile(
    r'itemprop="datePublished"\s+content="([^"]+)"', re.IGNORECASE
)
# Poster and thumbnail: site-relative "/images/..." paths.
_POSTER_RE = re.compile(
    r'<img\s+itemprop="image"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
_THUMBNAIL_RE = re.compile(
    r'<img\s+itemprop="thumbnailUrl"\s+src="(/images/[^"]+)"', re.IGNORECASE
)
# Group 1 = numeric studio id, group 2 = studio name (link text).
# NOTE(review): the pattern requires the href to end right after "/studio/<id>/";
# the module docstring describes the link as `/studio/<id>/{name}` — confirm
# against live markup that this still matches.
_STUDIO_LINK_RE = re.compile(r'<a\s+href="/studio/(\d+)/"[^>]*>([^<]+)</a>', re.IGNORECASE)
# Playlist entry: group 1 = data-index, group 2 = chapter label.
_CHAPTER_RE = re.compile(
    r'<a\s+href="#"\s+class="js-list-item"\s+data-index="(\d+)">([^<]+)</a>',
    re.IGNORECASE,
)
# Listing page item: group 1 = hex id of the movie.
_LIST_ITEM_RE = re.compile(
    r'<div\s+class="item\s+list-film-item"[^>]*>\s*'
    r'<a\s+href="/([0-9a-f]+)/"[^>]*>',
    re.IGNORECASE,
)
# Year in the description: a 4-digit year in a sensible range (1950–2039).
_YEAR_IN_DESC_RE = re.compile(r"\b(19[5-9]\d|20[0-3]\d)\b")
# Year in the title (e.g. "Title (1999)").
_YEAR_IN_TITLE_RE = re.compile(r"\((\d{4})\)")
|
||||||
|
|
||||||
|
|
||||||
|
class ParadisehillConnector(BaseMovieConnector):
    """Scraper connector that crawls paradisehill listing and detail pages."""

    kind = SourceKind.scraper
    name = SOURCE_NAME

    def __init__(self, *, timeout: float = 30.0):
        """Create the shared HTTP client.

        Args:
            timeout: Per-request timeout in seconds passed to ``httpx.Client``.
        """
        request_headers = {
            "User-Agent": USER_AGENT,
            # Every request needs the is18 cookie; pre-set it to bypass the age gate.
            "Cookie": "is18=1",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml",
        }
        self._client = httpx.Client(
            base_url=BASE_URL,
            timeout=timeout,
            follow_redirects=True,
            headers=request_headers,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def fetch_movies(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawMovie]:
        """Crawl the `/all/?sort=created_at` listing page by page (newest first).

        Args:
            since: Stop the crawl at the first movie whose ``release_date``
                (midnight UTC) is earlier than this timezone-aware moment.
            limit: Stop after yielding this many movies.

        Yields:
            One ``RawMovie`` per successfully fetched and parsed detail page.

        A failed listing request or an empty listing page ends the crawl; a
        failed or unparseable detail page is logged/skipped and the crawl
        continues with the next id.
        """
        emitted = 0
        page_no = 0
        while True:
            page_no += 1
            try:
                movie_ids = [*self._fetch_listing_page(page_no)]
            except httpx.HTTPError as exc:
                log.warning("paradisehill listing page=%d failed: %s", page_no, exc)
                return

            if not movie_ids:
                log.info("paradisehill: empty listing page=%d, stop", page_no)
                return

            for movie_id in movie_ids:
                try:
                    movie = self._fetch_detail(movie_id)
                except httpx.HTTPError as exc:
                    log.warning("paradisehill detail %s failed: %s", movie_id, exc)
                    continue
                if movie is None:
                    continue

                # The listing is chronological, so the first movie older than
                # `since` ends the crawl. `since` is timezone-aware (UTC), so
                # the date is promoted to an aware midnight before comparing.
                published = movie.release_date
                if since is not None and published is not None:
                    published_at = datetime.combine(
                        published, datetime.min.time(), tzinfo=UTC
                    )
                    if published_at < since:
                        log.info(
                            "paradisehill: hit since boundary at %s (%s), stop",
                            movie_id, published,
                        )
                        return

                yield movie
                emitted += 1
                if limit is not None and emitted >= limit:
                    return

    def _fetch_listing_page(self, page: int) -> Iterator[str]:
        """Yield the hex ids of the movies listed on the given page."""
        response = self._client.get(f"{LISTING_PATH}?sort=created_at&page={page}")
        response.raise_for_status()
        yield from (item.group(1) for item in _LIST_ITEM_RE.finditer(response.text))

    def _fetch_detail(self, hex_id: str) -> RawMovie | None:
        """Download one detail page and parse it; raises on HTTP errors."""
        response = self._client.get(f"/{hex_id}/")
        response.raise_for_status()
        return _parse_detail(hex_id, response.text)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_detail(hex_id: str, html: str) -> RawMovie | None:
    """Parse a detail page's HTML into a RawMovie.

    Args:
        hex_id: Hex slug of the movie; used as ``external_id`` and in the page URL.
        html: Full HTML of the detail page.

    Returns:
        The parsed movie, or None (after a warning) when no title heading is found.
    """
    title_match = _TITLE_RE.search(html)
    if title_match is None:
        log.warning("paradisehill: no title in detail %s", hex_id)
        return None
    title = _decode_html(title_match.group(1).strip())

    director: str | None = None
    if (director_match := _DIRECTOR_RE.search(html)) is not None:
        director = _decode_html(director_match.group(1).strip())
        # Placeholder values carry no information.
        if director.lower() in ("unknown", "n/a", "-"):
            director = None

    description: str | None = None
    if (description_match := _DESCRIPTION_RE.search(html)) is not None:
        description = _decode_html(_strip_tags(description_match.group(1)).strip())

    release_date: date | None = None
    if (published_match := _DATE_PUBLISHED_RE.search(html)) is not None:
        try:
            release_date = datetime.fromisoformat(published_match.group(1)).date()
        except ValueError:
            release_date = None

    # Year: title first, then description. datePublished is the site's upload
    # date (e.g. 2026-05), not the production year (e.g. 1999), so it is
    # useless for year filtering.
    year_match = _YEAR_IN_TITLE_RE.search(title)
    if year_match is None and description:
        year_match = _YEAR_IN_DESC_RE.search(description)
    release_year: int | None = int(year_match.group(1)) if year_match else None

    poster_match = _POSTER_RE.search(html)
    poster_url: str | None = BASE_URL + poster_match.group(1) if poster_match else None
    thumb_match = _THUMBNAIL_RE.search(html)
    backdrop_url: str | None = BASE_URL + thumb_match.group(1) if thumb_match else None

    studio: RawStudio | None = None
    if (studio_match := _STUDIO_LINK_RE.search(html)) is not None:
        studio = RawStudio(
            external_id=f"paradisehill:{studio_match.group(1)}",
            name=_decode_html(studio_match.group(2).strip()),
        )

    # Genres: only those inside the movie's own block-inside. Recommended
    # films carry itemprop="genre" too, so the search is limited to that block
    # (or to the first 8000 characters when the block is not found).
    movie_block = re.search(
        r'<div\s+class="block-inside"[^>]*itemtype="http://schema\.org/Movie"[^>]*>'
        r'(.*?)</div>\s*</div>\s*<div\s+class="similar',
        html,
        re.DOTALL,
    )
    genre_scope = movie_block.group(1) if movie_block else html[:8000]
    genre_names = [
        _decode_html(genre.group(1).strip())
        for genre in re.finditer(r'itemprop="genre"[^>]*>([^<]+)</', genre_scope, re.IGNORECASE)
    ]
    # Keep at most the first ten non-empty genre names.
    tags: list[RawTag] = [
        RawTag(name=genre_name, slug=_slugify(genre_name))
        for genre_name in [g for g in genre_names if g][:10]
    ]

    chapters: list[RawMovieChapter] = [
        RawMovieChapter(
            chapter_index=int(entry.group(1)),
            title=_decode_html(entry.group(2).strip()),
        )
        for entry in _CHAPTER_RE.finditer(html)
    ]

    page_url = f"{BASE_URL}/{hex_id}/"
    landing_source = RawPlaybackSource(
        origin=SOURCE_NAME,
        page_url=page_url,
        thumbnail_url=poster_url,
    )

    return RawMovie(
        external_id=hex_id,
        title=title,
        description=description,
        release_year=release_year,
        release_date=release_date,
        director=director,
        poster_url=poster_url,
        backdrop_url=backdrop_url,
        url=page_url,
        studio=studio,
        performers=[],  # Paradisehill rarely has cast links — filled in from mirrors.
        tags=tags,
        chapters=chapters,
        playback_sources=[landing_source],
        raw={"hex_id": hex_id, "html_len": len(html)},
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_TAG_RE = re.compile(r"<[^>]+>")
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_tags(s: str) -> str:
|
||||||
|
return _TAG_RE.sub("", s)
|
||||||
|
|
||||||
|
|
||||||
|
_HTML_ENTITIES = {
|
||||||
|
"&": "&",
|
||||||
|
"<": "<",
|
||||||
|
">": ">",
|
||||||
|
""": '"',
|
||||||
|
"'": "'",
|
||||||
|
"'": "'",
|
||||||
|
" ": " ",
|
||||||
|
"’": "'",
|
||||||
|
"‘": "'",
|
||||||
|
"”": '"',
|
||||||
|
"“": '"',
|
||||||
|
"…": "...",
|
||||||
|
"—": "—",
|
||||||
|
"–": "–",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_html(s: str) -> str:
|
||||||
|
for k, v in _HTML_ENTITIES.items():
|
||||||
|
s = s.replace(k, v)
|
||||||
|
# Numeric entities
|
||||||
|
s = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), s)
|
||||||
|
s = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), s)
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(s: str) -> str:
|
||||||
|
return _SLUG_RE.sub("-", s.lower()).strip("-") or "tag"
|
||||||
405
app/connectors/stashdb.py
Normal file
405
app/connectors/stashdb.py
Normal file
|
|
@ -0,0 +1,405 @@
|
||||||
|
"""StashDB GraphQL connector.
|
||||||
|
|
||||||
|
Endpoint: https://stashdb.org/graphql (auth: header `ApiKey: <key>`)
|
||||||
|
|
||||||
|
Query używamy `queryScenes(input: {sort, direction, page, per_page})`. StashDB nie udostępnia
|
||||||
|
typowego date-since filtra w SceneQueryInput, więc deltę robimy klient-side: sortujemy po
|
||||||
|
UPDATED_AT DESC i przerywamy gdy `updated < since`.
|
||||||
|
|
||||||
|
Schema fields kluczowe (wg https://github.com/stashapp/stash-box/blob/master/graphql/schema/schema.graphql):
|
||||||
|
Scene { id title details date duration director code urls{url site{name}}
|
||||||
|
studio{id name parent{id name}}
|
||||||
|
performers{ as performer{ id name aliases gender birthdate{date} country } }
|
||||||
|
tags{ id name }
|
||||||
|
fingerprints{ hash algorithm duration } }
|
||||||
|
|
||||||
|
Cross-reference do TPDB: `urls[].site.name` zwykle zawiera "ThePornDB" + URL z UUID
|
||||||
|
(format: https://theporndb.net/scenes/<uuid>). Wyciągamy ten UUID jako tpdb cross-ref;
|
||||||
|
ingest_orchestrator może go potem użyć do path 2 (cross-source UUID).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import UTC, date, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from tenacity import (
|
||||||
|
retry,
|
||||||
|
retry_if_exception_type,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential,
|
||||||
|
)
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.connectors.base import (
|
||||||
|
BaseConnector,
|
||||||
|
RawFingerprint,
|
||||||
|
RawPerformer,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# GraphQL document used for every scene listing request: takes a single
# SceneQueryInput variable and returns the total count plus, per scene, the
# core metadata, urls, studio (with parent), performers, tags and fingerprints.
SCENES_QUERY = """
query QScenes($input: SceneQueryInput!) {
queryScenes(input: $input) {
count
scenes {
id
title
details
release_date
date
duration
director
code
updated
urls { url site { name } }
studio {
id name
parent { id name }
}
performers {
as
performer {
id
name
aliases
gender
birthdate { date }
country
}
}
tags { id name }
fingerprints { hash algorithm duration }
}
}
}
"""

# UUID v4-ish pattern (relaxed): 8-4-4-4-12 hex digits, case-insensitive.
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
class _RetryableStatusError(httpx.HTTPStatusError):
    """HTTP status failure worth retrying (429 or 5xx).

    Subclasses ``httpx.HTTPStatusError`` so callers that catch the base type
    keep working once retries are exhausted.
    """


class StashDBConnector(BaseConnector):
    """Connector that pulls scenes from the StashDB GraphQL endpoint."""

    kind = SourceKind.stashdb
    name = "stashdb"

    def __init__(
        self,
        *,
        api_key: str | None = None,
        url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Resolve credentials and endpoint, falling back to application settings.

        Args:
            api_key: StashDB API key; defaults to ``settings.stashdb_api_key``.
            url: GraphQL endpoint; defaults to ``settings.stashdb_graphql_url``.
            per_page: Page size used when paginating scenes.
            timeout: Per-request timeout in seconds.

        Raises:
            RuntimeError: When no API key is available from either source.
        """
        settings = get_settings()
        self.api_key = api_key or settings.stashdb_api_key
        if not self.api_key:
            raise RuntimeError("STASHDB_API_KEY is not set")
        self.url = url or settings.stashdb_graphql_url
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a fresh HTTP client carrying the auth and JSON headers."""
        return httpx.Client(
            headers={
                "ApiKey": self.api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        # Only transient failures are retried: transport errors, 429 and 5xx.
        # Other HTTP status errors (e.g. 401/403/404/422) are permanent and
        # propagate immediately instead of burning five backoff attempts.
        retry=retry_if_exception_type((httpx.TransportError, _RetryableStatusError)),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _post(self, client: httpx.Client, payload: dict[str, Any]) -> dict[str, Any]:
        """POST a GraphQL payload and return the response's ``data`` object.

        Raises:
            httpx.HTTPStatusError: On a non-success HTTP status (after retries
                for 429/5xx).
            httpx.TransportError: On a transport failure (after retries).
            RuntimeError: When the GraphQL response carries ``errors``.
        """
        resp = client.post(self.url, json=payload)
        if resp.status_code == 429:
            raise _RetryableStatusError("rate limited", request=resp.request, response=resp)
        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            if resp.status_code >= 500:
                # Server-side failure: re-raise as the retryable subtype.
                raise _RetryableStatusError(
                    str(exc), request=exc.request, response=exc.response
                ) from exc
            raise
        body = resp.json()
        if errors := body.get("errors"):
            raise RuntimeError(f"stashdb graphql errors: {errors}")
        return body["data"]

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes ordered by UPDATED_AT descending.

        Args:
            since: Stop at the first scene whose ``updated`` is older than this.
            limit: Stop after yielding this many scenes.
        """
        yield from self._paginate(
            extra_input={"sort": "UPDATED_AT", "direction": "DESC"},
            since=since,
            limit=limit,
        )

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look up a performer id via ``queryPerformers(input: {name: <name>})``.

        Returns the id of the performer whose name equals *name*
        (case-insensitive, surrounding whitespace ignored) when one is among
        the results, otherwise the id of the first result. Returns None when
        the request fails or nothing is found.
        """
        query = (
            "query QPerformers($input: PerformerQueryInput!) {"
            "  queryPerformers(input: $input) { performers { id name } }"
            "}"
        )
        variables = {"input": {"name": name, "per_page": 5}}
        with self._client() as client:
            try:
                data = self._post(client, {"query": query, "variables": variables})
            except Exception as e:
                # Deliberate best-effort lookup: any failure means "not found".
                log.warning("stashdb queryPerformers name=%s failed: %s", name, e)
                return None
        performers = (data.get("queryPerformers") or {}).get("performers") or []
        if not performers:
            return None
        target = name.strip().lower()
        for p in performers:
            if (p.get("name") or "").strip().lower() == target:
                return p.get("id")
        return performers[0].get("id")

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the performer with the given canonical UUID.

        Filters with ``performers = {value: [uuid], modifier: "INCLUDES"}``
        (a MultiIDCriterionInput); with a single UUID this selects that
        performer's scenes. Results are ordered by DATE descending.
        """
        yield from self._paginate(
            extra_input={
                "performers": {
                    "value": [performer_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,  # performer-scoped pull takes the whole history
            limit=limit,
        )

    def fetch_scenes_for_studio(
        self,
        studio_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all StashDB scenes for the studio with the given canonical UUID.

        Same as ``fetch_scenes_for_performer`` but filters on ``studios``
        (also a MultiIDCriterionInput) instead of ``performers``.
        """
        yield from self._paginate(
            extra_input={
                "studios": {
                    "value": [studio_external_id],
                    "modifier": "INCLUDES",
                },
                "sort": "DATE",
                "direction": "DESC",
            },
            since=None,
            limit=limit,
        )

    def _paginate(
        self,
        *,
        extra_input: dict[str, Any],
        since: datetime | None,
        limit: int | None,
    ) -> Iterator[RawScene]:
        """Page through ``queryScenes`` and yield parsed scenes.

        Args:
            extra_input: Extra SceneQueryInput fields (sort, filters) merged
                into every page request.
            since: When given, iteration stops at the first scene updated
                before it.
            limit: When given, iteration stops after this many yielded scenes.

        Iteration also ends on an empty page or a page shorter than
        ``per_page``. Scenes that fail to parse are skipped.
        """
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                variables = {
                    "input": {
                        "page": page,
                        "per_page": self.per_page,
                        **extra_input,
                    }
                }
                data = self._post(client, {"query": SCENES_QUERY, "variables": variables})
                payload = data.get("queryScenes") or {}
                scenes = payload.get("scenes") or []
                if not scenes:
                    return

                for raw in scenes:
                    if since is not None and _updated_before(raw, since):
                        return
                    parsed = _parse_scene(raw)
                    if parsed is None:
                        continue
                    yield parsed
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return

                # A short page means there is nothing after it.
                if len(scenes) < self.per_page:
                    return
                page += 1
|
||||||
|
|
||||||
|
|
||||||
|
def _updated_before(raw: dict[str, Any], since: datetime) -> bool:
|
||||||
|
upd = raw.get("updated")
|
||||||
|
if not upd:
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
ts = datetime.fromisoformat(upd.replace("Z", "+00:00"))
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
if ts.tzinfo is None:
|
||||||
|
ts = ts.replace(tzinfo=UTC)
|
||||||
|
return ts < since
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date(value: Any) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
if isinstance(value, date):
|
||||||
|
return value
|
||||||
|
text = str(value).strip()
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(text[:10])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
    """Convert a scene's ``studio`` object into a RawStudio (None when absent).

    A missing studio name becomes ``"Unknown"``; parent id/name are taken from
    the nested ``parent`` object when there is one.
    """
    if not raw:
        return None
    parent_info = raw.get("parent") or {}
    studio_id = raw.get("id")
    studio_name = raw.get("name") or "Unknown"
    return RawStudio(
        external_id=studio_id,
        name=studio_name,
        slug=None,
        parent_external_id=parent_info.get("id"),
        parent_name=parent_info.get("name"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
    """Convert one scene ``performers[]`` entry into a RawPerformer.

    Returns None when the nested ``performer`` object has no name. The ``as``
    credit is kept only when it is non-empty and differs from the performer's
    name.
    """
    details = raw.get("performer") or {}
    canonical_name = details.get("name")
    if not canonical_name:
        return None

    # Aliases may arrive as a list or as one comma-separated string.
    alias_field = details.get("aliases") or []
    if isinstance(alias_field, str):
        alias_field = [piece.strip() for piece in alias_field.split(",") if piece.strip()]

    birth_info = details.get("birthdate") or {}
    birth_raw = birth_info.get("date") if isinstance(birth_info, dict) else None

    performer_id = details.get("id")
    alias_list = [alias for alias in alias_field if isinstance(alias, str)]
    gender = (details.get("gender") or "").lower() or None
    birth_date = _parse_date(birth_raw)
    country = details.get("country")

    credited_as = raw.get("as")
    if not credited_as or credited_as == canonical_name:
        credited_as = None

    return RawPerformer(
        external_id=performer_id,
        name=canonical_name,
        aliases=alias_list,
        gender=gender,
        birth_date=birth_date,
        country=country,
        as_alias_in_scene=credited_as,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
    """Convert a scene ``tags[]`` entry into a RawTag; None when it has no name."""
    tag_name = raw.get("name")
    if tag_name:
        return RawTag(external_id=raw.get("id"), name=tag_name, slug=None)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_fingerprint(raw: dict[str, Any]) -> RawFingerprint | None:
    """Convert a scene ``fingerprints[]`` entry into a RawFingerprint.

    Only entries with a hash and one of the algorithms phash / oshash / md5
    (matched case-insensitively) are accepted; everything else yields None.
    """
    digest = raw.get("hash")
    algorithm = (raw.get("algorithm") or "").lower()
    if digest and algorithm in {"phash", "oshash", "md5"}:
        return RawFingerprint(kind=algorithm, value=digest)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_cross_refs(urls: list[dict[str, Any]] | None) -> dict[str, str]:
    """Pull known cross-source ids (currently only ``tpdb``) out of ``scene.urls``.

    Returns:
        Mapping of source name to external id. A ThePornDB entry is recognised
        by its site name containing "theporndb" or its URL containing
        "porndb"; the id is the first UUID found in that URL. When several
        entries qualify, the last one wins.
    """
    refs: dict[str, str] = {}
    for entry in urls or []:
        link = entry.get("url") or ""
        site = entry.get("site") or {}
        site_label = (site.get("name") or "").strip().lower()
        # ThePornDB: .../scenes/<uuid>
        if link and ("theporndb" in site_label or "porndb" in link.lower()):
            if (hit := _UUID_RE.search(link)) is not None:
                refs["tpdb"] = hit.group(0)
    return refs
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert one ``queryScenes`` scene dict into a RawScene.

    Returns None (after a warning) when the scene lacks an id or a title.
    Performers, tags and fingerprints that fail to parse are dropped. The
    release date prefers ``release_date`` and falls back to ``date``.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not (scene_id and scene_title):
        log.warning("stashdb scene without id/title — skipping")
        return None

    performers = [
        performer
        for performer in map(_parse_performer, raw.get("performers") or [])
        if performer is not None
    ]
    tags = [
        tag
        for tag in map(_parse_tag, raw.get("tags") or [])
        if tag is not None
    ]
    fingerprints = [
        fingerprint
        for fingerprint in map(_parse_fingerprint, raw.get("fingerprints") or [])
        if fingerprint is not None
    ]

    cross_refs = _extract_cross_refs(raw.get("urls"))
    released = _parse_date(raw.get("release_date") or raw.get("date"))
    duration = raw.get("duration")

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("details"),
        release_date=released,
        duration_sec=int(duration) if duration else None,
        code=raw.get("code"),
        director=raw.get("director"),
        url=None,
        studio=_parse_studio(raw.get("studio")),
        performers=performers,
        tags=tags,
        fingerprints=fingerprints,
        cross_source_refs=cross_refs,
        raw=raw,
    )
|
||||||
329
app/connectors/tpdb.py
Normal file
329
app/connectors/tpdb.py
Normal file
|
|
@ -0,0 +1,329 @@
|
||||||
|
"""ThePornDB REST connector.
|
||||||
|
|
||||||
|
API: https://api.theporndb.net (auth: Bearer token)
|
||||||
|
Lista scen: GET /scenes?per_page=200&page=N&date={YYYY-MM-DD} (delta filter)
|
||||||
|
Format: {data: [...], meta: {current_page, last_page, per_page, total}}
|
||||||
|
|
||||||
|
Sceny TPDB zwracają już rozwiniętych performerów (`performers[]`), studio (`site`) i tagi (`tags[]`).
|
||||||
|
W związku z tym pojedyncze GET /scenes wystarcza do MVP — nie musimy uderzać oddzielnie po performera.
|
||||||
|
|
||||||
|
Format performera w scenie:
|
||||||
|
- performer.id — ID przypisania performer↔scene (NIE używać do dedup)
|
||||||
|
- performer.name — imię w tej konkretnej scenie (może być alias, np. „Mia M.")
|
||||||
|
- performer.parent.id — kanoniczne UUID performerki w TPDB → external_id
|
||||||
|
- performer.parent.name / .extra.gender / .extra.birthday — kanoniczne metadane
|
||||||
|
|
||||||
|
Format studia: scene.site = {id, name, slug, parent: {...}, network: {...}}
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from datetime import date, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from tenacity import (
|
||||||
|
retry,
|
||||||
|
retry_if_exception,
|
||||||
|
retry_if_exception_type,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_retryable_http_error(exc: BaseException) -> bool:
    """Tell tenacity whether a failed request is worth retrying.

    Retryable: transport-level errors, HTTP 429 and any HTTP 5xx.
    Not retryable: every other 4xx (e.g. 404/422 are permanent).

    401/403 are deliberately not retried here: an expired TPDB token would
    need an auth refresh instead (TODO if it starts happening). The token
    currently expires about once a year, so it is not worth the complexity.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        status = exc.response.status_code
        return status >= 500 or status == 429
    return isinstance(exc, httpx.TransportError)
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.connectors.base import (
|
||||||
|
BaseConnector,
|
||||||
|
RawPerformer,
|
||||||
|
RawScene,
|
||||||
|
RawStudio,
|
||||||
|
RawTag,
|
||||||
|
)
|
||||||
|
from app.models.source import SourceKind
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TPDBConnector(BaseConnector):
    """Synchronous connector for the ThePornDB REST API (Bearer-token auth).

    List endpoints answer with ``{data: [...], meta: {...}}``; scenes are
    parsed into ``RawScene`` objects and yielded lazily, page by page.
    """

    kind = SourceKind.tpdb
    name = "tpdb"

    def __init__(
        self,
        *,
        token: str | None = None,
        base_url: str | None = None,
        per_page: int = 100,
        timeout: float = 30.0,
    ) -> None:
        """Store connection parameters, falling back to application settings.

        Args:
            token: API token; defaults to ``settings.tpdb_api_token``.
            base_url: API root; defaults to ``settings.tpdb_base_url``.
            per_page: page size sent with every list request.
            timeout: per-request timeout passed to ``httpx.Client``.

        Raises:
            RuntimeError: when no token is given and none is configured.
        """
        settings = get_settings()
        self.token = token or settings.tpdb_api_token
        if not self.token:
            raise RuntimeError("TPDB_API_TOKEN is not set")
        # Strip trailing slashes so paths such as "/scenes" join cleanly.
        self.base_url = (base_url or settings.tpdb_base_url).rstrip("/")
        self.per_page = per_page
        self.timeout = timeout

    def _client(self) -> httpx.Client:
        """Build a new ``httpx.Client`` carrying the auth and JSON headers."""
        return httpx.Client(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.token}",
                "Accept": "application/json",
                "User-Agent": "goon/0.1",
            },
            timeout=self.timeout,
        )

    @retry(
        retry=retry_if_exception(_is_retryable_http_error),
        wait=wait_exponential(multiplier=1, min=2, max=30),
        stop=stop_after_attempt(5),
        reraise=True,
    )
    def _get(self, client: httpx.Client, path: str, params: dict[str, Any]) -> dict[str, Any]:
        """GET ``path`` and return the decoded JSON body.

        Retried by tenacity (up to 5 attempts, exponential wait) for the
        errors accepted by ``_is_retryable_http_error``; the last error is
        re-raised unchanged.

        Raises:
            httpx.HTTPStatusError: for any 4xx/5xx response.
        """
        resp = client.get(path, params=params)
        if resp.status_code == 429:
            # Let tenacity retry — but raise something it knows.
            raise httpx.HTTPStatusError("rate limited", request=resp.request, response=resp)
        resp.raise_for_status()
        return resp.json()

    def fetch_scenes(
        self,
        *,
        since: datetime | None = None,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield scenes from ``GET /scenes``.

        Args:
            since: when given, its calendar date is sent as the ``date``
                query parameter (delta filter).
            limit: stop after this many scenes have been yielded.
        """
        params: dict[str, Any] = {"per_page": self.per_page}
        if since is not None:
            params["date"] = since.date().isoformat()

        yield from self._paginate_scenes(params, limit=limit)

    def fetch_scenes_for_performer(
        self,
        performer_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all TPDB scenes of the performer with the given canonical ID.

        Uses the dedicated endpoint ``GET /performers/<id>/scenes``. (Other
        variants are broken: ``/scenes?performers[]=<uuid>`` always returns
        total=0 and ``/scenes?performer_id=<uuid>`` answers 422.)

        A 404 means the performer was removed from TPDB. It used to
        propagate and fail the whole scheduler run; now it is logged as a
        warning and the generator simply ends, so the caller sees zero
        scenes and moves on to the next performer. Any other HTTP error is
        re-raised.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/performers/{performer_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb performer %s removed (404) — skipping",
                    performer_external_id,
                )
                return
            raise

    def fetch_scenes_for_site(
        self,
        site_external_id: str,
        *,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Yield all TPDB scenes of the site/studio with the given ID.

        Uses the dedicated endpoint ``GET /sites/<id>/scenes``, analogous to
        ``/performers/<id>/scenes``.

        A 404 means the site was removed from TPDB and is handled exactly
        like in ``fetch_scenes_for_performer``: warning + empty result.
        """
        try:
            yield from self._paginate_scenes(
                {"per_page": self.per_page},
                limit=limit,
                path=f"/sites/{site_external_id}/scenes",
            )
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                log.warning(
                    "tpdb site %s removed (404) — skipping",
                    site_external_id,
                )
                return
            raise

    def find_performer_id_by_name(self, name: str) -> str | None:
        """Look a performer up via ``GET /performers?q=<name>``.

        An exact (case-insensitive, whitespace-trimmed) name match is
        preferred; otherwise the first result is used. Returns ``None`` when
        the request fails with an HTTP status error, when there are no
        results, or when the chosen result has no usable ``id``.
        """
        with self._client() as client:
            try:
                payload = self._get(client, "/performers", {"q": name, "per_page": 5})
            except httpx.HTTPStatusError as e:
                log.warning("tpdb /performers q=%s failed: %s", name, e)
                return None

        data = payload.get("data") or []
        if not data:
            return None

        # Normalize the query once instead of on every loop iteration.
        wanted = name.strip().lower()
        for item in data:
            if (item.get("name") or "").strip().lower() == wanted:
                return str(item.get("id")) if item.get("id") else None

        first = data[0]
        return str(first.get("id")) if first.get("id") else None

    def _paginate_scenes(
        self,
        params: dict[str, Any],
        *,
        limit: int | None,
        path: str = "/scenes",
    ) -> Iterator[RawScene]:
        """Walk the pages of a scene list endpoint and yield parsed scenes.

        Stops when a page is empty, when ``meta.last_page`` is reached (a
        missing ``last_page`` is treated as "this is the last page"), or
        once ``limit`` scenes have been yielded. Scenes that
        ``_parse_scene`` rejects are skipped and do not count towards
        ``limit``.
        """
        # Work on a copy so the caller's dict is never mutated with "page".
        query = dict(params)
        emitted = 0
        page = 1
        with self._client() as client:
            while True:
                query["page"] = page
                payload = self._get(client, path, query)
                data = payload.get("data") or []
                if not data:
                    return
                for raw in data:
                    scene = _parse_scene(raw)
                    if scene is None:
                        continue
                    yield scene
                    emitted += 1
                    if limit is not None and emitted >= limit:
                        return

                meta = payload.get("meta") or {}
                last_page = meta.get("last_page") or page
                if page >= last_page:
                    return
                page += 1
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date(value: Any) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
if isinstance(value, date):
|
||||||
|
return value
|
||||||
|
text = str(value).strip()
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
# TPDB dates: "YYYY-MM-DD" lub ISO datetime
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(text[:10])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_studio(raw: dict[str, Any] | None) -> RawStudio | None:
    """Map a TPDB ``site`` object to ``RawStudio``; ``None`` for empty input.

    A studio without a name is stored as ``"Unknown"``. ``short_name`` wins
    over ``slug`` and ``url`` wins over ``home`` when both are present.
    """
    if not raw:
        return None

    parent = raw.get("parent") or {}
    network = raw.get("network") or {}

    studio_id = raw.get("id")
    parent_id = parent.get("id")
    # "network" is only read when it is a mapping.
    network_name = network.get("name") if isinstance(network, dict) else None

    return RawStudio(
        external_id=None if studio_id is None else str(studio_id),
        name=raw.get("name") or "Unknown",
        slug=raw.get("short_name") or raw.get("slug"),
        parent_external_id=None if parent_id is None else str(parent_id),
        parent_name=parent.get("name"),
        network=network_name,
        homepage_url=raw.get("url") or raw.get("home"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_performer(raw: dict[str, Any]) -> RawPerformer | None:
    """Map a performer entry embedded in a TPDB scene to ``RawPerformer``.

    Canonical data is taken from ``raw["parent"]`` when present, falling
    back to the scene-level entry. Returns ``None`` when no name can be
    determined. The scene-level name is kept as ``as_alias_in_scene`` only
    when it differs from the canonical name.
    """
    canonical = raw.get("parent") or {}
    extra = canonical.get("extras") or canonical.get("extra") or {}

    performer_id = canonical.get("id") or raw.get("id")
    performer_name = canonical.get("name") or raw.get("name")
    if not performer_name:
        return None

    # Aliases arrive either as a comma-separated string or as a list.
    raw_aliases = canonical.get("aliases") or extra.get("aliases") or []
    if isinstance(raw_aliases, str):
        pieces = (piece.strip() for piece in raw_aliases.split(","))
        aliases = [piece for piece in pieces if piece]
    else:
        aliases = [entry for entry in raw_aliases if isinstance(entry, str)]

    gender = (extra.get("gender") or canonical.get("gender") or "").lower() or None
    born = _parse_date(extra.get("birthday"))
    country = extra.get("birthplace") or extra.get("country")

    scene_name = raw.get("name")
    credited_as = None if scene_name == performer_name else scene_name

    return RawPerformer(
        external_id=None if performer_id is None else str(performer_id),
        name=performer_name,
        aliases=aliases,
        gender=gender,
        birth_date=born,
        country=country,
        as_alias_in_scene=credited_as,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tag(raw: dict[str, Any]) -> RawTag | None:
    """Map a TPDB tag object to ``RawTag``; ``None`` when it has no name."""
    tag_name = raw.get("name")
    if not tag_name:
        return None

    tag_id = raw.get("id")
    return RawTag(
        external_id=None if tag_id is None else str(tag_id),
        name=tag_name,
        slug=raw.get("slug"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_scene(raw: dict[str, Any]) -> RawScene | None:
    """Convert one raw TPDB scene payload into a ``RawScene``.

    Returns ``None`` (after logging a warning that lists the first few
    payload keys) when the scene has no ``id`` or no ``title``. Performers
    and tags that their parsers reject are dropped.
    """
    scene_id = raw.get("id")
    scene_title = raw.get("title")
    if not (scene_id and scene_title):
        log.warning("tpdb scene without id/title — skipping (keys=%s)", list(raw)[:8])
        return None

    performer_candidates = (_parse_performer(item) for item in raw.get("performers") or [])
    performers: list[RawPerformer] = [item for item in performer_candidates if item is not None]

    tag_candidates = (_parse_tag(item) for item in raw.get("tags") or [])
    tags: list[RawTag] = [item for item in tag_candidates if item is not None]

    released = _parse_date(raw.get("date"))
    duration = int(raw["duration"]) if raw.get("duration") else None

    return RawScene(
        external_id=str(scene_id),
        title=scene_title,
        description=raw.get("description"),
        release_date=released,
        duration_sec=duration,
        # The payload's own "external_id" field is stored as the scene code.
        code=raw.get("external_id"),
        director=raw.get("director"),
        url=raw.get("url"),
        studio=_parse_studio(raw.get("site")),
        performers=performers,
        tags=tags,
        fingerprints=[],  # TPDB does not publish pHashes in the main endpoint
        raw=raw,
    )
|
||||||
35
app/db.py
Normal file
35
app/db.py
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.orm import Session, sessionmaker
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
|
||||||
|
_settings = get_settings()
|
||||||
|
|
||||||
|
engine = create_engine(
|
||||||
|
_settings.database_url,
|
||||||
|
pool_pre_ping=True,
|
||||||
|
future=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
SessionLocal = sessionmaker(bind=engine, autoflush=False, expire_on_commit=False, future=True)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def session_scope() -> Iterator[Session]:
    """Provide a transactional scope around a series of operations.

    Yields a new session from ``SessionLocal``. On normal exit the session
    is committed; if the body (or the commit itself) raises an ``Exception``
    the session is rolled back and the error re-raised. The session is
    closed in every case.
    """
    db = SessionLocal()
    try:
        yield db
        # Commit stays inside the try so a failing commit is rolled back too.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_session() -> Iterator[Session]:
    """Generator wrapper around ``session_scope``.

    Yields one session and lets ``session_scope`` commit, roll back and
    close it when the generator is resumed or closed.
    NOTE(review): presumably consumed as a web-framework dependency — confirm.
    """
    with session_scope() as db:
        yield db
|
||||||
157
app/extractors/__init__.py
Normal file
157
app/extractors/__init__.py
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
"""Stream URL extractors per-tube.
|
||||||
|
|
||||||
|
Public API:
|
||||||
|
- `try_extract(sitetag, page_url) -> list[StreamSource] | None`
|
||||||
|
- `StreamSource` (dataclass)
|
||||||
|
- `HosterDead` (exception)
|
||||||
|
- `extract_stream_from_hoster(iframe_url, *, referer)` — generic packer-based hoster extract
|
||||||
|
- `fetch_tube_html(url)` — Chrome TLS fingerprint fetch (curl_cffi)
|
||||||
|
- `browser_get(url)` — low-level
|
||||||
|
|
||||||
|
Architektura: każdy tube ma osobny moduł `app.extractors.tubes.<tube>` który eksportuje
|
||||||
|
`extract(page_url) -> list[StreamSource] | None`. Registry niżej mapuje sitetag →
|
||||||
|
modułowy extractor. `try_extract()` to thin wrapper z exception handlingiem.
|
||||||
|
|
||||||
|
Po removalu porn-app dependency, ten moduł jest jedynym mechanizmem rozwiązywania
|
||||||
|
streamów — playback.py nie wpada już do porn-app /stream API.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
from app.extractors._fetch import browser_get, fetch_tube_html
|
||||||
|
from app.extractors._models import HosterDead, StreamSource, TubePageError
|
||||||
|
from app.extractors.hoster import extract_stream_from_hoster, unpack_packer
|
||||||
|
from app.extractors.tubes import (
|
||||||
|
_embed_iframe,
|
||||||
|
_vps_blocked_fallback,
|
||||||
|
_ytdlp,
|
||||||
|
eporner,
|
||||||
|
freshporno,
|
||||||
|
hqporner,
|
||||||
|
latestpornvideo,
|
||||||
|
paradisehill,
|
||||||
|
porn00,
|
||||||
|
pornhat,
|
||||||
|
pornxp,
|
||||||
|
sxyprn,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Sitetag → extractor function. Sitetag pasuje do format'u z origin: `pornapp:<sitetag>`
|
||||||
|
# (lub po Fazie 2 migracji: `tube:<sitetag>`).
|
||||||
|
#
|
||||||
|
# Mainstream tubes (pornhub/xvideos/xnxx/xhamster/redtube/youporn/porntrex) używają
|
||||||
|
# yt-dlp jako extractor — battle-tested, aktualizowane przez upstream przy zmianach
|
||||||
|
# HTML. Aggregator tubes (xmoviesforyou/watchporn/siska/...) używają generic
|
||||||
|
# embed-iframe extractor (page → /e/<id> iframe → P.A.C.K.E.R. unpack). Custom kod
|
||||||
|
# tylko tam gdzie tube ma niestandardowy schemat (eporner XHR, sxyprn URL transform).
|
||||||
|
_REGISTRY: dict[str, Callable[[str], list[StreamSource] | None]] = {
|
||||||
|
# Custom (zoptymalizowane / niestandardowy player)
|
||||||
|
# hqporner — CDN URL (bigcdn.cc, video.flyflv.com z `ip=` parametrem) IP-bound do
|
||||||
|
# requestera. VPS resolve daje 200 ale mobile direct = 404/403. Switch na WebView
|
||||||
|
# fallback: mobile pobiera embed iframe (mydaddy.cc/hqwo.cc) z phone IP, FluidPlayer
|
||||||
|
# JS decoduje mp4 URL z mobile session. Plus INJECTED_JS skanuje `<source>.src`.
|
||||||
|
# ~32k scen (drugi po porntrex największy single saving). Verified 2026-05-18.
|
||||||
|
"hqpornercom": _vps_blocked_fallback.extract,
|
||||||
|
"epornercom": eporner.extract,
|
||||||
|
"sxyprncom": sxyprn.extract,
|
||||||
|
# Mainstream tubes — yt-dlp
|
||||||
|
# NB: 2026-05-18 cross-IP test potwierdził że xvideos/xnxx/pornhub/youporn/redtube
|
||||||
|
# CDN URLs są **time-bound** (nie IP-bound) — mobile_direct_ok auto-detect w
|
||||||
|
# playback.py daje mobile direct fetch, zero VPS bandwidth.
|
||||||
|
"pornhubcom": _ytdlp.extract,
|
||||||
|
"redtubecom": _ytdlp.extract,
|
||||||
|
"xvideoscom": _ytdlp.extract,
|
||||||
|
"xnxxcom": _ytdlp.extract,
|
||||||
|
"youporncom": _ytdlp.extract,
|
||||||
|
# porntrex KVS get_file — `kt_ips=<vps_ip>` cookie + single-use token (410 po reuse).
|
||||||
|
# CDN IP-bound do VPS, mobile direct = 403. Switch na _vps_blocked_fallback:
|
||||||
|
# mobile WebView z phone IP → KVS player JS dekoduje video.src → INJECTED_JS scrape.
|
||||||
|
# 137k scen oszczędzone z VPS bandwidth (largest single saving).
|
||||||
|
"porntrexcom": _vps_blocked_fallback.extract,
|
||||||
|
# VPS-blocked tubes — KVS / Cloudflare blokuje Hetzner IP, ale działają z residential
|
||||||
|
# IP (potwierdzone Chrome DevTools MCP 2026-05-15). Mobile WebView + INJECTED_JS
|
||||||
|
# (PlayerScreen.tsx:805) skanuje <video>.src + XHR — łapie URL po decode-ie player JS.
|
||||||
|
"xhamstercom": _vps_blocked_fallback.extract,
|
||||||
|
"porndittcom": _vps_blocked_fallback.extract,
|
||||||
|
"fpoxxx": _vps_blocked_fallback.extract,
|
||||||
|
"sxylandcom": _vps_blocked_fallback.extract,
|
||||||
|
# Aggregator tubes — generic embed-iframe → hoster unpacker
|
||||||
|
"latestpornvideocom": latestpornvideo.extract,
|
||||||
|
"xmoviesforyoucom": _embed_iframe.extract,
|
||||||
|
"watchporn": _embed_iframe.extract,
|
||||||
|
"siskavideo": _embed_iframe.extract,
|
||||||
|
"porn4dayspw": _embed_iframe.extract,
|
||||||
|
"porndishcom": _embed_iframe.extract,
|
||||||
|
# xxxfreewatch — DELISTED 2026-05-18. 790 solo-orphan scen, 0% match, CF-walled z VPS.
|
||||||
|
"latestleaksco": _embed_iframe.extract,
|
||||||
|
"mypornerleakcom": _embed_iframe.extract,
|
||||||
|
# PornHat — dedicated extractor: tylko `<source>` z player area (skip sidebar
|
||||||
|
# trailer URLs `_preview*.mp4`), dedupe po filename. Get_file 302 → CDN, proxy
|
||||||
|
# follow_redirects=True wymagane (fix w stream_proxy.py).
|
||||||
|
"pornhatcom": pornhat.extract,
|
||||||
|
# Freshporno KVS — `cv=` HMAC signed token IP-bound. Server-side resolve dało
|
||||||
|
# 200 z VPS, ale laptop dostał 302+SSL error → token validate'uje requester IP.
|
||||||
|
# Switch na WebView fallback: mobile pobiera embed page, KVS player decoduje
|
||||||
|
# video_url w-page, ExoPlayer dostaje URL z phone session. ~15k scen.
|
||||||
|
"freshpornoorg": _vps_blocked_fallback.extract,
|
||||||
|
# porn00 / pornxp — force_proxy=True wprost (IP-bound CDN). Switch na WebView
|
||||||
|
# fallback. Niski volume (84 scen), trivial saving ale konsystencja flow.
|
||||||
|
"porn00org": _vps_blocked_fallback.extract,
|
||||||
|
"pornxpph": _vps_blocked_fallback.extract,
|
||||||
|
# Direct-scraping tubes (mają też search scraper w connectors/direct_scrapers/)
|
||||||
|
# — używają identycznego embed-iframe pattern dla streamingu.
|
||||||
|
# hdporn92com — DELISTED 2026-05-18. Scene pages to SEO shell bez player iframe,
|
||||||
|
# JS hijackuje kliki na popunder. Wszystkie playback_sources mass-marked dead.
|
||||||
|
# 0dayxx wraps watchporn.to embed. watchporn.to/get_file/ token IP-bound (302→410
|
||||||
|
# cross-IP). Switch na WebView fallback. ~5k scen.
|
||||||
|
"0dayxxcom": _vps_blocked_fallback.extract,
|
||||||
|
# CF-protected tube — curl_cffi w fetch_tube_html bypassa JA3, embed-iframe pattern.
|
||||||
|
"perverzijacom": _embed_iframe.extract,
|
||||||
|
# Special: WebView-only (Yii2 session-bound player).
|
||||||
|
"paradisehillcc": paradisehill.extract,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def try_extract(sitetag: str, page_url: str) -> list[StreamSource] | None:
    """Try to resolve stream URLs for the given tube and page URL.

    Returns the list of ``StreamSource`` entries produced by the registered
    extractor, or ``None`` when:
    - no extractor is registered for ``sitetag``,
    - the extractor returned ``None`` / found no URL,
    - the extractor failed with an unexpected exception (logged as a warning).

    ``HosterDead`` and ``TubePageError`` are deliberately propagated so the
    caller can react to them (e.g. mark the playback source as dead).
    """
    resolve = _REGISTRY.get(sitetag)
    if resolve is not None:
        try:
            return resolve(page_url)
        except (HosterDead, TubePageError):
            raise
        except Exception as exc:
            log.warning("extractor for %s failed on %s: %s", sitetag, page_url, exc)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def supported_sitetags() -> tuple[str, ...]:
    """Return the sitetags that have a registered extractor, in registry order."""
    return tuple(_REGISTRY)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"try_extract",
|
||||||
|
"supported_sitetags",
|
||||||
|
"StreamSource",
|
||||||
|
"HosterDead",
|
||||||
|
"TubePageError",
|
||||||
|
"extract_stream_from_hoster",
|
||||||
|
"unpack_packer",
|
||||||
|
"fetch_tube_html",
|
||||||
|
"browser_get",
|
||||||
|
]
|
||||||
120
app/extractors/_fetch.py
Normal file
120
app/extractors/_fetch.py
Normal file
|
|
@ -0,0 +1,120 @@
|
||||||
|
"""Browser-impersonation HTTP fetcher dla tube'ów blokujących Pythonowy TLS fingerprint.
|
||||||
|
|
||||||
|
Niektóre Cloudflare-fronted tube'y (np. perverzija) blokują httpx na podstawie JA3
|
||||||
|
TLS hash (charakterystycznego dla Pythonowego stacka), zwracając 403 nawet z dobrym
|
||||||
|
UA + Referer. `curl_cffi` używa libcurl + skompilowanej wersji TLS lib z prawdziwego
|
||||||
|
Chrome'a, dzięki czemu ja3 hash jest identyczny jak browser → CF wpuszcza.
|
||||||
|
|
||||||
|
Fallback na httpx tylko gdy curl_cffi nie zainstalowany (zachowujemy backwards-compat
|
||||||
|
w razie problemów z buildem libcurl-impersonate).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections.abc import Mapping
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.extractors._models import TubePageError
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from curl_cffi import requests as _cf_requests # type: ignore[import-not-found]
|
||||||
|
_HAS_CURL_CFFI = True
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
_HAS_CURL_CFFI = False
|
||||||
|
log.warning("curl_cffi not installed — fallback to httpx (CF-protected tubes will fail)")
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_IMPERSONATE = "chrome120"
|
||||||
|
_DEFAULT_UA = (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FetchResult:
    """Minimal response-like object — a stand-in for ``httpx.Response`` in our use cases."""

    status_code: int
    text: str
    url: str

    def raise_for_status(self) -> None:
        """Raise ``TubePageError`` when the status code is a 4xx or 5xx."""
        if self.status_code < 400 or self.status_code >= 600:
            return
        raise TubePageError(self.status_code, self.url)
|
||||||
|
|
||||||
|
|
||||||
|
def browser_get(
    url: str,
    *,
    headers: Mapping[str, str] | None = None,
    timeout: float = 60.0,
    follow_redirects: bool = True,
    impersonate: str = _DEFAULT_IMPERSONATE,
) -> FetchResult:
    """GET ``url`` with a Chrome TLS fingerprint (curl_cffi).

    Falls back to a plain ``httpx`` request when curl_cffi is not installed;
    ``impersonate`` is ignored in that case. The response is returned as a
    ``FetchResult`` holding the status code, body text and final URL.
    """
    request_headers = dict(headers or {})

    if _HAS_CURL_CFFI:
        response = _cf_requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            impersonate=impersonate,
            allow_redirects=follow_redirects,
        )
        return FetchResult(
            status_code=response.status_code,
            text=response.text,
            url=str(response.url),
        )

    with httpx.Client(timeout=timeout, follow_redirects=follow_redirects) as http:
        response = http.get(url, headers=request_headers)
        return FetchResult(
            status_code=response.status_code,
            text=response.text,
            url=str(response.url),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tube_html(url: str, *, timeout: float = 60.0, max_retries: int = 2) -> str:
    """Fetch a tube page's HTML with a Chrome UA, retrying transient failures.

    Goes through ``browser_get`` (curl_cffi when available) to get past JA3
    fingerprint blocks on Cloudflare-fronted tubes.

    Retried up to ``max_retries`` times, sleeping ``0.5s * attempt number``
    (0.5s, 1.0s, ...) between attempts:
    - any exception raised by ``browser_get``,
    - a 5xx response,
    - a 200 response whose body is shorter than 500 characters.
    Without the retry, sites that occasionally answer 503/empty made the
    extractor fail on a transient hiccup.

    Raises:
        TubePageError: when the final response has a status >= 400.
        Exception: whatever ``browser_get`` raised on the last attempt.
    """
    import time as _time

    total_attempts = max_retries + 1
    request_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "x-site": urlparse(url).hostname or "",
    }

    last_err: Exception | None = None
    for attempt in range(total_attempts):
        attempt_no = attempt + 1
        can_retry = attempt < max_retries
        try:
            resp = browser_get(url, headers=request_headers, timeout=timeout, follow_redirects=True)
        except Exception as e:
            last_err = e
            log.info("fetch_tube_html attempt %d/%d for %s: %s", attempt_no, total_attempts, url, e)
            if not can_retry:
                raise
            _time.sleep(0.5 * attempt_no)
            continue

        status = resp.status_code
        # Transient: 5xx server error, or a suspiciously short 200 body.
        transient = 500 <= status < 600 or (status == 200 and len(resp.text) < 500)
        if transient and can_retry:
            log.info("fetch_tube_html %s attempt %d/%d: status=%d len=%d — retry",
                     url, attempt_no, total_attempts, status, len(resp.text))
            _time.sleep(0.5 * attempt_no)
            continue

        if status >= 400:
            raise TubePageError(status, url)
        return resp.text

    # Only reachable when the loop body never ran (negative max_retries).
    if last_err:
        raise last_err
    raise TubePageError(0, url)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["browser_get", "fetch_tube_html", "FetchResult", "_DEFAULT_UA"]
|
||||||
48
app/extractors/_models.py
Normal file
48
app/extractors/_models.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
"""Stream source DTO + wspólne wyjątki extractorów."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class StreamSource:
    """A single resolved stream URL.

    Maps onto ``StreamLink`` in the playback API (api/playback.py):
    ``link`` → ``stream_url``, ``quality`` → ``quality``, ``type`` → ``type``.

    ``referer`` is an optional override for the Referer header used by the
    stream proxy. Some CDNs (KVS-style watchporn.to, fpo.xxx, ...) answer
    410/403 when the Referer does not match the *embed page* (e.g. the proxy
    sends ``Referer: 0dayxx.com`` but the CDN expects
    ``Referer: watchporn.to``). When it is ``None`` the caller (playback.py)
    uses ``page_url``.
    """

    link: str
    quality: str | None = None
    type: str | None = None  # one of: 'mp4', 'm3u8', 'mpd', 'hoster'
    raw: dict[str, Any] | None = None
    referer: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class HosterDead(Exception):
    """The hoster's embed page says the video was deleted or does not exist.

    The caller in playback.py catches this and sets
    ``playback_source.dead_at``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class TubePageError(Exception):
    """Fetching a tube page returned an HTTP error (404/410/5xx).

    The caller (playback.py) may mark the source as dead on 404/410.
    ``status_code`` and ``url`` are kept as attributes so callers do not
    have to parse the message string.
    """

    def __init__(self, status_code: int, url: str):
        self.status_code = status_code
        self.url = url
        super().__init__(f"HTTP {status_code} for {url}")
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["StreamSource", "HosterDead", "TubePageError"]
|
||||||
91
app/extractors/duration_extract.py
Normal file
91
app/extractors/duration_extract.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
"""Universal duration extractor for tube pages.
|
||||||
|
|
||||||
|
Direct scrapery (xvideos, xnxx, youporn, porntrex, …) są search-only — pobierają
|
||||||
|
listing i wycioskują tylko URL + slug-as-title. Duration pojawia się dopiero na
|
||||||
|
detail page i jest dostępne w jednym z patternów:
|
||||||
|
|
||||||
|
1. **OpenGraph numeric** (youporn, redtube, eporner):
|
||||||
|
`<meta property="og:video:duration" content="992">` — sekundy.
|
||||||
|
2. **OpenGraph ISO 8601** (rzadkie):
|
||||||
|
`<meta property="og:video:duration" content="PT16M32S">`.
|
||||||
|
3. **Schema.org VideoObject LD-JSON** (xvideos, xnxx, KVS-based):
|
||||||
|
`"duration": "PT00H07M10S"` w JSON-LD `<script type="application/ld+json">`.
|
||||||
|
4. **itemprop microdata** (sxyland, 0dayxx, niektóre WordPress):
|
||||||
|
`<meta itemprop="duration" content="P0DT0H21M13S">` — ISO 8601 z opcjonalnym
|
||||||
|
`P<days>D` prefix + opcjonalnym `T` blokiem HMS.
|
||||||
|
|
||||||
|
Funkcja zwraca pierwszy znaleziony match jako int seconds, lub None.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
_OG_DURATION_RE = re.compile(
|
||||||
|
r'<meta\s+property="(?:og:(?:video:)?|video:)duration"\s+content="([^"]+)"',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_LD_DURATION_RE = re.compile(r'"duration"\s*:\s*"(P[0-9DTHMS]+)"', re.IGNORECASE)
|
||||||
|
_ITEMPROP_DURATION_RE = re.compile(
|
||||||
|
r'itemprop="duration"[^>]*content="([^"]+)"', re.IGNORECASE
|
||||||
|
)
|
||||||
|
# Hqporner-style meta description: "Video duration is 6min 55sec" lub "1h 23min 5sec".
|
||||||
|
# Generic — pasuje też do innych tube'ów które dorzucają w meta opis duration prozą.
|
||||||
|
_META_DESC_DURATION_RE = re.compile(
|
||||||
|
r'(?:duration\s+is\s+|<meta\s+name="description"\s+content="[^"]*duration\s+is\s+)'
|
||||||
|
r'(?:(\d+)\s*h(?:our)?s?)?\s*(?:(\d+)\s*min)?\s*(?:(\d+)\s*sec)?',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
# Generalized ISO 8601: P[<n>D][T[<n>H][<n>M][<n>S]]. Pokrywa `PT16M32S`,
|
||||||
|
# `PT00H07M10S`, `P0DT0H21M13S` jednocześnie. Dni są rzadko sensowne (>24h scena),
|
||||||
|
# ale zachowujemy bo niektóre tube'y wpisują P0D dla porządku.
|
||||||
|
_ISO_DURATION_RE = re.compile(
|
||||||
|
r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$", re.IGNORECASE
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso8601(value: str) -> int | None:
|
||||||
|
"""`P0DT0H21M13S` → 1273, `PT00H07M10S` → 430. None gdy format niepasujący
|
||||||
|
LUB total == 0 (sygnał placeholder bez duration)."""
|
||||||
|
m = _ISO_DURATION_RE.match(value.strip())
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
d, h, mi, s = (int(g) if g else 0 for g in m.groups())
|
||||||
|
total = d * 86400 + h * 3600 + mi * 60 + s
|
||||||
|
return total if total > 0 else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_duration_sec(html: str) -> int | None:
|
||||||
|
"""Zwraca duration w sekundach lub None gdy żaden wzorzec nie pasuje.
|
||||||
|
|
||||||
|
Kolejność: OG numeric → OG ISO → LD-JSON ISO → itemprop ISO. Pierwsze pasujące
|
||||||
|
z `total > 0` wygrywa.
|
||||||
|
"""
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if (m := _OG_DURATION_RE.search(html)):
|
||||||
|
v = m.group(1).strip()
|
||||||
|
if v.isdigit():
|
||||||
|
n = int(v)
|
||||||
|
if n > 0:
|
||||||
|
return n
|
||||||
|
if v.upper().startswith("P") and (parsed := _parse_iso8601(v)) is not None:
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
if (m := _LD_DURATION_RE.search(html)):
|
||||||
|
if (parsed := _parse_iso8601(m.group(1))) is not None:
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
if (m := _ITEMPROP_DURATION_RE.search(html)):
|
||||||
|
v = m.group(1).strip()
|
||||||
|
if v.upper().startswith("P") and (parsed := _parse_iso8601(v)) is not None:
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
# Hqporner: "Video duration is 6min 55sec" w meta description.
|
||||||
|
if (m := _META_DESC_DURATION_RE.search(html)):
|
||||||
|
h, mi, s = (int(g) if g else 0 for g in m.groups())
|
||||||
|
total = h * 3600 + mi * 60 + s
|
||||||
|
if total > 0:
|
||||||
|
return total
|
||||||
|
|
||||||
|
return None
|
||||||
343
app/extractors/hoster.py
Normal file
343
app/extractors/hoster.py
Normal file
|
|
@ -0,0 +1,343 @@
|
||||||
|
"""Generic hoster (StreamWish/doodporn/mixdrop/filemoon/luluvdo) stream URL extractor.
|
||||||
|
|
||||||
|
Hostery embed-page'y stosują JWPlayer + P.A.C.K.E.R. obfuskację:
|
||||||
|
eval(function(p,a,c,k,e,d){...}('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'),...))
|
||||||
|
i chowają `sources: [{file: "https://...m3u8"}]` w packed JS.
|
||||||
|
|
||||||
|
Tu jest:
|
||||||
|
- `unpack_packer(js)` — dekoder P.A.C.K.E.R.
|
||||||
|
- `extract_stream_from_hoster(iframe_url, *, referer)` — fetch embed → unpack → m3u8/mp4
|
||||||
|
|
||||||
|
Te funkcje są używane przez:
|
||||||
|
1. Per-tube extractors (latestpornvideo, hqporner fallback) — page → embed iframe → tu
|
||||||
|
2. Movies playback (api/playback.py movies_router) — direct hoster URL → tu
|
||||||
|
|
||||||
|
Nie ma już zależności od PornAppClient / porn-app API.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.extractors._fetch import _DEFAULT_UA, browser_get
|
||||||
|
from app.extractors._models import HosterDead
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# P.A.C.K.E.R. javascript unpacker — odwraca obfuskację wzorca:
|
||||||
|
# eval(function(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(...);return p}
|
||||||
|
# ('PAYLOAD', BASE, COUNT, 'kw1|kw2|...'.split('|'), 0, {}))
|
||||||
|
# StreamWish, doodporn, mixdrop, filemoon — wszystkie używają tego packera do schowania
|
||||||
|
# `sources: [{file: "https://...m3u8"}]` w JWPlayer config.
|
||||||
|
_PACKER_ARGS_RE = re.compile(
|
||||||
|
r"\}\s*\(\s*'((?:\\'|[^'])+)'\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*'((?:\\'|[^'])*)'\s*\.split\('\|'\)",
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _base_n(token: str, base: int) -> int | None:
|
||||||
|
"""Parsuje token jako liczbę w bazie 'base' (max 62 dla a-zA-Z0-9)."""
|
||||||
|
try:
|
||||||
|
result = 0
|
||||||
|
for ch in token:
|
||||||
|
if ch.isdigit():
|
||||||
|
d = ord(ch) - ord("0")
|
||||||
|
elif "a" <= ch <= "z":
|
||||||
|
d = ord(ch) - ord("a") + 10
|
||||||
|
elif "A" <= ch <= "Z":
|
||||||
|
d = ord(ch) - ord("A") + 36
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
if d >= base:
|
||||||
|
return None
|
||||||
|
result = result * base + d
|
||||||
|
return result
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def unpack_packer(js: str) -> str | None:
|
||||||
|
"""Unpack P.A.C.K.E.R. obfuscated JS. Zwraca None gdy wzorca nie ma."""
|
||||||
|
m = _PACKER_ARGS_RE.search(js)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
payload, base_str, count_str, kw_str = m.groups()
|
||||||
|
base = int(base_str)
|
||||||
|
count = int(count_str)
|
||||||
|
keywords = kw_str.split("|")
|
||||||
|
payload = payload.replace("\\'", "'").replace('\\"', '"').replace("\\\\", "\\")
|
||||||
|
|
||||||
|
def replace_token(match: re.Match[str]) -> str:
|
||||||
|
token = match.group(0)
|
||||||
|
idx = _base_n(token, base)
|
||||||
|
if idx is None or idx >= count or idx >= len(keywords):
|
||||||
|
return token
|
||||||
|
kw = keywords[idx]
|
||||||
|
return kw if kw else token
|
||||||
|
|
||||||
|
return re.sub(r"\b\w+\b", replace_token, payload)
|
||||||
|
|
||||||
|
|
||||||
|
_HOSTER_FILE_RE = re.compile(
|
||||||
|
r'(?:["\']?file["\']?|sources?)\s*[:=]\s*["\'](https?://[^"\']+\.(?:m3u8|mp4|mpd)[^"\']*)["\']',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Ad-rolls embedded w player config (xtremestream.xyz, niektóre KVS forki).
|
||||||
|
# Bez filtra extractor wracał preroll.mp4 jako "scena" → user widział 20s reklamy
|
||||||
|
# zamiast filmu (zgłoszone 2026-05-10, bug-report #30c4d3cf perverzija).
|
||||||
|
# Pattern obejmuje typowe nazwy ad-rolli + CDN-y które serwują reklamy
|
||||||
|
# (opencdn.b-cdn.net to bunnycdn alias dla reklam).
|
||||||
|
_AD_VIDEO_RE = re.compile(
|
||||||
|
r"/(?:preroll|midroll|postroll|preplay|ads?|advert|promo)\d*\.(?:mp4|m3u8|webm)"
|
||||||
|
r"|opencdn\.b-cdn\.net/video/(?:pre|mid|post|ad)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_ad(url: str) -> bool:
|
||||||
|
return bool(_AD_VIDEO_RE.search(url))
|
||||||
|
|
||||||
|
# Niektóre hostery (doodporn) chowają mp4/m3u8 w słowniku zmiennych i odwołują się do
|
||||||
|
# nich w `sources: [{file: links.hls2}]`. Wtedy regex powyżej nie złapie. Drugi pass
|
||||||
|
# bierze pierwszy `.m3u8|.mp4|.mpd` URL z całego unpacked HTML — heurystyka, ale
|
||||||
|
# pierwszy taki URL to zwykle master playlist video.
|
||||||
|
_HOSTER_FALLBACK_URL_RE = re.compile(
|
||||||
|
r'https?://[^\s"\'<>]+\.(?:m3u8|mp4|mpd)(?:\?[^\s"\'<>]*)?',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Sygnatury "video not found" / "deleted" które hostery wstawiają w HTML embed page.
|
||||||
|
# Gdy widzimy te markery, to wiemy że link jest martwy — raise HosterDead, caller w
|
||||||
|
# playback.py oznaczy playback_source.dead_at.
|
||||||
|
_HOSTER_DEAD_PATTERNS = (
|
||||||
|
"Video not found",
|
||||||
|
"video not found",
|
||||||
|
"Video Not Found",
|
||||||
|
"File was deleted",
|
||||||
|
"video is deleted",
|
||||||
|
"Video is deleted",
|
||||||
|
"This video is no longer available",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# KVS (Kernel Video Sharing) player markers — kt_player.js + license_code w HTML.
|
||||||
|
# Używają go fpo.xxx, 0day.kim, hdporn92, sxyland, i wiele innych WordPress-based
|
||||||
|
# tubes. KVS encryptuje URL `function/0/<encrypted>` license_code'em — regex fallback
|
||||||
|
# (`_HOSTER_FALLBACK_URL_RE`) złapie zamiast tego URL `event_reporting2` (tracking
|
||||||
|
# pixel zwracający 1×1 GIF zamiast video). Jak widzimy markery KVS, idziemy od razu
|
||||||
|
# do yt-dlp którego generic extractor poprawnie deszyfruje URL.
|
||||||
|
_KVS_MARKERS = ("kt_player(", "license_code")
|
||||||
|
|
||||||
|
|
||||||
|
# File hosters / known dead — rapidgator/nitroflare/frdl wymagają premium account
|
||||||
|
# (zwracają HTML z formularzem logowania zamiast video). Zwróć None bez fetch'u —
|
||||||
|
# caller w movies playback dorzuci embed-only fallback i mobile i tak otworzy
|
||||||
|
# WebView (gdzie user może zalogować się premium jeśli chce).
|
||||||
|
# Streamtape USUNIĘTY z blacklistu 2026-05-15 — ma dedicated extractor (innerHTML
|
||||||
|
# substring decode → /get_video → 302 → tapecontent.net mp4). Większość 12k URLów
|
||||||
|
# w naszej DB jest DMCA-dead ale ~5% żyje.
|
||||||
|
_FILE_HOSTER_RE = re.compile(
|
||||||
|
r"(?:rapidgator|nitroflare|filer\.net|frdl\.[a-z]+|"
|
||||||
|
r"streamcrypt\.net|"
|
||||||
|
r"openload\.co|openload\.io|oload\.[a-z]+)", # openload offline od 2019
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_stream_from_hoster(
    iframe_url: str,
    *,
    referer: str,
    timeout: float = 60.0,
) -> str | None:
    """Resolve a hoster embed page to a playable video URL.

    Flow: blacklist check → dedicated per-hoster extractors (mixdrop,
    streamtape, seekplayer engine, voe) → generic path (fetch the embed HTML,
    look for a structural `file`/`sources` match, unpack P.A.C.K.E.R. JS,
    fall back to the first .m3u8/.mp4/.mpd URL, finally yt-dlp).

    Args:
        iframe_url: URL of the hoster embed page.
        referer: Value sent as the `Referer` header on the generic fetch.
        timeout: Timeout forwarded to the fetch helpers and to yt-dlp.

    Returns:
        The first non-ad video URL found, or None when nothing could be
        extracted (including blacklisted hosters and fetch failures).

    Raises:
        HosterDead: when the embed page contains one of the
            `_HOSTER_DEAD_PATTERNS` markers.
    """
    if _FILE_HOSTER_RE.search(iframe_url) is not None:
        log.debug("hoster %s: file-hoster blacklist (premium-walled), skipping", iframe_url)
        return None

    # --- Dedicated per-hoster extractors (specific URL shapes / decoders) ---

    # Mixdrop: P.A.C.K.E.R. → MDCore.wurl, a protocol-relative
    # `//host/v2/<id>.mp4?s=...` URL that the generic `https?://...\.mp4`
    # fallback regex misses (no scheme).
    if re.search(r"(?:mixdrop|m1xdrop|mxdrop)\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import mixdrop

        mixdrop_sources = mixdrop.extract(iframe_url, timeout=timeout)
        if mixdrop_sources:
            return mixdrop_sources[0].link
        # An empty result deliberately falls through to the logic below.

    # Streamtape: 4 `document.getElementById(...).innerHTML = prefix +
    # (...).substring(N)` assignments, 2 of them DECOYS with a broken hostname.
    # The dedicated decoder picks the right one and builds the
    # `/get_video?id=...&token=...` URL.
    if re.search(r"streamtape\.[a-z]+/", iframe_url, re.IGNORECASE):
        from app.extractors.hosters import streamtape

        tape_sources = streamtape.extract(iframe_url, timeout=timeout)
        # No generic fallback here: streamtape does its own HosterDead
        # handling and the generic path would misbehave on its pages.
        return tape_sources[0].link if tape_sources else None

    # Shared SPA + AES-CBC engine: embedseek/seekplayer/rpmplay/upns/
    # player4me/easyvidplayer all use the same backend (`/api/v1/video` with an
    # AES-CBC encrypted m3u8 source). Together ~159k playback sources in the DB.
    from app.extractors.hosters import seekplayer_engine

    if seekplayer_engine.matches(iframe_url):
        engine_sources = seekplayer_engine.extract(iframe_url, timeout=timeout)
        return engine_sources[0].link if engine_sources else None

    # voe.sx: JS redirect to a random mirror → custom 7-step decoder
    # (ROT13 → strip 7 magic seps → atob → -3 shift → reverse → atob →
    # JSON.parse) → HLS m3u8 + mp4 fallback. ~21k movies.
    is_voe = re.search(
        r"//(?:voe\.sx|"
        r"rebeccasciencestreet\.[a-z]+|"
        r"darnobedienceupscale\.[a-z]+|"
        r"[a-z]+upscale\.com|[a-z]+street\.com)/",
        iframe_url,
        re.IGNORECASE,
    )
    if is_voe:
        from app.extractors.hosters import voe

        voe_sources = voe.extract(iframe_url, timeout=timeout)
        return voe_sources[0].link if voe_sources else None

    # --- Generic path ---

    request_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": referer,
    }
    try:
        response = browser_get(
            iframe_url, headers=request_headers, timeout=timeout, follow_redirects=True
        )
        response.raise_for_status()
    except Exception as exc:
        log.warning("hoster fetch %s failed: %s", iframe_url, exc)
        return None
    page = response.text

    for dead_marker in _HOSTER_DEAD_PATTERNS:
        if dead_marker in page:
            raise HosterDead(f"hoster {iframe_url} reports video deleted/not found")

    def pick_clean(pattern: re.Pattern[str], text: str, group: int = 1) -> str | None:
        """Return the first match of *pattern* that is not a preroll/ad URL."""
        hits = (found.group(group) for found in pattern.finditer(text))
        return next((hit for hit in hits if not _looks_like_ad(hit)), None)

    # 1) Direct match in the raw HTML (hoster did not obfuscate).
    direct = pick_clean(_HOSTER_FILE_RE, page, 1)
    if direct:
        return direct

    # KVS player → go straight to yt-dlp to avoid the regex fallback, which
    # would catch the gif-trap URL `event_reporting2`. yt-dlp's generic
    # extractor decrypts `function/0/<enc>` with the license code and returns
    # the real `get_file/<N>/...mp4` URL.
    if all(marker in page for marker in _KVS_MARKERS):
        kvs_url = _try_ytdlp_hoster(iframe_url, timeout=timeout)
        if kvs_url and not _looks_like_ad(kvs_url):
            return kvs_url
        log.warning("hoster %s: KVS markers but yt-dlp failed", iframe_url)
        return None

    # 2) Unpack P.A.C.K.E.R. → match on the unpacked text, structurally first,
    #    then the first m3u8/mp4 anywhere in it.
    # 3) Fallback on the raw HTML (the URL may live outside the packer).
    unpacked = unpack_packer(page)
    attempts: list[tuple[re.Pattern[str], str, int]] = []
    if unpacked:
        attempts.append((_HOSTER_FILE_RE, unpacked, 1))
        attempts.append((_HOSTER_FALLBACK_URL_RE, unpacked, 0))
    attempts.append((_HOSTER_FALLBACK_URL_RE, page, 0))
    for pattern, text, group in attempts:
        candidate = pick_clean(pattern, text, group)
        if candidate:
            return candidate

    # 4) yt-dlp as last resort — battle-tested extractors for streamtape,
    #    dood, mixdrop, filemoon, voe, vidoza, etc. Not used by default
    #    (slow + lots of HTTP), only when our own methods have failed.
    last_resort = _try_ytdlp_hoster(iframe_url, timeout=timeout)
    if last_resort:
        return last_resort

    log.warning(
        "hoster %s: no video URL in embed (packer unpack=%s, yt-dlp fail)",
        iframe_url,
        unpacked is not None,
    )
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _try_ytdlp_hoster(iframe_url: str, *, timeout: float) -> str | None:
    """Ask yt-dlp for a stream URL when our own extraction came up empty.

    yt-dlp ships extractors for the popular hosters (streamtape, dood,
    mixdrop, filemoon, voe, vidoza, streamwish, ...) that handle multi-step
    AJAX, token rotation and regex unpacking per hoster.

    Failure handling is catch-all: a missing yt-dlp install, a missing
    extractor, a timeout, an anti-bot block or a format change all yield None,
    and the caller drops to the hoster fallback (mobile WebView).

    Args:
        iframe_url: Hoster page URL handed to yt-dlp.
        timeout: Truncated to an int and used as yt-dlp's `socket_timeout`.

    Returns:
        The first format URL that looks like a video and not an ad, or None.
    """
    try:
        from yt_dlp import YoutubeDL
    except ImportError:
        return None

    options = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
        "socket_timeout": int(timeout),
    }
    try:
        with YoutubeDL(options) as ydl:
            info = ydl.extract_info(iframe_url, download=False)
    except Exception as exc:
        log.debug("yt-dlp hoster fallback failed for %s: %s", iframe_url, type(exc).__name__)
        return None

    if info is None:
        return None

    video_markers = (".m3u8", ".mp4", ".mpd", ".webm", ".ts")

    def is_playable(candidate: str | None) -> bool:
        # Standard video formats only. yt-dlp's generic extractor sometimes
        # returns the page URL as info["url"] when it did not recognise a
        # stream (e.g. the xtremestream.xyz player without KVS markers).
        # Without this check the extractor returned the iframe URL as the
        # "stream"; mobile tried to play it through ExoPlayer and got a
        # "fake video" or an error (reported 2026-05-10 #30c4d3cf perverzija).
        if not candidate or _looks_like_ad(candidate):
            return False
        lowered = candidate.lower()
        return any(marker in lowered for marker in video_markers)

    # Candidates in priority order: every entry of `formats` (or the info dict
    # itself for single-format extractors), then the top-level URL.
    # NOTE(review): this returns the FIRST acceptable entry of `formats`;
    # confirm against the yt-dlp docs whether that list is ordered best-first
    # or worst-first.
    entries = info.get("formats") or [info]
    candidates = [entry.get("url") for entry in entries if isinstance(entry, dict)]
    candidates.append(info.get("url"))
    return next((candidate for candidate in candidates if is_playable(candidate)), None)


__all__ = ["extract_stream_from_hoster", "unpack_packer", "HosterDead"]
|
||||||
6
app/extractors/hosters/__init__.py
Normal file
6
app/extractors/hosters/__init__.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
"""Per-hoster dedicated extractors (mixdrop, voe, luluvid, etc.).
|
||||||
|
|
||||||
|
Dispatched z `app.extractors.hoster.extract_stream_from_hoster` na podstawie
|
||||||
|
URL hostname. Każdy moduł exportuje `extract(iframe_url, *, timeout)` → list[StreamSource]
|
||||||
|
lub None.
|
||||||
|
"""
|
||||||
82
app/extractors/hosters/mixdrop.py
Normal file
82
app/extractors/hosters/mixdrop.py
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
"""Mixdrop embed hoster — P.A.C.K.E.R. eval → MDCore.wurl direct mp4.
|
||||||
|
|
||||||
|
Pattern (verified 2026-05-15 via curl_cffi impersonate=chrome120):
|
||||||
|
1. Fetch `https://mixdrop.my/e/<id>` → 200 z 95KB body, redirect 301 do
|
||||||
|
`https://m1xdrop.bz/e/<id>` (current TLD).
|
||||||
|
2. Body zawiera P.A.C.K.E.R. obfuscated JS block:
|
||||||
|
`eval(function(p,a,c,k,e,d){...}('...packed...',N,N,'...|...'.split('|'),0,{}))`
|
||||||
|
3. yt-dlp's `decode_packed_codes()` rozkrywa do ~390 chars JavaScript:
|
||||||
|
`MDCore.wurl="//a-delivery22.mxcontent.net/v2/<id>.mp4?s=<sig>&e=<exp>&_t=<ts>"`
|
||||||
|
4. URL na `mxcontent.net` zwraca **direct mp4** (Content-Type: video/mp4,
|
||||||
|
Content-Length: ~485MB) — działa z Hetzner VPS IP, brak token IP-bind.
|
||||||
|
|
||||||
|
`s` to signed token (HMAC?), `e` to expiry timestamp (unix sec), `_t` to
|
||||||
|
issued timestamp. Token jest valid ~24h od `_t`. Refetching embed page po
|
||||||
|
expiry zwraca nowy URL.
|
||||||
|
|
||||||
|
Active mango movies: 203 playbacks origin='mangoporn:mixdrop' w DB.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.extractors._fetch import browser_get
|
||||||
|
from app.extractors._models import StreamSource
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# The whole packed `eval(function(p,a,c,k,e,d){...}(...))` call.
_PACKER_RE = re.compile(
    r"eval\(function\(p,a,c,k,e,d\)\{.+?\}\(.+?\)\)",
    re.DOTALL,
)
# `MDCore.wurl="...mp4..."` assignment inside the decoded script; group 1 is the URL.
_MP4_URL_RE = re.compile(r'MDCore\.wurl\s*=\s*"([^"]+\.mp4[^"]*)"')


def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Extract the direct mp4 source from a mixdrop embed page.

    Args:
        page_url: Mixdrop embed page URL.
        timeout: Timeout forwarded to `browser_get`.

    Returns:
        A one-element list with the mp4 `StreamSource`, or None when the page
        could not be fetched, holds no packed block, could not be decoded, or
        the decoded script has no `MDCore.wurl` assignment.
    """
    response = browser_get(page_url, timeout=timeout)
    if not (response.status_code == 200 and response.text):
        log.info("mixdrop: fetch fail status=%s url=%s", response.status_code, page_url)
        return None

    packed_block = _PACKER_RE.search(response.text)
    if packed_block is None:
        log.info("mixdrop: no P.A.C.K.E.R. block in %s (page changed?)", page_url)
        return None

    # The import sits inside the try so that a missing yt-dlp is handled the
    # same way as a decode failure.
    try:
        from yt_dlp.utils import decode_packed_codes

        unpacked_js = decode_packed_codes(packed_block.group(0))
    except Exception as exc:
        log.warning("mixdrop: decode_packed_codes failed: %s", exc)
        return None

    wurl = _MP4_URL_RE.search(unpacked_js)
    if wurl is None:
        log.info("mixdrop: no MDCore.wurl in decoded payload (len=%d)", len(unpacked_js))
        return None

    stream_url = wurl.group(1)
    # Mixdrop URLs are often protocol-relative (`//a-delivery22...`).
    if stream_url.startswith("//"):
        stream_url = f"https:{stream_url}"

    # The mxcontent CDN requires **same-session cookies** from the embed page
    # plus a Chrome JA3 fingerprint. The session used here is gone once this
    # function returns, so a mobile client requesting the mp4 without cookies
    # gets 403. The proxy MUST re-fetch the embed in a fresh curl_cffi
    # session, extract a new mp4 URL and stream it.
    # `refetch_url` in raw → token field `rf` → proxy refresh logic.
    proxy_hints = {
        "proxy_impersonate": True,
        "refetch_url": page_url,  # embed page to re-extract from
        "refetch_hoster": "mixdrop",
    }
    source = StreamSource(
        link=stream_url,
        quality=None,  # mixdrop does not list quality variants in MDCore
        type="mp4",
        referer="https://mixdrop.my/",
        raw=proxy_hints,
    )
    return [source]
|
||||||
153
app/extractors/hosters/seekplayer_engine.py
Normal file
153
app/extractors/hosters/seekplayer_engine.py
Normal file
|
|
@ -0,0 +1,153 @@
|
||||||
|
"""Common engine extractor for: embedseek, seekplayer, rpmplay, upns, player4me, easyvidplayer.
|
||||||
|
|
||||||
|
Wszyscy używają tego samego silnika (Vite-built React SPA + AES-CBC encrypted API
|
||||||
|
+ HLS-based streaming). Hostname domains different ale shared backend.
|
||||||
|
|
||||||
|
Pattern (verified 2026-05-15 z residential PL + VPS Hetzner FI):
|
||||||
|
|
||||||
|
1. Embed URL = `https://<sub>.<host>.<tld>/#<hash_id>` — hash fragment to video ID.
|
||||||
|
SPA shell `Loading...` body load'uje `/assets/index-<n>.js` bundle.
|
||||||
|
|
||||||
|
2. JS fetcha `/api/v1/video?id=<hash_id>&w=<W>&h=<H>&r=` (W,H z window.screen).
|
||||||
|
Response: hex-encoded AES-CBC(key=`kiemtienmua911ca`, iv=`1234567890oiuytr`)
|
||||||
|
ciphertext, ~5KB. PKCS7 padded.
|
||||||
|
|
||||||
|
3. Plaintext JSON zawiera:
|
||||||
|
- `source`: signed m3u8 URL na CDN edge IP (np. `185.237.107.146/v4/<sig>/<exp>/ty/<hash>/master.m3u8?v=...`)
|
||||||
|
- `cf`: Cloudflare-fronted fallback URL (.txt z listą m3u8 paths)
|
||||||
|
- `metric.ipAddress`: IP visitora (signed token IP-bound do tego IP)
|
||||||
|
- `metric.cfDomain`: CF domain dla fallback
|
||||||
|
- `title`, `poster`, `thumbnail`, ...
|
||||||
|
|
||||||
|
4. `source` URL jest signed z visitor IP. Z VPS fetch zwraca master.m3u8 z signed
|
||||||
|
token tied to VPS IP — proxy fetcha segments z tym samym tokenem, działa.
|
||||||
|
CDN port 443 z `verify=False` (self-signed IP cert).
|
||||||
|
|
||||||
|
5. Wszystkie hostery share te same wartości KEY/IV. Wewnętrzna obfuskacja JS
|
||||||
|
maskuje to lookupem `ue(773)`, `ue(686)` itp. — derived bytes są zawsze
|
||||||
|
identyczne dla każdej domeny.
|
||||||
|
|
||||||
|
Hostery covered (origin counts w DB, 2026-05-15):
|
||||||
|
- embedseek (20271), seekplayer (20271) — mirror sites, dzielą hash_id
|
||||||
|
- rpmplay (15317)
|
||||||
|
- upns (14287)
|
||||||
|
- player4me (41040)
|
||||||
|
- easyvidplayer (47588)
|
||||||
|
|
||||||
|
Razem ~159k playback sources.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from cryptography.hazmat.primitives import padding
|
||||||
|
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||||
|
|
||||||
|
from app.extractors._fetch import _DEFAULT_UA, browser_get
|
||||||
|
from app.extractors._models import HosterDead, StreamSource
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_KEY = b"kiemtienmua911ca"
|
||||||
|
_IV = b"1234567890oiuytr"
|
||||||
|
|
||||||
|
# Hostname matching: 6 base hosts × subdomains × TLD variants.
|
||||||
|
# Examples:
|
||||||
|
# my.embedseek.online, vip.seekplayer.vip, my.rpmplay.online,
|
||||||
|
# my.upns.online, vip.player4me.vip, p.easyvidplayer.com
|
||||||
|
_HOST_RE = re.compile(
|
||||||
|
r"^(?:[a-z0-9]+\.)?(?:embedseek|seekplayer|rpmplay|upns|player4me|easyvidplayer)\."
|
||||||
|
r"(?:online|vip|com|net|io|me|tv)$",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def matches(url: str) -> bool:
|
||||||
|
try:
|
||||||
|
host = urlparse(url).hostname or ""
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
return bool(_HOST_RE.match(host))
|
||||||
|
|
||||||
|
|
||||||
|
def _decrypt(hex_str: str) -> str:
    """Decrypt a hex-encoded AES-CBC ciphertext with the module key and IV.

    Steps: hex → bytes, AES-CBC decrypt with `_KEY`/`_IV`, strip PKCS7
    padding (128-bit block), decode as UTF-8 with undecodable bytes replaced.
    Errors from hex decoding, decryption or unpadding propagate to the caller.
    """
    ciphertext = bytes.fromhex(hex_str)
    decryptor = Cipher(algorithms.AES(_KEY), modes.CBC(_IV)).decryptor()
    padded = decryptor.update(ciphertext) + decryptor.finalize()
    pkcs7 = padding.PKCS7(128).unpadder()
    plaintext = pkcs7.update(padded) + pkcs7.finalize()
    return plaintext.decode("utf-8", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Extract the HLS source for a shared-engine hoster embed URL.

    The video id is taken from the URL fragment (`#<id>`) or, failing that,
    from the `id` query parameter. The hoster's `/api/v1/video` endpoint is
    queried, its response decrypted with `_decrypt` and parsed as JSON.

    Args:
        page_url: Embed page URL on one of the hosts accepted by `_HOST_RE`.
        timeout: Timeout forwarded to `browser_get`.

    Returns:
        A one-element list with the m3u8 `StreamSource`, or None when the host
        is not supported, no id is present, the API call fails, the response
        cannot be decrypted or parsed, the payload is not a JSON object, or it
        carries no usable `source`.

    Raises:
        HosterDead: on HTTP 404/410 from the API, or when the decrypted
            payload contains a truthy `error` field.
    """
    parsed = urlparse(page_url)
    if not parsed.hostname or not _HOST_RE.match(parsed.hostname):
        return None
    # hash_id lives in the `#<id>` fragment; when the client passed the URL
    # without `#` (e.g. after nav.replace), also try the `?id=` query param.
    hash_id = parsed.fragment.strip()
    if not hash_id and parsed.query:
        from urllib.parse import parse_qs

        qs = parse_qs(parsed.query)
        hash_id = (qs.get("id") or [""])[0]
    if not hash_id:
        log.info("seekplayer-engine: no hash_id w %s", page_url)
        return None

    host = f"{parsed.scheme}://{parsed.hostname}"
    api_url = f"{host}/api/v1/video?id={hash_id}&w=1920&h=1080&r="

    headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "*/*",
        "Referer": f"{host}/",
    }
    r = browser_get(api_url, headers=headers, timeout=timeout)
    if r.status_code in (404, 410):
        raise HosterDead(f"seekplayer-engine {page_url}: HTTP {r.status_code}")
    if r.status_code != 200 or not r.text:
        log.info("seekplayer-engine: api fail %s status=%s", api_url, r.status_code)
        return None

    try:
        plaintext = _decrypt(r.text)
    except Exception as e:
        log.warning("seekplayer-engine: decrypt fail dla %s: %s", api_url, e)
        return None

    try:
        data = json.loads(plaintext)
    except Exception as e:
        log.warning("seekplayer-engine: JSON parse fail dla %s: %s", api_url, e)
        return None

    # Valid JSON is not necessarily an object: a list, string or number would
    # make the `.get()` calls below raise AttributeError.
    if not isinstance(data, dict):
        log.warning(
            "seekplayer-engine: unexpected JSON payload type %s for %s",
            type(data).__name__,
            api_url,
        )
        return None

    # Same-engine hosters return `{"error": "..."}` when the video does not exist.
    if data.get("error"):
        raise HosterDead(f"seekplayer-engine {page_url}: {data['error']}")

    # Only string values are usable as URLs; anything else counts as missing.
    raw_source = data.get("source")
    raw_cf = data.get("cf")
    source = raw_source.strip() if isinstance(raw_source, str) else ""
    cf = raw_cf.strip() if isinstance(raw_cf, str) else ""

    # Source: IP-bound m3u8 URL on a CDN edge
    # (e.g. `185.237.107.146/v4/<sig>/<exp>/ty/<hash>/master.m3u8`).
    # The token is signed for the VPS IP — the proxy serves segments from the
    # same IP, so it works. The CDN serves a certificate on a bare IP, so the
    # fetch needs verify=False (stream_proxy.py already has such a branch for
    # IP-host m3u8).
    sources: list[StreamSource] = []
    if source:
        sources.append(
            StreamSource(
                link=source,
                quality=None,
                type="m3u8",
                referer=f"{host}/",
                raw={
                    "proxy_no_verify": True,
                    "cf_fallback": cf or None,
                    "engine": "seekplayer",
                },
            )
        )
    return sources or None
|
||||||
117
app/extractors/hosters/streamtape.py
Normal file
117
app/extractors/hosters/streamtape.py
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
"""Streamtape embed → direct mp4 extractor.
|
||||||
|
|
||||||
|
Pattern (verified 2026-05-15 z residential, live URL `/e/PZqBZp4OomF0Q61`):
|
||||||
|
|
||||||
|
1. Embed `/e/<id>` zwraca 89KB body z 4 `document.getElementById(...).innerHTML`
|
||||||
|
assignmentami konstruującymi pełen URL do `/get_video`. Każdy uses ten sam
|
||||||
|
pattern:
|
||||||
|
|
||||||
|
document.getElementById('robotlink').innerHTML =
|
||||||
|
'//streamtape.com/get_video' +
|
||||||
|
('<junk>?id=<id>&expires=...&ip=...&token=...').substring(N).substring(M);
|
||||||
|
|
||||||
|
Junk to 3-4 znaki przed `?` — substring(N).substring(M) je odcina.
|
||||||
|
|
||||||
|
2. Po sklejeniu fetch `https://streamtape.com/get_video?id=...&token=...` →
|
||||||
|
302 → `https://<cluster>.tapecontent.net/radosgw/<id>/<signed_path>/<title>.mp4`
|
||||||
|
(direct mp4, video/mp4 ~500MB, brak IP-bind).
|
||||||
|
|
||||||
|
3. Body czasem zwraca `Video not found! Maybe it got deleted by the creator!`
|
||||||
|
— większość URLów w naszej DB (12k mass-DMCA'd 2026-05-15). Wtedy raise
|
||||||
|
HosterDead, caller w playback.py oznaczy dead_at.
|
||||||
|
|
||||||
|
Live URL coverage probed 2026-05-15: ~5% URLów żyje, reszta `Video not found`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.extractors._fetch import _DEFAULT_UA, browser_get
|
||||||
|
from app.extractors._models import HosterDead, StreamSource
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Match: `getElementById('xlink').innerHTML = "<prefix>" + '' + ('<suffix>').substring(N).substring(M);`
|
||||||
|
# Streamtape generuje 4 assignmenty (ideoolink x2 + botlink + robotlink) — 2 są DECOYs
|
||||||
|
# z połamanym hostname (`.comb`, `.cob`) i tylko botlink/robotlink dają prawdziwy URL.
|
||||||
|
# Prefix może być fragmentem: `/streamtape.com`, `//streamtape.co`, `//streamtape.com/g`
|
||||||
|
# — `get_video` często jest split między prefix i suffix po slice'ach. Decyzja na
|
||||||
|
# podstawie KOMBINOWANEGO output containing exact `streamtape.com/get_video?`.
|
||||||
|
_ASSIGN_RE = re.compile(
|
||||||
|
r"document\.getElementById\(['\"](?P<elem>[a-z]+link)['\"]\)\.innerHTML\s*=\s*"
|
||||||
|
r"['\"](?P<prefix>[^'\"]*streamtape[^'\"]*)['\"]"
|
||||||
|
r"\s*\+\s*(?:['\"]{2}\s*\+\s*)?"
|
||||||
|
r"\(['\"](?P<suffix>[^'\"]+)['\"]\)"
|
||||||
|
r"(?P<slices>(?:\.substring\(\d+\))+)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_SUBSTRING_RE = re.compile(r"\.substring\((\d+)\)")
|
||||||
|
_NOT_FOUND_RE = re.compile(r"Video\s+not\s+found", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_slices(suffix: str, slices_str: str) -> str:
|
||||||
|
out = suffix
|
||||||
|
for m in _SUBSTRING_RE.finditer(slices_str):
|
||||||
|
n = int(m.group(1))
|
||||||
|
out = out[n:]
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def extract(page_url: str, *, timeout: float = 30.0) -> list[StreamSource] | None:
    """Resolve a Streamtape page into its ``get_video`` stream URL.

    Args:
        page_url: URL of the Streamtape page to fetch.
        timeout: Timeout forwarded to ``browser_get``.

    Returns:
        A one-element list holding an ``mp4`` ``StreamSource`` whose referer
        is *page_url*, or ``None`` when the fetch fails (non-200 status or
        empty body) or no assignment yields a valid URL.

    Raises:
        HosterDead: On HTTP 404/410, or when the body contains the
            "Video not found" marker.
    """
    response = browser_get(
        page_url,
        headers={
            "User-Agent": _DEFAULT_UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "en-US,en;q=0.9",
        },
        timeout=timeout,
    )

    status = response.status_code
    if status in (404, 410):
        raise HosterDead(f"streamtape {page_url}: HTTP {status}")
    if status != 200 or not response.text:
        log.info("streamtape: fetch fail %s status=%s", page_url, status)
        return None

    html = response.text
    if _NOT_FOUND_RE.search(html):
        raise HosterDead(f"streamtape {page_url}: Video not found")

    # Try every assignment — the first one producing a valid URL wins.
    # `get_video` may sit entirely in the prefix (residential variant) or be
    # split across prefix + suffix (VPS variant, where the decoy assignments
    # produce `.comb/get_video`).
    required_markers = ("streamtape.com/get_video?", "id=", "token=")
    for assignment in _ASSIGN_RE.finditer(html):
        candidate = assignment.group("prefix").strip() + _apply_slices(
            assignment.group("suffix"), assignment.group("slices")
        )

        # Normalize scheme-less forms into an absolute https URL.
        if candidate.startswith("//"):
            candidate = "https:" + candidate
        elif candidate.startswith("/"):
            # `/streamtape.com/...` → `https://streamtape.com/...`
            candidate = "https:/" + candidate

        # Validation — filters out the decoys (`streamtape.comb`, `streamtape.cob`).
        if not all(marker in candidate for marker in required_markers):
            continue

        return [
            StreamSource(
                link=candidate,
                quality=None,
                type="mp4",
                referer=page_url,
                # /get_video answers with a 302 to a direct mp4 on
                # tapecontent.net. The proxy has to follow the redirect
                # (stream_proxy defaults to follow_redirects=True).
                raw={"redirect_via": "streamtape_get_video"},
            )
        ]

    log.info("streamtape: no valid innerHTML assignment found in %s", page_url)
    return None
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue