Three orphan-factory tubes (0–0.2% canonical match — auto-screenshot thumbs and slug titles that never match TPDB/StashDB) — to be replaced by better sources. Removed scrapers (files + imports), extractors (registry + modules), the pornhat entry from tag-enrichment priority lists and the 0dayxx display override, and purged the DB (19,003 playback_sources + 9,904 solo-orphan scenes; shared mirror scenes keep their other sources). The pornhat-based enrich_studio endpoint stays as a graceful no-op (no pornhat sources → returns no studio). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
174 lines
6.2 KiB
Python
174 lines
6.2 KiB
Python
"""GET /sources — lista tube źródeł dla feature "Sites" (mobile top-level tab).
|
|
|
|
Bug-report 2026-05-24 (ea6f05f9, Scenes screen): user chce wybrać "pages"
|
|
obok Scenes i Movies — widzieć listę tube'ów i wchodzić w nie, żeby zobaczyć
|
|
najnowsze sceny z konkretnego źródła.
|
|
|
|
Endpoint enumeruje distinct `playback_sources.origin` z ŻYWYCH playback_sources
|
|
(`dead_at IS NULL`), tylko origins zaczynające się od 'tube:' (kanoniczne źródła
|
|
typu `canonical:tpdb_trailer` są pomijane — to nie są "scrapowane strony" w sensie
|
|
intencji feature'a).
|
|
|
|
Sortowanie: stars DESC (źródła bez oceny na końcu), potem scene_count DESC (najbardziej "wypełnione" tubey wyżej).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import func, select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.auth import require_api_key
|
|
from app.db import get_session
|
|
from app.models.playback_source import PlaybackSource
|
|
from app.models.source_stats import SourceStats
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# All routes declared on this router live under /sources and are guarded by
# the `require_api_key` dependency.
router = APIRouter(
    prefix="/sources",
    tags=["sources"],
    dependencies=[Depends(require_api_key)],
)
|
|
|
|
|
|
class SourceRating(BaseModel):
    """Rating of a source, used for ranking on the Sites screen.

    Each field corresponds to one axis from the user request.
    """

    stars: int
    """Overall score 0-5 (0 = offline). Primary signal for sorting/display."""

    freshness: int
    """0-5: how often new content arrives (age of the newest item + 7-day volume)."""

    richness: int
    """0-5: metadata richness (thumbnail/tags/description/performers/studio/duration)."""

    health: int | None
    """0-5: whether it really plays (playback telemetry); 0 = offline. None when no data."""

    health_basis: str | None = None
    """'telemetry' (real pings from the app) or 'proxy' (estimate from the resolve type)."""

    components: dict | None = None
    """Raw components for the breakdown shown in the UI (% per field, success rate, ttff)."""
|
|
|
|
|
|
class SourceOut(BaseModel):
    # One tube source entry, as built by the GET /sources handler.

    origin: str
    """Raw origin string from the DB, e.g. 'tube:hqpornercom'. Used as the
    `origin=` filter parameter in GET /scenes (substring match)."""

    sitetag: str
    """Origin without the 'tube:' prefix, e.g. 'hqpornercom'. Stable tube ID
    (matches `BaseDirectTubeScraper.sitetag`)."""

    display_name: str
    """Human-readable name for the UI, e.g. 'hqporner.com'. Derived from the
    sitetag via `_sitetag_to_display`. Presentation only; logic stays keyed on
    sitetag/origin."""

    scene_count: int
    """Number of LIVE playback_sources (dead_at IS NULL) per origin. Approximate
    scene coverage: a scene may have several sources with the same origin
    (different page_url), so this slightly overstates the distinct-scene count,
    but it is good enough for orientation."""

    last_scraped_at: datetime | None
    """MAX(last_seen_at): the most recent scrape for this origin. Lets mobile
    show 'scraped Xh ago' and sort by freshness."""

    rating: SourceRating | None = None
    """0-5 star rating (freshness/richness/health) from source_stats; None when
    it has not been computed yet (the source-stats job runs every few hours)."""
|
|
|
|
|
|
class SourceListOut(BaseModel):
    # Response envelope for GET /sources.

    # Source entries, in the order produced by the endpoint.
    items: list[SourceOut]

    # Number of entries in `items` (the endpoint sets it to len(items)).
    total: int
|
|
|
|
|
|
# Hard-coded display-name overrides for edge cases. Most sitetags map cleanly
# through the `_sitetag_to_display` regex (`hqpornercom` -> `hqporner.com`), but
# some tubes have unusual TLDs or a sitetag that lacks the TLD altogether.
_DISPLAY_OVERRIDES: dict[str, str] = {
    "fpoxxx": "fpo.xxx",
    "siskavideo": "siska.video",
    "porn4dayspw": "porn4days.pw",
    "porn00org": "porn00.org",
    "freshpornoorg": "freshporno.org",
    "pornxpph": "pornxp.ph",
    "shyfapnet": "shyfap.net",
    "hdporngg": "hdporn.gg",
    "fullmoviesxxx": "fullmovies.xxx",
    "latestleaksco": "latestleaks.co",
    "xxxfreewatch": "xxxfreewatch.com",
    "watchporn": "watchporn.to",
}


# Splits a sitetag into (name, tld) when it ends with one of the common TLDs.
# The lazy `.+?` requires at least one character before the TLD.
_TLD_RE = re.compile(r"^(.+?)(com|org|net|info)$")


def _sitetag_to_display(sitetag: str) -> str:
    """Turn a sitetag into a human-readable domain, e.g. `hqpornercom` -> `hqporner.com`.

    Resolution order:
      1. an explicit entry in `_DISPLAY_OVERRIDES`;
      2. a trailing com/org/net/info split off with a dot;
      3. the sitetag unchanged when neither applies.
    """
    override = _DISPLAY_OVERRIDES.get(sitetag)
    if override is not None:
        return override

    tld_match = _TLD_RE.match(sitetag)
    if tld_match is None:
        return sitetag
    # groups() is (name, tld); joining with a dot rebuilds the domain.
    return ".".join(tld_match.groups())
|
|
|
|
|
|
@router.get("", response_model=SourceListOut)
def list_sources(
    session: Annotated[Session, Depends(get_session)],
) -> SourceListOut:
    """Return the tube sources that have LIVE playback_sources.

    Only rows with `dead_at IS NULL` and `origin LIKE 'tube:%'` are aggregated;
    canonical:* origins are dropped (TPDB trailers have different semantics).
    Each origin is paired with its rating from source_stats when one exists.
    The result is ordered by stars descending (unrated sources last), then by
    scene_count descending.
    """
    live_tube_stmt = (
        select(
            PlaybackSource.origin,
            func.count(PlaybackSource.id).label("scene_count"),
            func.max(PlaybackSource.last_seen_at).label("last_scraped_at"),
        )
        .where(PlaybackSource.dead_at.is_(None))
        .where(PlaybackSource.origin.like("tube:%"))
        .group_by(PlaybackSource.origin)
    )
    aggregates = session.execute(live_tube_stmt).all()

    # Ratings from source_stats (computed offline by run_source_stats), keyed by origin.
    stats_by_origin = {}
    for stat_row in session.execute(select(SourceStats)).scalars().all():
        stats_by_origin[stat_row.origin] = stat_row

    sources: list[SourceOut] = []
    for origin, live_count, newest_seen in aggregates:
        stat = stats_by_origin.get(origin)
        if stat is None:
            rating = None
        else:
            parts = stat.components
            rating = SourceRating(
                stars=stat.stars,
                freshness=stat.freshness,
                richness=stat.richness,
                health=stat.health,
                # components may be NULL, hence the empty-dict fallback.
                health_basis=(parts or {}).get("health_basis"),
                components=parts,
            )

        # The sitetag is the origin with the 'tube:' prefix removed.
        sitetag = origin.removeprefix("tube:")
        sources.append(
            SourceOut(
                origin=origin,
                sitetag=sitetag,
                display_name=_sitetag_to_display(sitetag),
                scene_count=live_count,
                last_scraped_at=newest_seen,
                rating=rating,
            )
        )

    def _rank(item: SourceOut) -> tuple[int, int]:
        # Unrated sources get -1 so they sort after every rated one (0-5).
        stars = item.rating.stars if item.rating else -1
        return stars, item.scene_count

    # Sort: rating first (stars desc, unrated last), then size.
    sources.sort(key=_rank, reverse=True)

    return SourceListOut(items=sources, total=len(sources))
|