- app/api/seo.py (+ app/templates/seo/*): publiczny HTML SEO router (programmatic entity long-tail: performer/studio/scene/landing/2257), bez api-key. Importowany przez main.py — wymagany do uruchomienia, dotąd untracked. Opsec-clean (brak VPS IP/sekretów). - CLAUDE.md: instrukcje projektu (dotąd untracked). - .gitignore: .nimbalyst/ (lokalne tracker-tooling, nie dla OSS repo). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
513 lines
17 KiB
Python
513 lines
17 KiB
Python
"""Public, crawlable SEO surface — programmatic entity pages.

The only public HTML router besides /static and the healthchecks — it does NOT
require an API key, because Googlebot / the user must be able to reach it without
a token. Goal: capture the navigational long tail (performer names / studio names /
scene titles) that mainstream SEO does not index and that actually generates
organic traffic → app downloads.

Rules (deliberate, not accidental):

* **LINK-OUT only.** Pages render metadata and link out to the sources (`page_url`).
  We do NOT expose `stream_url`/`embed_url` — that stays the app's value-add and keeps
  the legal profile of an "aggregator/search engine" (we send traffic to the tubes,
  we do not re-stream).
* **Age-gate = client-side overlay** (cookie `age_ok`). The content is in the HTML, so
  a crawler indexes it; the overlay only hides it from humans until they confirm 18+.
  Additionally an `RTA` meta tag for parental filters.
* **Blacklist respected** — the same performer/studio/tag exclusions as in /scenes.
  We do not publish SEO pages for blacklisted content.
* **Anti-thin-page** — an entity page exists only when it has real content
  (performer/studio with ≥1 live scene; scene with ≥1 live source). Empty entity → 404,
  not an empty doorway (Google penalises mass thin pages).
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import uuid
|
|
from datetime import date
|
|
from pathlib import Path
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
|
from fastapi.responses import HTMLResponse, PlainTextResponse, Response
|
|
from fastapi.templating import Jinja2Templates
|
|
from sqlalchemy import exists, func, select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.api.scenes import _needs_proxy, _wrap_image_proxy
|
|
from app.db import get_session
|
|
from app.models.blacklist import (
|
|
BlacklistedPerformer,
|
|
BlacklistedStudio,
|
|
BlacklistedTag,
|
|
)
|
|
from app.models.performer import Performer, PerformerAlias
|
|
from app.models.playback_source import PlaybackSource
|
|
from app.models.scene import Scene, ScenePerformer, SceneTag
|
|
from app.models.studio import Studio
|
|
from app.models.tag import Tag
|
|
|
|
# Template directory: the "templates" folder next to this file's parent package.
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))

# Analytics — injected into every SEO page only when the matching env var is set.
# Empty = the tag is not rendered (zero third-party requests, behaviour unchanged).
# To enable: set the variable in .env on the VPS and restart; no code change needed.
# NOTE: the values are read once, at import time.
templates.env.globals["gtm_id"] = os.environ.get("GOON_GTM_ID", "")
templates.env.globals["ga4_id"] = os.environ.get("GOON_GA4_ID", "")
templates.env.globals["gsc_verify"] = os.environ.get("GOON_GSC_VERIFY", "")

# URL limit per single sitemap file (spec: max 50k). Kept with headroom.
_SITEMAP_PAGE = 25_000
# How many scenes are rendered on an entity page (performer/studio) — a full list of
# 1000+ scenes is thin/slow; we take the newest N, the rest is reachable through the
# scene sitemap anyway.
_SCENES_PER_ENTITY = 120

router = APIRouter(tags=["seo"])
|
|
|
|
|
|
def base_url() -> str:
    """Return the public origin under which these pages are served.

    Used to build canonical links, sitemap entries and OG URLs. The value is
    read from the ``BACKEND_PUBLIC_URL`` environment variable on every call,
    with surrounding whitespace and any trailing slash removed so callers can
    append ``/path`` directly.

    An unset, empty or whitespace-only variable falls back to the default
    origin. Without that fallback an empty ``BACKEND_PUBLIC_URL=`` line in
    ``.env`` would yield ``""`` and every canonical/sitemap URL would become
    relative.
    """
    configured = os.environ.get("BACKEND_PUBLIC_URL", "").strip().rstrip("/")
    return configured or "https://goon-foss.org"
|
|
|
|
|
|
# --- reużywalne fragmenty zapytań -------------------------------------------------
|
|
|
|
|
|
def _live_playback_exists():
    """Build an EXISTS clause: the scene has at least one live playback source.

    "Live" means ``PlaybackSource.dead_at IS NULL``. The subquery is tied to
    the surrounding query through ``Scene.id``.
    """
    live_source = select(1).where(
        PlaybackSource.scene_id == Scene.id,
        PlaybackSource.dead_at.is_(None),
    )
    return exists(live_source)
|
|
|
|
|
|
def _not_blacklisted():
    """Return the WHERE conditions that exclude blacklisted content.

    Three conditions are returned, each correlated with ``Scene``:

    1. no performer attached to the scene is blacklisted,
    2. the scene's studio is not blacklisted,
    3. no tag attached to the scene is blacklisted.

    The intent is that SEO pages never publish what the catalogue hides.
    NOTE(review): the rules are meant to match GET /scenes, which lives
    outside this file — confirm they stay in sync.

    Returns:
        A list of SQLAlchemy boolean expressions to be AND-ed into a query
        that selects from ``Scene``.
    """
    blacklisted_performer = (
        select(1)
        .select_from(ScenePerformer)
        .join(
            BlacklistedPerformer,
            BlacklistedPerformer.performer_id == ScenePerformer.performer_id,
        )
        .where(ScenePerformer.scene_id == Scene.id)
    )
    # NOT EXISTS instead of NOT IN: with NOT IN, a scene whose studio_id is
    # NULL evaluates to NULL (not TRUE) as soon as the blacklist has any row,
    # which would silently drop every studio-less scene. NOT EXISTS keeps them.
    blacklisted_studio = (
        select(1)
        .select_from(BlacklistedStudio)
        .where(BlacklistedStudio.studio_id == Scene.studio_id)
    )
    blacklisted_tag = (
        select(1)
        .select_from(SceneTag)
        .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
        .where(SceneTag.scene_id == Scene.id)
    )
    return [
        ~exists(blacklisted_performer),
        ~exists(blacklisted_studio),
        ~exists(blacklisted_tag),
    ]
|
|
|
|
|
|
def _indexable_scenes():
    """Build ``SELECT Scene`` limited to live, non-blacklisted scenes.

    This is the base statement for the listings and for the scene sitemap;
    callers append their own filters, ordering and limits.
    """
    return select(Scene).where(_live_playback_exists(), *_not_blacklisted())
|
|
|
|
|
|
def _performer_indexable_exists():
    """Build an EXISTS clause: the performer has at least one indexable scene.

    "Indexable" means live and not blacklisted. Using this in the performer
    sitemap keeps it aligned with what ``performer_page`` actually renders;
    otherwise the sitemap would advertise URLs that answer 404.
    """
    indexable_scene = (
        select(1)
        .select_from(ScenePerformer)
        .join(Scene, Scene.id == ScenePerformer.scene_id)
        .where(
            ScenePerformer.performer_id == Performer.id,
            _live_playback_exists(),
            *_not_blacklisted(),
        )
    )
    return exists(indexable_scene)
|
|
|
|
|
|
def _scene_card_rows(session: Session, scene_ids: list[uuid.UUID]) -> dict[uuid.UUID, dict]:
|
|
"""Batch: dla listy scen zbierz dane do karty (studio name/slug, #źródeł, thumb)."""
|
|
if not scene_ids:
|
|
return {}
|
|
out: dict[uuid.UUID, dict] = {sid: {"sources": 0, "thumb": None} for sid in scene_ids}
|
|
|
|
# liczba żywych źródeł + pierwszy thumbnail (z page_url do proxy referer)
|
|
pb_rows = session.execute(
|
|
select(PlaybackSource.scene_id, PlaybackSource.thumbnail_url, PlaybackSource.page_url)
|
|
.where(
|
|
PlaybackSource.scene_id.in_(scene_ids),
|
|
PlaybackSource.dead_at.is_(None),
|
|
)
|
|
).all()
|
|
for sid, thumb, page_url in pb_rows:
|
|
out[sid]["sources"] += 1
|
|
if out[sid]["thumb"] is None and thumb:
|
|
if _needs_proxy(thumb):
|
|
thumb = _wrap_image_proxy(thumb, page_url)
|
|
out[sid]["thumb"] = thumb
|
|
return out
|
|
|
|
|
|
def _iso_duration(seconds: int | None) -> str | None:
|
|
"""sekundy → ISO-8601 (PT#M#S) dla schema.org VideoObject.duration."""
|
|
if not seconds or seconds <= 0:
|
|
return None
|
|
m, s = divmod(int(seconds), 60)
|
|
return f"PT{m}M{s}S"
|
|
|
|
|
|
# --- strony encji -----------------------------------------------------------------
|
|
|
|
|
|
@router.get("/p/{slug}", response_class=HTMLResponse)
def performer_page(
    slug: str,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a performer identified by ``slug``.

    Shows up to ``_SCENES_PER_ENTITY`` indexable scenes of the performer,
    newest first, together with card data, the scenes' studios and the
    performer's aliases.

    Raises:
        HTTPException: 404 when the slug is unknown or the performer has no
            indexable scene (an empty entity must not become a thin page).
    """
    performer = session.execute(
        select(Performer).where(Performer.slug == slug)
    ).scalar_one_or_none()
    if performer is None:
        raise HTTPException(status_code=404, detail="performer not found")

    # Scenes of this performer — live, not blacklisted, newest first.
    features_performer = exists(
        select(1).where(
            ScenePerformer.scene_id == Scene.id,
            ScenePerformer.performer_id == performer.id,
        )
    )
    scenes_stmt = (
        _indexable_scenes()
        .where(features_performer)
        .order_by(Scene.release_date.desc().nullslast(), Scene.created_at.desc())
        .limit(_SCENES_PER_ENTITY)
    )
    scenes = session.execute(scenes_stmt).scalars().all()
    if not scenes:
        # Empty entity → 404 instead of a thin doorway page.
        raise HTTPException(status_code=404, detail="no indexable scenes for performer")

    cards = _scene_card_rows(session, [s.id for s in scenes])

    studio_ids = {s.studio_id for s in scenes if s.studio_id}
    studios = {}
    for st in session.execute(select(Studio).where(Studio.id.in_(studio_ids))).scalars():
        studios[st.id] = st

    alias_rows = session.execute(
        select(PerformerAlias).where(PerformerAlias.performer_id == performer.id)
    ).scalars()
    aliases = [row.alias for row in alias_rows]
    # Distinct aliases, minus any that only repeat the canonical name (case-insensitive).
    shown_aliases = sorted(
        {a for a in aliases if a.lower() != performer.canonical_name.lower()}
    )

    origin = base_url()
    context = {
        "base_url": origin,
        "performer": performer,
        "aliases": shown_aliases,
        "scenes": scenes,
        "cards": cards,
        "studios": studios,
        "canonical": f"{origin}/p/{performer.slug}",
    }
    return templates.TemplateResponse(request, "seo/performer.html", context)
|
|
|
|
|
|
@router.get("/studio/{slug}", response_class=HTMLResponse)
def studio_page(
    slug: str,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a studio identified by ``slug``.

    Shows up to ``_SCENES_PER_ENTITY`` indexable scenes of the studio,
    newest first, together with their card data.

    Raises:
        HTTPException: 404 when the slug is unknown or the studio has no
            indexable scene.
    """
    studio = session.execute(
        select(Studio).where(Studio.slug == slug)
    ).scalar_one_or_none()
    if studio is None:
        raise HTTPException(status_code=404, detail="studio not found")

    scenes_stmt = (
        _indexable_scenes()
        .where(Scene.studio_id == studio.id)
        .order_by(Scene.release_date.desc().nullslast(), Scene.created_at.desc())
        .limit(_SCENES_PER_ENTITY)
    )
    scenes = session.execute(scenes_stmt).scalars().all()
    if not scenes:
        raise HTTPException(status_code=404, detail="no indexable scenes for studio")

    origin = base_url()
    context = {
        "base_url": origin,
        "studio": studio,
        "scenes": scenes,
        "cards": _scene_card_rows(session, [s.id for s in scenes]),
        "canonical": f"{origin}/studio/{studio.slug}",
    }
    return templates.TemplateResponse(request, "seo/studio.html", context)
|
|
|
|
|
|
@router.get("/scene/{scene_id}", response_class=HTMLResponse)
def scene_page(
    scene_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a single scene.

    The page lists one outbound link per source origin (``page_url`` only),
    plus the scene's studio, performers, tags, a thumbnail and an ISO-8601
    duration.

    Raises:
        HTTPException: 404 when the scene does not exist, is excluded by the
            blacklist, or has no live playback source.
    """
    scene = session.get(Scene, scene_id)
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Blacklist gate: listings and the sitemap already hide blacklisted scenes;
    # without this check they would still be served to anyone with the URL.
    visible = session.execute(
        select(Scene.id).where(Scene.id == scene.id, *_not_blacklisted())
    ).first()
    if visible is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Sources — live only, ordered by origin so de-duplication is stable.
    sources_raw = (
        session.execute(
            select(PlaybackSource)
            .where(
                PlaybackSource.scene_id == scene.id,
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        )
        .scalars()
        .all()
    )
    if not sources_raw:
        raise HTTPException(status_code=404, detail="scene has no live sources")

    seen_origins: set[str] = set()
    sources = []
    thumb: str | None = None
    for s in sources_raw:
        # The thumbnail comes from the first source that has one, even if that
        # source is later skipped as a duplicate origin.
        if thumb is None and s.thumbnail_url:
            if _needs_proxy(s.thumbnail_url):
                thumb = _wrap_image_proxy(s.thumbnail_url, s.page_url)
            else:
                thumb = s.thumbnail_url
        if s.origin in seen_origins:
            continue  # show one link per origin
        seen_origins.add(s.origin)
        # Label is the part after the first ":" in the origin, or the whole origin.
        label = s.origin.split(":", 1)[1] if ":" in s.origin else s.origin
        sources.append({"label": label, "page_url": s.page_url, "quality": s.quality})

    studio = session.get(Studio, scene.studio_id) if scene.studio_id else None
    performers = (
        session.execute(
            select(Performer)
            .join(ScenePerformer, ScenePerformer.performer_id == Performer.id)
            .where(ScenePerformer.scene_id == scene.id)
            .order_by(ScenePerformer.position.asc().nullslast())
        )
        .scalars()
        .all()
    )
    tags = (
        session.execute(
            select(Tag)
            .join(SceneTag, SceneTag.tag_id == Tag.id)
            .where(SceneTag.scene_id == scene.id)
        )
        .scalars()
        .all()
    )

    return templates.TemplateResponse(
        request,
        "seo/scene.html",
        {
            "base_url": base_url(),
            "scene": scene,
            "studio": studio,
            "performers": performers,
            "tags": tags,
            "sources": sources,
            "thumb": thumb,
            "iso_duration": _iso_duration(scene.duration_sec),
            "canonical": f"{base_url()}/scene/{scene.id}",
        },
    )
|
|
|
|
|
|
@router.get("/", response_class=HTMLResponse)
def landing(
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Home page — the crawl entry point: the 48 newest indexable scenes plus a CTA."""
    newest_stmt = _indexable_scenes().order_by(Scene.created_at.desc()).limit(48)
    scenes = session.execute(newest_stmt).scalars().all()
    cards = _scene_card_rows(session, [s.id for s in scenes])

    studio_ids = {s.studio_id for s in scenes if s.studio_id}
    studios = {}
    for st in session.execute(select(Studio).where(Studio.id.in_(studio_ids))).scalars():
        studios[st.id] = st

    origin = base_url()
    context = {
        "base_url": origin,
        "scenes": scenes,
        "cards": cards,
        "studios": studios,
        "canonical": f"{origin}/",
    }
    return templates.TemplateResponse(request, "seo/landing.html", context)
|
|
|
|
|
|
@router.get("/get", response_class=HTMLResponse)
def get_app(request: Request) -> HTMLResponse:
    """Paid-traffic landing page, slimmed down for APK-install conversion.

    The page is meant to be noindex (see get.html) so it does not compete in
    the SERP with the entity pages.
    """
    origin = base_url()
    context = {"base_url": origin, "canonical": f"{origin}/get"}
    return templates.TemplateResponse(request, "seo/get.html", context)
|
|
|
|
|
|
@router.get("/2257", response_class=HTMLResponse)
def page_2257(request: Request) -> HTMLResponse:
    """Render the static ``seo/page_2257.html`` template at ``/2257``."""
    origin = base_url()
    context = {"base_url": origin, "canonical": f"{origin}/2257"}
    return templates.TemplateResponse(request, "seo/page_2257.html", context)
|
|
|
|
|
|
# --- robots + sitemap -------------------------------------------------------------
|
|
|
|
|
|
@router.get("/{fname}.html", response_class=PlainTextResponse, include_in_schema=False)
def gsc_site_verification(fname: str) -> PlainTextResponse:
    """Serve the Google Search Console verification file ("HTML file" method).

    The token comes from the ``GOON_GSC_FILE`` env var (e.g.
    'google67b49088b5416adc' — without '.html'). A request for
    ``/<token>.html`` is answered with the body
    ``google-site-verification: <token>.html``; any other ``.html`` name, or
    an unset token, gives 404.

    NOTE(review): relies on no other route being a single-segment ``*.html``
    at the root — confirm when adding routes.
    """
    token = os.environ.get("GOON_GSC_FILE", "")
    if not token or fname != token:
        raise HTTPException(status_code=404, detail="not found")
    return PlainTextResponse(f"google-site-verification: {fname}.html")
|
|
|
|
|
|
@router.get("/robots.txt", response_class=PlainTextResponse)
def robots() -> PlainTextResponse:
    """Serve robots.txt: allow everything except /proxy/ and /ui/, and point to the sitemap index."""
    directives = [
        "User-agent: *",
        "Allow: /",
        "Disallow: /proxy/",
        "Disallow: /ui/",
        f"Sitemap: {base_url()}/sitemap.xml",
    ]
    # Every directive, including the last one, ends with a newline.
    return PlainTextResponse("".join(f"{line}\n" for line in directives))
|
|
|
|
|
|
def _count(session: Session, stmt) -> int:
    """Return how many rows ``stmt`` yields, via ``SELECT count(*)`` over it as a subquery."""
    counting = select(func.count()).select_from(stmt.subquery())
    return session.execute(counting).scalar_one()
|
|
|
|
|
|
@router.get("/sitemap.xml")
def sitemap_index(session: Annotated[Session, Depends(get_session)]) -> Response:
    """Sitemap index — lists the paginated sub-sitemaps per entity type.

    For performers, studios and scenes the number of indexable entities is
    counted and one ``/sitemap/<kind>-<page>.xml`` entry is emitted per
    ``_SITEMAP_PAGE`` entities. At least one page per kind is always listed,
    even when the count is zero.
    """
    n_perf = _count(
        session,
        select(Performer.id).where(_performer_indexable_exists()),
    )

    # A studio counts only when it has a live AND non-blacklisted scene — the
    # same rule studio_page applies; otherwise the index could advertise
    # sitemap pages whose URLs answer 404.
    studio_scene = (
        select(1)
        .where(Scene.studio_id == Studio.id)
        .where(_live_playback_exists())
    )
    for cond in _not_blacklisted():
        studio_scene = studio_scene.where(cond)
    n_studio = _count(session, select(Studio.id).where(exists(studio_scene)))

    n_scene = _count(session, _indexable_scenes().with_only_columns(Scene.id))

    bu = base_url()
    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    for kind, total in (("performers", n_perf), ("studios", n_studio), ("scenes", n_scene)):
        pages = max(1, -(-total // _SITEMAP_PAGE))  # ceil division, minimum one page
        for p in range(pages):
            parts.append(f"<sitemap><loc>{bu}/sitemap/{kind}-{p}.xml</loc></sitemap>")
    parts.append("</sitemapindex>")
    return Response("\n".join(parts), media_type="application/xml")
|
|
|
|
|
|
@router.get("/sitemap/{kind}-{page}.xml")
def sitemap_page(
    kind: str,
    page: int,
    session: Annotated[Session, Depends(get_session)],
) -> Response:
    """Serve one page of the sitemap for a given entity kind.

    Args:
        kind: ``"performers"``, ``"studios"`` or ``"scenes"``.
        page: zero-based page number; each page holds up to ``_SITEMAP_PAGE`` URLs.
        session: open database session.

    Returns:
        An ``application/xml`` ``<urlset>`` document. A page past the end of
        the data yields an empty ``<urlset>``.

    Raises:
        HTTPException: 404 for an unknown ``kind``.
    """
    # Local import: keeps this block self-contained without touching file-level imports.
    from xml.sax.saxutils import escape

    bu = base_url()
    off = page * _SITEMAP_PAGE

    # Every ordering carries the primary key as a tie-breaker: OFFSET pagination
    # over a non-unique sort key (created_at alone) can repeat or skip rows
    # between pages.
    if kind == "performers":
        stmt = (
            select(Performer.slug)
            .where(_performer_indexable_exists())
            .order_by(Performer.created_at.asc(), Performer.id.asc())
        )
        prefix = "p"
    elif kind == "studios":
        # Same rule as studio_page: the studio needs a live, non-blacklisted
        # scene, otherwise the sitemap would list URLs that answer 404.
        studio_scene = (
            select(1)
            .where(Scene.studio_id == Studio.id)
            .where(_live_playback_exists())
        )
        for cond in _not_blacklisted():
            studio_scene = studio_scene.where(cond)
        stmt = (
            select(Studio.slug)
            .where(exists(studio_scene))
            .order_by(Studio.created_at.asc(), Studio.id.asc())
        )
        prefix = "studio"
    elif kind == "scenes":
        stmt = (
            _indexable_scenes()
            .with_only_columns(Scene.id)
            .order_by(Scene.created_at.asc(), Scene.id.asc())
        )
        prefix = "scene"
    else:
        raise HTTPException(status_code=404, detail="unknown sitemap kind")

    keys = session.execute(stmt.offset(off).limit(_SITEMAP_PAGE)).scalars()
    urls = [f"{bu}/{prefix}/{key}" for key in keys]

    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    # <loc> values must be entity-escaped; a slug containing "&" or "<" would
    # otherwise make the whole document invalid XML.
    parts += [f"<url><loc>{escape(u)}</loc></url>" for u in urls]
    parts.append("</urlset>")
    return Response("\n".join(parts), media_type="application/xml")
|