"""Public, crawlable SEO surface — programmatic entity pages.

The only public HTML router besides /static and the healthchecks. It does NOT
require an API key, because Googlebot and regular visitors must be able to
reach it without a token. Goal: capture the navigational long tail (performer
names / studio names / scene titles) that mainstream SEO does not index but
that generates real organic traffic -> app downloads.

Rules (deliberate, not accidental):

* **LINK-OUT only.** Pages render metadata and link out to the sources
  (`page_url`). `stream_url` / `embed_url` are NOT exposed — that stays the
  app's value-add and keeps the legal profile of an "aggregator / search
  engine" (we send traffic to the tubes, we do not re-stream).
* **Age gate = client-side overlay** (cookie `age_ok`). The content is in the
  HTML, so crawlers index it; the overlay only hides it from humans until they
  confirm 18+. Additionally an `RTA` meta tag is emitted for parental filters.
* **Blacklist is respected** — the same performer/studio/tag exclusions as in
  /scenes. No SEO pages are published for blacklisted content.
* **Anti-thin-page** — an entity page exists only when it has real content
  (performer/studio with >=1 live scene; scene with >=1 live source). An empty
  entity -> 404, not an empty doorway (Google penalises mass thin pages).
"""

from __future__ import annotations

import os
import uuid
from datetime import date
from pathlib import Path
from typing import Annotated

from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import HTMLResponse, PlainTextResponse, Response
from fastapi.templating import Jinja2Templates
from sqlalchemy import exists, func, select
from sqlalchemy.orm import Session

from app.api.scenes import _needs_proxy, _wrap_image_proxy
from app.db import get_session
from app.models.blacklist import (
    BlacklistedPerformer,
    BlacklistedStudio,
    BlacklistedTag,
)
from app.models.performer import Performer, PerformerAlias
from app.models.playback_source import PlaybackSource
from app.models.scene import Scene, ScenePerformer, SceneTag
from app.models.studio import Studio
from app.models.tag import Tag

# Templates live in the "templates" directory two levels above this file.
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
templates = Jinja2Templates(directory=str(_TEMPLATES_DIR))

# Analytics — exposed to every SEO template as Jinja globals. The values are
# read from the environment once, at import time; an unset variable yields "".
# Per the original note: empty = the tag is not rendered (zero third-party
# requests, behaviour unchanged); enabling = set the variable in .env on the
# VPS and restart, no code change needed.
templates.env.globals["gtm_id"] = os.environ.get("GOON_GTM_ID", "")
templates.env.globals["ga4_id"] = os.environ.get("GOON_GA4_ID", "")
templates.env.globals["gsc_verify"] = os.environ.get("GOON_GSC_VERIFY", "")

# Maximum number of URLs per single sitemap file (the spec allows 50k; we keep
# a safety margin).
_SITEMAP_PAGE = 25_000

# How many scenes are rendered on an entity page (performer/studio). A full
# list of 1000+ scenes is thin/slow, so only the newest N are shown; the rest
# is still reachable through the scene sitemap.
_SCENES_PER_ENTITY = 120

router = APIRouter(tags=["seo"])


def base_url() -> str:
    """Return the public origin these pages are served from, without a trailing slash.

    Read from the ``BACKEND_PUBLIC_URL`` environment variable on every call
    (default ``https://goon-foss.org``). Used to build canonical and sitemap
    URLs and is passed to every template as ``base_url``.
    """
    return os.environ.get("BACKEND_PUBLIC_URL", "https://goon-foss.org").rstrip("/")


# --- reusable query fragments -----------------------------------------------------


def _live_playback_exists():
    """Build an EXISTS clause: the scene has >=1 live (``dead_at IS NULL``) playback source.

    The subquery references ``Scene.id`` and is meant to be used inside a
    statement that already selects from ``Scene``.
    """
    return exists(
        select(1).where(
            PlaybackSource.scene_id == Scene.id,
            PlaybackSource.dead_at.is_(None),
        )
    )


def _not_blacklisted():
    """Return the list of WHERE conditions that exclude blacklisted content.

    Three conditions, all correlated on ``Scene``: no blacklisted performer is
    attached to the scene, the scene's studio is not blacklisted, and no
    blacklisted tag is attached to the scene.

    Per the original note these are meant to mirror GET /scenes, so SEO does
    not publish what the catalogue hides.
    NOTE(review): the studio clause below is NULL-safe; confirm /scenes treats
    studio-less scenes the same way.
    """
    return [
        ~exists(
            select(1)
            .select_from(ScenePerformer)
            .join(
                BlacklistedPerformer,
                BlacklistedPerformer.performer_id == ScenePerformer.performer_id,
            )
            .where(ScenePerformer.scene_id == Scene.id)
        ),
        # NOT EXISTS instead of ``studio_id NOT IN (subquery)``: for a scene
        # with studio_id = NULL, NOT IN evaluates to NULL as soon as the
        # blacklist has any row, which would silently drop every studio-less
        # scene from all listings. NOT EXISTS keeps them.
        ~exists(
            select(1)
            .select_from(BlacklistedStudio)
            .where(BlacklistedStudio.studio_id == Scene.studio_id)
        ),
        ~exists(
            select(1)
            .select_from(SceneTag)
            .join(BlacklistedTag, BlacklistedTag.tag_id == SceneTag.tag_id)
            .where(SceneTag.scene_id == Scene.id)
        ),
    ]


def _indexable_scenes():
    """Build ``SELECT Scene`` for live, non-blacklisted scenes.

    This is the base statement for the scene lists and the scene sitemap;
    callers append their own filters, ordering and limits.
    """
    return select(Scene).where(_live_playback_exists(), *_not_blacklisted())


def _performer_indexable_exists():
    """Build an EXISTS clause: the performer has >=1 indexable scene.

    "Indexable" means live and not blacklisted, i.e. the same criteria
    ``performer_page`` applies. Keeping the performer sitemap on this clause
    prevents it from advertising URLs that would answer 404.
    The subquery references ``Performer.id`` and is meant to be used inside a
    statement that selects from ``Performer``.
    """
    sub = (
        select(1)
        .select_from(ScenePerformer)
        .join(Scene, Scene.id == ScenePerformer.scene_id)
        .where(ScenePerformer.performer_id == Performer.id)
        .where(_live_playback_exists(), *_not_blacklisted())
    )
    return exists(sub)


def _scene_card_rows(session: Session, scene_ids: list[uuid.UUID]) -> dict[uuid.UUID, dict]:
    """Collect per-scene card data for a batch of scenes in a single query.

    Args:
        session: open database session.
        scene_ids: ids of the scenes to describe.

    Returns:
        A dict keyed by scene id. Each value is
        ``{"sources": <number of live playback sources>, "thumb": <url or None>}``.
        An empty ``scene_ids`` yields an empty dict without touching the database.
    """
    if not scene_ids:
        return {}
    out: dict[uuid.UUID, dict] = {sid: {"sources": 0, "thumb": None} for sid in scene_ids}

    # Live sources only. Ordered by origin so that the "first thumbnail" is
    # deterministic between requests and follows the same order scene_page uses.
    pb_rows = session.execute(
        select(PlaybackSource.scene_id, PlaybackSource.thumbnail_url, PlaybackSource.page_url)
        .where(
            PlaybackSource.scene_id.in_(scene_ids),
            PlaybackSource.dead_at.is_(None),
        )
        .order_by(PlaybackSource.origin.asc())
    ).all()

    for sid, thumb, page_url in pb_rows:
        card = out[sid]
        card["sources"] += 1
        if card["thumb"] is None and thumb:
            # page_url is handed to the proxy wrapper together with the image
            # URL (per the original note: used as the proxy referer).
            if _needs_proxy(thumb):
                thumb = _wrap_image_proxy(thumb, page_url)
            card["thumb"] = thumb
    return out


def _iso_duration(seconds: int | None) -> str | None:
    """Convert seconds to an ISO-8601 duration for schema.org ``VideoObject.duration``.

    Returns ``None`` for ``None``, zero or negative input. Durations below one
    hour are rendered as ``PT#M#S``; from one hour up the minutes are carried
    into an hours component (``PT#H#M#S``) so minutes never exceed 59.
    """
    if not seconds or seconds <= 0:
        return None
    hours, rem = divmod(int(seconds), 3600)
    minutes, secs = divmod(rem, 60)
    if hours:
        return f"PT{hours}H{minutes}M{secs}S"
    return f"PT{minutes}M{secs}S"


# --- entity pages -----------------------------------------------------------------


@router.get("/p/{slug}", response_class=HTMLResponse)
def performer_page(
    slug: str,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a performer identified by ``slug``.

    Shows up to ``_SCENES_PER_ENTITY`` indexable scenes, newest first, plus
    the performer's aliases.

    Raises:
        HTTPException: 404 when no performer has this slug, or when the
            performer has no indexable scene (empty entity -> no thin doorway).
    """
    performer = session.execute(
        select(Performer).where(Performer.slug == slug)
    ).scalar_one_or_none()
    if performer is None:
        raise HTTPException(status_code=404, detail="performer not found")

    # This performer's scenes — live, not blacklisted, newest first.
    scenes = (
        session.execute(
            _indexable_scenes()
            .where(
                exists(
                    select(1).where(
                        ScenePerformer.scene_id == Scene.id,
                        ScenePerformer.performer_id == performer.id,
                    )
                )
            )
            .order_by(Scene.release_date.desc().nullslast(), Scene.created_at.desc())
            .limit(_SCENES_PER_ENTITY)
        )
        .scalars()
        .all()
    )
    if not scenes:
        # Empty entity -> 404 instead of a thin doorway page.
        raise HTTPException(status_code=404, detail="no indexable scenes for performer")

    cards = _scene_card_rows(session, [s.id for s in scenes])

    # Studios of the listed scenes, keyed by id, so the template can label each card.
    studios = {
        st.id: st
        for st in session.execute(
            select(Studio).where(
                Studio.id.in_({s.studio_id for s in scenes if s.studio_id})
            )
        ).scalars()
    }

    aliases = [
        a.alias
        for a in session.execute(
            select(PerformerAlias).where(PerformerAlias.performer_id == performer.id)
        ).scalars()
    ]
    # Drop aliases that only repeat the canonical name (case-insensitively),
    # de-duplicate and sort for stable output.
    canonical_lower = performer.canonical_name.lower()
    shown_aliases = sorted({a for a in aliases if a.lower() != canonical_lower})

    return templates.TemplateResponse(
        request,
        "seo/performer.html",
        {
            "base_url": base_url(),
            "performer": performer,
            "aliases": shown_aliases,
            "scenes": scenes,
            "cards": cards,
            "studios": studios,
            "canonical": f"{base_url()}/p/{performer.slug}",
        },
    )


@router.get("/studio/{slug}", response_class=HTMLResponse)
def studio_page(
    slug: str,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a studio identified by ``slug``.

    Shows up to ``_SCENES_PER_ENTITY`` indexable scenes of the studio, newest first.

    Raises:
        HTTPException: 404 when no studio has this slug, or when the studio
            has no indexable scene.
    """
    studio = session.execute(
        select(Studio).where(Studio.slug == slug)
    ).scalar_one_or_none()
    if studio is None:
        raise HTTPException(status_code=404, detail="studio not found")

    scenes = (
        session.execute(
            _indexable_scenes()
            .where(Scene.studio_id == studio.id)
            .order_by(Scene.release_date.desc().nullslast(), Scene.created_at.desc())
            .limit(_SCENES_PER_ENTITY)
        )
        .scalars()
        .all()
    )
    if not scenes:
        # Empty entity -> 404 instead of a thin doorway page.
        raise HTTPException(status_code=404, detail="no indexable scenes for studio")

    cards = _scene_card_rows(session, [s.id for s in scenes])
    return templates.TemplateResponse(
        request,
        "seo/studio.html",
        {
            "base_url": base_url(),
            "studio": studio,
            "scenes": scenes,
            "cards": cards,
            "canonical": f"{base_url()}/studio/{studio.slug}",
        },
    )
@router.get("/scene/{scene_id}", response_class=HTMLResponse)
def scene_page(
    scene_id: uuid.UUID,
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Render the public page of a single scene with link-outs to its live sources.

    Raises:
        HTTPException: 404 when the scene does not exist, is blacklisted
            (by performer, studio or tag), or has no live playback source.
    """
    # Apply the blacklist here as well: loading the scene by primary key alone
    # would publish blacklisted content at its direct URL even though the
    # listings and the sitemap hide it.
    scene = session.execute(
        select(Scene).where(Scene.id == scene_id, *_not_blacklisted())
    ).scalar_one_or_none()
    if scene is None:
        raise HTTPException(status_code=404, detail="scene not found")

    # Sources — live only, de-duplicated by origin (one link per tube).
    sources_raw = (
        session.execute(
            select(PlaybackSource)
            .where(
                PlaybackSource.scene_id == scene.id,
                PlaybackSource.dead_at.is_(None),
            )
            .order_by(PlaybackSource.origin.asc())
        )
        .scalars()
        .all()
    )
    if not sources_raw:
        raise HTTPException(status_code=404, detail="scene has no live sources")

    seen_origins: set[str] = set()
    sources = []
    thumb: str | None = None
    for s in sources_raw:
        # The first source carrying a thumbnail wins, even if its origin is a
        # duplicate that is skipped for the link list below.
        if thumb is None and s.thumbnail_url:
            if _needs_proxy(s.thumbnail_url):
                thumb = _wrap_image_proxy(s.thumbnail_url, s.page_url)
            else:
                thumb = s.thumbnail_url
        if s.origin in seen_origins:
            continue
        seen_origins.add(s.origin)
        # Origins of the form "<prefix>:<name>" are labelled with the part
        # after the first colon; anything else is shown as-is.
        label = s.origin.split(":", 1)[1] if ":" in s.origin else s.origin
        sources.append({"label": label, "page_url": s.page_url, "quality": s.quality})

    studio = session.get(Studio, scene.studio_id) if scene.studio_id else None
    performers = (
        session.execute(
            select(Performer)
            .join(ScenePerformer, ScenePerformer.performer_id == Performer.id)
            .where(ScenePerformer.scene_id == scene.id)
            .order_by(ScenePerformer.position.asc().nullslast())
        )
        .scalars()
        .all()
    )
    tags = (
        session.execute(
            select(Tag)
            .join(SceneTag, SceneTag.tag_id == Tag.id)
            .where(SceneTag.scene_id == scene.id)
        )
        .scalars()
        .all()
    )

    return templates.TemplateResponse(
        request,
        "seo/scene.html",
        {
            "base_url": base_url(),
            "scene": scene,
            "studio": studio,
            "performers": performers,
            "tags": tags,
            "sources": sources,
            "thumb": thumb,
            "iso_duration": _iso_duration(scene.duration_sec),
            "canonical": f"{base_url()}/scene/{scene.id}",
        },
    )


@router.get("/", response_class=HTMLResponse)
def landing(
    request: Request,
    session: Annotated[Session, Depends(get_session)],
) -> HTMLResponse:
    """Home page — the crawl entry point: the 48 newest indexable scenes plus a CTA."""
    scenes = (
        session.execute(
            _indexable_scenes()
            .order_by(Scene.created_at.desc())
            .limit(48)
        )
        .scalars()
        .all()
    )
    cards = _scene_card_rows(session, [s.id for s in scenes])
    # Studios of the listed scenes, keyed by id, for labelling the cards.
    studios = {
        st.id: st
        for st in session.execute(
            select(Studio).where(Studio.id.in_({s.studio_id for s in scenes if s.studio_id}))
        ).scalars()
    }
    return templates.TemplateResponse(
        request,
        "seo/landing.html",
        {
            "base_url": base_url(),
            "scenes": scenes,
            "cards": cards,
            "studios": studios,
            "canonical": f"{base_url()}/",
        },
    )


@router.get("/get", response_class=HTMLResponse)
def get_app(request: Request) -> HTMLResponse:
    """Paid-traffic landing page, slimmed down for APK-install conversion.

    Per the original note the template (get.html) marks the page noindex so it
    does not compete in the SERP with the entity pages.
    """
    return templates.TemplateResponse(
        request,
        "seo/get.html",
        {"base_url": base_url(), "canonical": f"{base_url()}/get"},
    )


@router.get("/2257", response_class=HTMLResponse)
def page_2257(request: Request) -> HTMLResponse:
    """Render the static ``seo/page_2257.html`` template at ``/2257``."""
    return templates.TemplateResponse(
        request,
        "seo/page_2257.html",
        {"base_url": base_url(), "canonical": f"{base_url()}/2257"},
    )


# --- robots + sitemap -------------------------------------------------------------


@router.get("/{fname}.html", response_class=PlainTextResponse, include_in_schema=False)
def gsc_site_verification(fname: str) -> PlainTextResponse:
    """Serve the Google Search Console verification file ("HTML file" method).

    The expected token comes from the ``GOON_GSC_FILE`` environment variable
    (e.g. ``google67b49088b5416adc`` — without ``.html``). A request for
    ``/<token>.html`` is answered with the body
    ``google-site-verification: <token>.html``.

    Raises:
        HTTPException: 404 for any other ``*.html`` name, or when the
            variable is unset/empty.
    """
    expected = os.environ.get("GOON_GSC_FILE", "")
    if expected and fname == expected:
        return PlainTextResponse(f"google-site-verification: {fname}.html")
    raise HTTPException(status_code=404, detail="not found")


@router.get("/robots.txt", response_class=PlainTextResponse)
def robots() -> PlainTextResponse:
    """Serve robots.txt: allow everything except /proxy/ and /ui/, and point at the sitemap index."""
    body = (
        "User-agent: *\n"
        "Allow: /\n"
        "Disallow: /proxy/\n"
        "Disallow: /ui/\n"
        f"Sitemap: {base_url()}/sitemap.xml\n"
    )
    return PlainTextResponse(body)


def _count(session: Session, stmt) -> int:
    """Return the number of rows ``stmt`` would yield (``SELECT count(*)`` over it as a subquery)."""
    return session.execute(select(func.count()).select_from(stmt.subquery())).scalar_one()


def _studio_indexable_exists():
    """Build an EXISTS clause: the studio has >=1 indexable (live, non-blacklisted) scene.

    Applies the same criteria ``studio_page`` uses (live playback plus the
    blacklist conditions), so the studio sitemap never advertises a URL that
    would answer 404. The subquery references ``Studio.id`` and is meant to be
    used inside a statement that selects from ``Studio``.
    """
    sub = (
        select(1)
        .select_from(Scene)
        .where(Scene.studio_id == Studio.id)
        .where(_live_playback_exists(), *_not_blacklisted())
    )
    return exists(sub)


# XML namespace required on the root element by the sitemap protocol.
_SITEMAP_XMLNS = "http://www.sitemaps.org/schemas/sitemap/0.9"


def _sitemap_xml(root: str, item: str, urls) -> str:
    """Serialise ``urls`` as a sitemap document.

    Args:
        root: root element name (``sitemapindex`` or ``urlset``).
        item: per-entry element name (``sitemap`` or ``url``).
        urls: iterable of absolute URLs; each ends up in an escaped ``<loc>``.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        f'<{root} xmlns="{_SITEMAP_XMLNS}">',
    ]
    lines.extend(f"<{item}><loc>{escape(u)}</loc></{item}>" for u in urls)
    lines.append(f"</{root}>")
    return "\n".join(lines)


@router.get("/sitemap.xml")
def sitemap_index(session: Annotated[Session, Depends(get_session)]) -> Response:
    """Sitemap index — lists the paginated sub-sitemaps for each entity type.

    Every type gets at least one page (page 0), even when it currently has no
    indexable entries.
    """
    n_perf = _count(
        session,
        select(Performer.id).where(_performer_indexable_exists()),
    )
    n_studio = _count(
        session,
        select(Studio.id).where(_studio_indexable_exists()),
    )
    n_scene = _count(session, _indexable_scenes().with_only_columns(Scene.id))

    bu = base_url()
    locs: list[str] = []
    for kind, total in (("performers", n_perf), ("studios", n_studio), ("scenes", n_scene)):
        pages = max(1, -(-total // _SITEMAP_PAGE))  # ceil division, minimum one page
        locs.extend(f"{bu}/sitemap/{kind}-{p}.xml" for p in range(pages))
    return Response(_sitemap_xml("sitemapindex", "sitemap", locs), media_type="application/xml")


@router.get("/sitemap/{kind}-{page}.xml")
def sitemap_page(
    kind: str,
    page: int,
    session: Annotated[Session, Depends(get_session)],
) -> Response:
    """Serve one page of the sitemap for ``kind`` (``performers``, ``studios`` or ``scenes``).

    Entries are ordered by ``created_at`` ascending and paginated in chunks of
    ``_SITEMAP_PAGE``; ``page`` is zero-based.

    Raises:
        HTTPException: 404 for an unknown ``kind`` or a negative ``page``.
    """
    if page < 0:
        # A negative page would become a negative SQL OFFSET.
        raise HTTPException(status_code=404, detail="not found")

    bu = base_url()
    off = page * _SITEMAP_PAGE
    urls: list[str] = []
    if kind == "performers":
        rows = session.execute(
            select(Performer.slug)
            .where(_performer_indexable_exists())
            .order_by(Performer.created_at.asc())
            .offset(off)
            .limit(_SITEMAP_PAGE)
        ).scalars()
        urls = [f"{bu}/p/{slug}" for slug in rows]
    elif kind == "studios":
        rows = session.execute(
            select(Studio.slug)
            .where(_studio_indexable_exists())
            .order_by(Studio.created_at.asc())
            .offset(off)
            .limit(_SITEMAP_PAGE)
        ).scalars()
        urls = [f"{bu}/studio/{slug}" for slug in rows]
    elif kind == "scenes":
        rows = session.execute(
            _indexable_scenes()
            .with_only_columns(Scene.id)
            .order_by(Scene.created_at.asc())
            .offset(off)
            .limit(_SITEMAP_PAGE)
        ).scalars()
        urls = [f"{bu}/scene/{sid}" for sid in rows]
    else:
        raise HTTPException(status_code=404, detail="unknown sitemap kind")

    return Response(_sitemap_xml("urlset", "url", urls), media_type="application/xml")