diff --git a/alembic/versions/20260531_0019_taxonomy_scene_counts.py b/alembic/versions/20260531_0019_taxonomy_scene_counts.py new file mode 100644 index 0000000..6ccc5a3 --- /dev/null +++ b/alembic/versions/20260531_0019_taxonomy_scene_counts.py @@ -0,0 +1,54 @@ +"""taxonomy scene_count denormalization — tags / performers / studios + +Revision ID: 0019_taxonomy_scene_counts +Revises: 0018_movie_play_progress +Create Date: 2026-05-31 + +Perf fix (user-report 2026-05-31 "wolne ładowanie scen/favorites/tags"): baza urosła +do 1.69M scen / 6.3M scene_tags, a /tags?order=popular liczył scene_count dla KAŻDEGO +tagu na żywo (agregacja 6.3M scene_tags + EXISTS playback, external-merge sort 22MB) — +~4.3s, i to razy 2 (total + items). Analogicznie performers/studios + favorites. + +Denormalizujemy `scene_count` na tags/performers/studios. Worker przelicza je w tle +(`_job_refresh_taxonomy_counts`, co `GOON_SCHED_TAXONOMY_COUNTS_HOURS`=3h jednym +UPDATE...FROM). Endpointy czytają gotową kolumnę + ORDER BY indexed DESC → <20ms. + +scene_count = liczba scen z danym tagiem/performerem/studiem mających ≥1 ŻYWY +playback_source (dead_at IS NULL) — dokładnie ta sama definicja co dotychczasowe +live-aggregaty (has_live_playback filter w taxonomies.py / favorites.py). + +Counts są do ~3h nieświeże — dla "(123)" przy filtrze i sortu "popular" bez znaczenia. +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +revision: str = "0019_taxonomy_scene_counts" +down_revision: str | None = "0018_movie_play_progress" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +_TABLES = ("tags", "performers", "studios") + + +def upgrade() -> None: + for tbl in _TABLES: + op.add_column( + tbl, + sa.Column( + "scene_count", sa.Integer(), nullable=False, server_default="0" + ), + ) + # DESC index — ORDER BY scene_count DESC (sortowanie "popular"). + op.create_index( + f"ix_{tbl}_scene_count", + tbl, + [sa.text("scene_count DESC")], + ) + + +def downgrade() -> None: + for tbl in _TABLES: + op.drop_index(f"ix_{tbl}_scene_count", table_name=tbl) + op.drop_column(tbl, "scene_count") diff --git a/app/api/favorites.py b/app/api/favorites.py index f49e6ac..1d3216e 100644 --- a/app/api/favorites.py +++ b/app/api/favorites.py @@ -27,7 +27,7 @@ from typing import Annotated from fastapi import APIRouter, Depends, HTTPException, status from pydantic import BaseModel -from sqlalchemy import func, select +from sqlalchemy import select from sqlalchemy.orm import Session from app.auth import require_api_key @@ -77,25 +77,10 @@ def list_favorites( perf_ids = [perf.id for _, perf in rows] last_seen_by_perf = {fav.performer_id: fav.last_seen_at for fav, _ in rows} - # Batch: scene_count per performer — filtrujemy `has_live_playback` żeby badge - # `N scenes` zgadzał się z tym co widać w PerformerScenes (mobile filtruje - # `has_playback=true`). TPDB/StashDB sync wstawia metadata-only stubs które wlicz - # by się w 2062 dla Aletta Ocean ale w profilu pokazuje tylko 499 oglądalnych. - from sqlalchemy import and_, exists - _scene_count_live_playback = exists().where( - and_( - PlaybackSource.scene_id == ScenePerformer.scene_id, - PlaybackSource.dead_at.is_(None), - ) - ) - scene_counts: dict = dict( - session.execute( - select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id)) - .where(ScenePerformer.performer_id.in_(perf_ids)) - .where(_scene_count_live_playback) - .group_by(ScenePerformer.performer_id) - ).all() - ) + # scene_count: czytamy zdenormalizowany Performer.scene_count (refresh w tle przez + # _job_refresh_taxonomy_counts) — ta sama definicja co przed (sceny z żywym + # playback). Wcześniej grouped count z EXISTS playback per-request. Migracja 0019. + scene_counts: dict = {perf.id: perf.scene_count for _, perf in rows} # Batch: new_count per performer — sceny z created_at > last_seen_at favorite'a. # Każda performerka ma INNY last_seen_at, więc warunek per-row. Trick: GREATEST jest @@ -240,22 +225,8 @@ def list_favorite_studios( studio_ids = [st.id for _, st in rows] last_seen_by_studio = {fav.studio_id: fav.last_seen_at for fav, _ in rows} - # has_live_playback filter — patrz `list_favorites` (performers) wyżej. - from sqlalchemy import and_, exists - _studio_count_live_playback = exists().where( - and_( - PlaybackSource.scene_id == Scene.id, - PlaybackSource.dead_at.is_(None), - ) - ) - scene_counts: dict = dict( - session.execute( - select(Scene.studio_id, func.count(Scene.id)) - .where(Scene.studio_id.in_(studio_ids)) - .where(_studio_count_live_playback) - .group_by(Scene.studio_id) - ).all() - ) + # scene_count: zdenormalizowany Studio.scene_count (refresh w tle, migracja 0019). + scene_counts: dict = {st.id: st.scene_count for _, st in rows} new_counts: dict = {} if studio_ids: diff --git a/app/api/scenes.py b/app/api/scenes.py index 48922e4..e233f2f 100644 --- a/app/api/scenes.py +++ b/app/api/scenes.py @@ -40,6 +40,33 @@ router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(requ _VALID_SORTS = {"created_at", "release_date", "title", "studio"} +# TTL-cache dla count'u scen-z-żywym-playback (default lista bez filtra). Full-scan +# 1.69M scen + EXISTS ~950ms; liczba zmienia się wolno i jest przybliżona (header +# paginacji), więc 10-min cache w pamięci procesu API jest akceptowalny trade-off. +_DEFAULT_COUNT_CACHE: dict = {"ts": 0.0, "val": 0} +_DEFAULT_COUNT_TTL = 600.0 + + +def _default_scene_count(session: Session) -> int: + import time as _time + now = _time.monotonic() + if _DEFAULT_COUNT_CACHE["val"] and (now - _DEFAULT_COUNT_CACHE["ts"]) < _DEFAULT_COUNT_TTL: + return _DEFAULT_COUNT_CACHE["val"] + count_query = select(func.count()).select_from( + select(Scene.id).where( + exists( + select(1).where( + PlaybackSource.scene_id == Scene.id, + PlaybackSource.dead_at.is_(None), + ) + ) + ).subquery() + ) + total = session.execute(count_query).scalar_one() + _DEFAULT_COUNT_CACHE["ts"] = now + _DEFAULT_COUNT_CACHE["val"] = total + return total + def _split_csv(raw: str | None) -> list[str]: if not raw: @@ -283,18 +310,11 @@ def list_scenes( # przybliżoną górną granicą — co dla 400k scen i tak nie ma sensu reading dokładnie. if not include_stubs and not q and not studio_slug_list and not tags and not perf_id_strings: # Fast path: typowy default request (lista bez filtra) — count tylko po - # has_playback (single EXISTS, dobrze zindeksowany). - count_query = select(func.count()).select_from( - select(Scene.id).where( - exists( - select(1).where( - PlaybackSource.scene_id == Scene.id, - PlaybackSource.dead_at.is_(None), - ) - ) - ).subquery() - ) - total = session.execute(count_query).scalar_one() + # has_playback (single EXISTS). Mimo to przy 1.69M scen full-scan z EXISTS + # bierze ~950ms (zmierzone 2026-05-31), a liczba zmienia się wolno (ingest + # ~kilkadziesiąt scen/h) i jest z definicji przybliżona. TTL-cache 10 min: + # pierwszy request po wygaśnięciu płaci ~950ms, reszta czyta z pamięci. + total = _default_scene_count(session) else: total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one() diff --git a/app/api/taxonomies.py b/app/api/taxonomies.py index fe05f14..d1bb7ab 100644 --- a/app/api/taxonomies.py +++ b/app/api/taxonomies.py @@ -22,8 +22,7 @@ from app.db import get_session from app.models.movie import Movie, MovieTag from app.models.movie_playback_source import MoviePlaybackSource from app.models.performer import Performer -from app.models.playback_source import PlaybackSource -from app.models.scene import ScenePerformer, SceneTag +from app.models.scene import ScenePerformer from app.models.studio import Studio from app.models.tag import Tag @@ -108,7 +107,8 @@ def list_tags( if for_movies: # Movie tag count — zliczamy tylko Movies z ≥1 live MoviePlaybackSource. - # Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce). + # Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce). Movies są + # małe (~41k) więc live-aggregat OK; scenes path niżej używa denormalizacji. _movie_live = exists().where( and_( MoviePlaybackSource.movie_id == MovieTag.movie_id, @@ -121,48 +121,55 @@ def list_tags( .group_by(MovieTag.tag_id) .subquery() ) - else: - # has_live_playback filter — zliczamy tylko sceny które user faktycznie zobaczy - # (TPDB/StashDB metadata-only stubs są do mergowania, nie do oglądania). - _live_playback = exists().where( - and_( - PlaybackSource.scene_id == SceneTag.scene_id, - PlaybackSource.dead_at.is_(None), - ) + base = ( + select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count")) + .outerjoin(count_sub, count_sub.c.tag_id == Tag.id) ) - count_sub = ( - select(SceneTag.tag_id, func.count(SceneTag.scene_id).label("c")) - .where(_live_playback) - .group_by(SceneTag.tag_id) - .subquery() - ) - base = ( - select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count")) - .outerjoin(count_sub, count_sub.c.tag_id == Tag.id) - ) + if q: + base = base.where(Tag.name.ilike(f"%{q}%")) + if only_with_content: + base = base.where(count_sub.c.tag_id.is_not(None)) + total = session.execute( + select(func.count()).select_from(base.subquery()) + ).scalar_one() + if order in ("popular", "scene_count"): + ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc()) + else: + ordered = base.order_by(Tag.name.asc()) + rows = session.execute( + ordered.offset((page - 1) * per_page).limit(per_page) + ).all() + items = [ + TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c)) + for t, c in rows + ] + return TagListOut(items=items, total=total, page=page, per_page=per_page) + + # Scenes path — czyta zdenormalizowany Tag.scene_count (refresh w tle przez + # _job_refresh_taxonomy_counts). Wcześniej liczył agregat 6.3M scene_tags + + # EXISTS playback per-request (~4.3s ×2). Patrz migracja 0019. + base = select(Tag) if q: base = base.where(Tag.name.ilike(f"%{q}%")) if only_with_content: - # exists() w outerjoin nie inner-joinowałby pustych tagów. Dlatego osobny - # exists check: pasują tylko tagi z ≥1 w subquery. - base = base.where(count_sub.c.tag_id.is_not(None)) + base = base.where(Tag.scene_count > 0) total = session.execute( select(func.count()).select_from(base.subquery()) ).scalar_one() if order in ("popular", "scene_count"): - ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc()) + ordered = base.order_by(Tag.scene_count.desc(), Tag.name.asc()) else: ordered = base.order_by(Tag.name.asc()) - rows = session.execute( + tags_page = session.execute( ordered.offset((page - 1) * per_page).limit(per_page) - ).all() + ).scalars().all() items = [ - TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c)) - for t, c in rows + TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=t.scene_count) + for t in tags_page ] return TagListOut(items=items, total=total, page=page, per_page=per_page) @@ -178,23 +185,9 @@ def list_performers( if order not in ("scene_count", "popular", "name"): raise HTTPException(status_code=400, detail="order must be 'scene_count' or 'name'") - # has_live_playback filter — patrz list_tags wyżej. - _perf_live_playback = exists().where( - and_( - PlaybackSource.scene_id == ScenePerformer.scene_id, - PlaybackSource.dead_at.is_(None), - ) - ) - count_sub = ( - select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id).label("c")) - .where(_perf_live_playback) - .group_by(ScenePerformer.performer_id) - .subquery() - ) - base = ( - select(Performer, func.coalesce(count_sub.c.c, 0).label("scene_count")) - .outerjoin(count_sub, count_sub.c.performer_id == Performer.id) - ) + # Czyta zdenormalizowany Performer.scene_count (refresh w tle). Wcześniej agregat + # 3M scene_performers + EXISTS playback per-request. Patrz migracja 0019. + base = select(Performer) if q: base = base.where(Performer.name_normalized.ilike(f"%{q.lower()}%")) @@ -204,14 +197,14 @@ def list_performers( if order in ("scene_count", "popular"): ordered = base.order_by( - func.coalesce(count_sub.c.c, 0).desc(), Performer.canonical_name.asc() + Performer.scene_count.desc(), Performer.canonical_name.asc() ) else: ordered = base.order_by(Performer.canonical_name.asc()) - rows = session.execute( + perfs_page = session.execute( ordered.offset((page - 1) * per_page).limit(per_page) - ).all() + ).scalars().all() items = [ PerformerCount( @@ -219,9 +212,9 @@ def list_performers( canonical_name=p.canonical_name, slug=p.slug, gender=p.gender.value if p.gender else None, - scene_count=int(c), + scene_count=p.scene_count, ) - for p, c in rows + for p in perfs_page ] return PerformerListOut(items=items, total=total, page=page, per_page=per_page) @@ -248,6 +241,7 @@ def list_studios( raise HTTPException(status_code=400, detail="order must be 'name' or 'scene_count'") if for_movies: + # Movies małe (~41k) — live aggregat OK. Scenes path niżej = denormalizacja. _movie_live = exists().where( and_( MoviePlaybackSource.movie_id == Movie.id, @@ -261,42 +255,50 @@ def list_studios( .group_by(Movie.studio_id) .subquery() ) - else: - # has_live_playback filter — patrz list_tags wyżej. - _studio_live_playback = exists().where( - and_( - PlaybackSource.scene_id == Scene.id, - PlaybackSource.dead_at.is_(None), - ) + base = ( + select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count")) + .outerjoin(count_sub, count_sub.c.studio_id == Studio.id) ) - count_sub = ( - select(Scene.studio_id, func.count(Scene.id).label("c")) - .where(Scene.studio_id.is_not(None)) - .where(_studio_live_playback) - .group_by(Scene.studio_id) - .subquery() - ) - base = ( - select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count")) - .outerjoin(count_sub, count_sub.c.studio_id == Studio.id) - ) + if q: + base = base.where(Studio.name.ilike(f"%{q}%")) + if only_with_content: + base = base.where(count_sub.c.studio_id.is_not(None)) + total = session.execute( + select(func.count()).select_from(base.subquery()) + ).scalar_one() + if order in ("scene_count", "popular"): + ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc()) + else: + ordered = base.order_by(Studio.name_normalized.asc()) + rows = session.execute( + ordered.offset((page - 1) * per_page).limit(per_page) + ).all() + items = [ + StudioCount(id=s.id, name=s.name, slug=s.slug, network=s.network, scene_count=int(c)) + for s, c in rows + ] + return StudioListOut(items=items, total=total, page=page, per_page=per_page) + + # Scenes path — czyta zdenormalizowany Studio.scene_count (refresh w tle). + # Wcześniej agregat 1.69M scenes + EXISTS playback per-request. Patrz migracja 0019. + base = select(Studio) if q: base = base.where(Studio.name.ilike(f"%{q}%")) if only_with_content: - base = base.where(count_sub.c.studio_id.is_not(None)) + base = base.where(Studio.scene_count > 0) total = session.execute( select(func.count()).select_from(base.subquery()) ).scalar_one() if order in ("scene_count", "popular"): - ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc()) + ordered = base.order_by(Studio.scene_count.desc(), Studio.name.asc()) else: ordered = base.order_by(Studio.name_normalized.asc()) - rows = session.execute( + studios_page = session.execute( ordered.offset((page - 1) * per_page).limit(per_page) - ).all() + ).scalars().all() items = [ StudioCount( @@ -304,9 +306,9 @@ def list_studios( name=s.name, slug=s.slug, network=s.network, - scene_count=int(c), + scene_count=s.scene_count, ) - for s, c in rows + for s in studios_page ] return StudioListOut(items=items, total=total, page=page, per_page=per_page) diff --git a/app/config.py b/app/config.py index f28eaf5..b329114 100644 --- a/app/config.py +++ b/app/config.py @@ -88,6 +88,13 @@ class Settings(BaseSettings): sched_bulk_dedup_hours: int = Field( default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS" ) + # Taxonomy scene_count refresh — przelicza denormalizowane liczniki scen na + # tags/performers/studios (hot-path /tags|/performers|/studios|/favorites czyta + # gotową kolumnę zamiast agregować 6.3M scene_tags per-request). 3h cadence — + # counts do tego stale, dla sortu "popular" + badge "(N)" bez znaczenia. 0 = off. + sched_taxonomy_counts_hours: int = Field( + default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS" + ) # Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens # w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log). diff --git a/app/models/performer.py b/app/models/performer.py index ce16c7b..07c966e 100644 --- a/app/models/performer.py +++ b/app/models/performer.py @@ -37,6 +37,11 @@ class Performer(UUIDPKMixin, TimestampMixin, Base): search_run_count: Mapped[int] = mapped_column( Integer, nullable=False, default=0, server_default="0" ) + # Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja + # 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites. + scene_count: Mapped[int] = mapped_column( + Integer, nullable=False, default=0, server_default="0" + ) class PerformerAlias(Base): diff --git a/app/models/studio.py b/app/models/studio.py index 88c4ae1..a78f938 100644 --- a/app/models/studio.py +++ b/app/models/studio.py @@ -1,7 +1,7 @@ import uuid from datetime import datetime -from sqlalchemy import DateTime, Float, ForeignKey, String, UniqueConstraint, func +from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, UniqueConstraint, func from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import Mapped, mapped_column @@ -19,6 +19,11 @@ class Studio(UUIDPKMixin, TimestampMixin, Base): ) network: Mapped[str | None] = mapped_column(String(256)) homepage_url: Mapped[str | None] = mapped_column(String(512)) + # Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja + # 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites. + scene_count: Mapped[int] = mapped_column( + Integer, nullable=False, default=0, server_default="0" + ) class StudioAlias(Base): diff --git a/app/models/tag.py b/app/models/tag.py index 33882bc..0181a65 100644 --- a/app/models/tag.py +++ b/app/models/tag.py @@ -1,6 +1,6 @@ import uuid -from sqlalchemy import ForeignKey, String +from sqlalchemy import ForeignKey, Integer, String from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import Mapped, mapped_column @@ -16,3 +16,9 @@ class Tag(UUIDPKMixin, TimestampMixin, Base): UUID(as_uuid=True), ForeignKey("tags.id", ondelete="SET NULL") ) description: Mapped[str | None] = mapped_column(String(1024)) + # Denormalizowany licznik scen z żywym playback (refresh w tle przez + # _job_refresh_taxonomy_counts). Patrz migracja 0019. NIE źródło prawdy — + # do sortu "popular" + badge "(N)" w filtrach. + scene_count: Mapped[int] = mapped_column( + Integer, nullable=False, default=0, server_default="0" + ) diff --git a/app/scheduler/jobs.py b/app/scheduler/jobs.py index f5b241e..7469eb1 100644 --- a/app/scheduler/jobs.py +++ b/app/scheduler/jobs.py @@ -130,6 +130,22 @@ def _job_movie_ingest() -> None: log.exception("[scheduler] movie ingest %s failed", name) +def _job_refresh_taxonomy_counts() -> None: + """Przelicza denormalizowane scene_count na tags/performers/studios. + + Hot-path /tags|/performers|/studios|/favorites czyta gotową kolumnę zamiast + agregować 6.3M scene_tags per-request (~4.3s → <20ms). Patrz migracja 0019 + + app/scheduler/taxonomy_counts.py. + """ + log.info("[scheduler] taxonomy counts refresh starting") + try: + from app.scheduler.taxonomy_counts import refresh_taxonomy_counts + changed = refresh_taxonomy_counts() + log.info("[scheduler] taxonomy counts refresh done: %s", changed) + except Exception: + log.exception("[scheduler] taxonomy counts refresh failed") + + def _job_bulk_dedup_performers() -> None: """Pair-wise dedup po performer overlap — safety net dla duplikatów które resolver-time scoring nie złapał. @@ -257,6 +273,17 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler: ) log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"]) + if cfg.get("taxonomy_counts_hours"): + sched.add_job( + _job_refresh_taxonomy_counts, + IntervalTrigger(hours=cfg["taxonomy_counts_hours"], start_date=INTERVAL_ANCHOR), + id="taxonomy_counts", + replace_existing=True, + max_instances=1, + coalesce=True, + ) + log.info("scheduler: taxonomy-counts refresh every %dh", cfg["taxonomy_counts_hours"]) + if cfg.get("performer_continuous_seconds"): refresh_days = cfg.get("performer_continuous_refresh_days") or 30 seconds = cfg["performer_continuous_seconds"] @@ -296,4 +323,7 @@ DEFAULT_CONFIG: dict[str, Any] = { # każdego co 30 dni. "performer_continuous_seconds": 15, "performer_continuous_refresh_days": 30, + # Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers| + # /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia. + "taxonomy_counts_hours": 3, } diff --git a/app/scheduler/taxonomy_counts.py b/app/scheduler/taxonomy_counts.py new file mode 100644 index 0000000..6861254 --- /dev/null +++ b/app/scheduler/taxonomy_counts.py @@ -0,0 +1,89 @@ +"""Refresh denormalizowanych `scene_count` na tags / performers / studios. + +Liczniki są utrzymywane w tle (zamiast liczone per-request) bo agregacja po 6.3M +scene_tags / 3M scene_performers z EXISTS do 1.15M playback_sources zajmuje ~4.3s — +nie do zaakceptowania w hot-path UI (/tags, /performers, /studios, /favorites). + +Definicja (identyczna z dotychczasowym has_live_playback filtrem w taxonomies.py): + scene_count = liczba scen z danym tagiem/performerem/studiem mających ≥1 + playback_source z dead_at IS NULL. + +Każdy UPDATE robi pełny LEFT JOIN (tag/performer/studio) ⨝ agregat → ustawia 0 dla +sierot. `IS DISTINCT FROM` pomija przepisywanie niezmienionych wierszy (mniej WAL/bloat). +Całość ~5-10s, leci co kilka godzin — counts do tego stale, co dla sortu "popular" i +badge "(N)" jest bez znaczenia. +""" +from __future__ import annotations + +import logging + +from sqlalchemy import text + +from app.db import session_scope + +log = logging.getLogger(__name__) + +# Wspólny predykat: scena ma ≥1 żywy playback_source. +_LIVE = ( + "EXISTS (SELECT 1 FROM playback_sources ps " + "WHERE ps.scene_id = {scene_col} AND ps.dead_at IS NULL)" +) + +_TAGS_SQL = text( + f""" + UPDATE tags t SET scene_count = COALESCE(a.c, 0) + FROM tags base + LEFT JOIN ( + SELECT st.tag_id, count(*) AS c + FROM scene_tags st + WHERE {_LIVE.format(scene_col="st.scene_id")} + GROUP BY st.tag_id + ) a ON a.tag_id = base.id + WHERE t.id = base.id AND t.scene_count IS DISTINCT FROM COALESCE(a.c, 0) + """ +) + +_PERFORMERS_SQL = text( + f""" + UPDATE performers p SET scene_count = COALESCE(a.c, 0) + FROM performers base + LEFT JOIN ( + SELECT sp.performer_id, count(*) AS c + FROM scene_performers sp + WHERE {_LIVE.format(scene_col="sp.scene_id")} + GROUP BY sp.performer_id + ) a ON a.performer_id = base.id + WHERE p.id = base.id AND p.scene_count IS DISTINCT FROM COALESCE(a.c, 0) + """ +) + +_STUDIOS_SQL = text( + f""" + UPDATE studios s SET scene_count = COALESCE(a.c, 0) + FROM studios base + LEFT JOIN ( + SELECT sc.studio_id, count(*) AS c + FROM scenes sc + WHERE sc.studio_id IS NOT NULL AND {_LIVE.format(scene_col="sc.id")} + GROUP BY sc.studio_id + ) a ON a.studio_id = base.id + WHERE s.id = base.id AND s.scene_count IS DISTINCT FROM COALESCE(a.c, 0) + """ +) + + +def refresh_taxonomy_counts() -> dict[str, int]: + """Przelicza scene_count dla tags/performers/studios. Zwraca rowcount per tabela + (ile wierszy faktycznie się zmieniło).""" + out: dict[str, int] = {} + with session_scope() as session: + for name, stmt in ( + ("tags", _TAGS_SQL), + ("performers", _PERFORMERS_SQL), + ("studios", _STUDIOS_SQL), + ): + res = session.execute(stmt) + out[name] = res.rowcount or 0 + # Commit per-tabela — długie transakcje trzymałyby locki na hot tables. + session.commit() + return out diff --git a/app/scheduler/worker.py b/app/scheduler/worker.py index 9873cf7..acf08ea 100644 --- a/app/scheduler/worker.py +++ b/app/scheduler/worker.py @@ -164,6 +164,8 @@ def run_forever() -> int: # Bulk-dedup performers — safety net dla duplikatów które resolver # pominął (np. freshporno scen przed fixem release_date). Run 12h. "bulk_dedup_hours": getattr(settings, "sched_bulk_dedup_hours", 12) or None, + # Taxonomy scene_count refresh — denormalizacja liczników (perf fix 0019). + "taxonomy_counts_hours": getattr(settings, "sched_taxonomy_counts_hours", 3) or None, } sched = build_scheduler(cfg) log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))