perf(taxonomy): denormalize scene_count for tags/performers/studios

Counts for /tags, /performers, /studios and /favorites were computed live
per-request by aggregating scene_tags / scene_performers with an EXISTS to
playback_sources. As the catalog grew to ~1.7M scenes (6.3M scene_tags) this
ran ~4.3s for /tags?order=popular (x2 incl. the total count) and ~950ms for
the default /scenes count, making those screens load in several seconds.

- migration 0019: add scene_count (+ DESC index) to tags/performers/studios
- background job _job_refresh_taxonomy_counts (every 3h) recomputes the counts
  in one UPDATE..FROM each (IS DISTINCT FROM to skip unchanged rows)
- /tags, /performers, /studios scenes path now read the column + ORDER BY the
  indexed scene_count; for_movies paths keep live aggregation (small tables)
- favorites read denormalized scene_count instead of a grouped EXISTS aggregate
- /scenes default count: 10-min in-process TTL cache (header is approximate)

Measured: /tags?order=popular&per_page=500 ~8s -> 66ms incl. serialization.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-05-31 17:53:48 +02:00
parent 9c49a69a66
commit 2163fee245
11 changed files with 316 additions and 125 deletions

View file

@ -0,0 +1,54 @@
"""taxonomy scene_count denormalization — tags / performers / studios
Revision ID: 0019_taxonomy_scene_counts
Revises: 0018_movie_play_progress
Create Date: 2026-05-31
Perf fix (user-report 2026-05-31 "wolne ładowanie scen/favorites/tags"): baza urosła
do 1.69M scen / 6.3M scene_tags, a /tags?order=popular liczył scene_count dla KAŻDEGO
tagu na żywo (agregacja 6.3M scene_tags + EXISTS playback, external-merge sort 22MB)
~4.3s, i to razy 2 (total + items). Analogicznie performers/studios + favorites.
Denormalizujemy `scene_count` na tags/performers/studios. Worker przelicza je w tle
(`_job_refresh_taxonomy_counts`, co `GOON_SCHED_TAXONOMY_COUNTS_HOURS`=3h jednym
UPDATE...FROM). Endpointy czytają gotową kolumnę + ORDER BY indexed DESC <20ms.
scene_count = liczba scen z danym tagiem/performerem/studiem mających 1 ŻYWY
playback_source (dead_at IS NULL) dokładnie ta sama definicja co dotychczasowe
live-aggregaty (has_live_playback filter w taxonomies.py / favorites.py).
Counts do ~3h nieświeże dla "(123)" przy filtrze i sortu "popular" bez znaczenia.
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "0019_taxonomy_scene_counts"
down_revision: str | None = "0018_movie_play_progress"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
_TABLES = ("tags", "performers", "studios")
def upgrade() -> None:
for tbl in _TABLES:
op.add_column(
tbl,
sa.Column(
"scene_count", sa.Integer(), nullable=False, server_default="0"
),
)
# DESC index — ORDER BY scene_count DESC (sortowanie "popular").
op.create_index(
f"ix_{tbl}_scene_count",
tbl,
[sa.text("scene_count DESC")],
)
def downgrade() -> None:
for tbl in _TABLES:
op.drop_index(f"ix_{tbl}_scene_count", table_name=tbl)
op.drop_column(tbl, "scene_count")

View file

@ -27,7 +27,7 @@ from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy import func, select
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.auth import require_api_key
@ -77,25 +77,10 @@ def list_favorites(
perf_ids = [perf.id for _, perf in rows]
last_seen_by_perf = {fav.performer_id: fav.last_seen_at for fav, _ in rows}
# Batch: scene_count per performer — filtrujemy `has_live_playback` żeby badge
# `N scenes` zgadzał się z tym co widać w PerformerScenes (mobile filtruje
# `has_playback=true`). TPDB/StashDB sync wstawia metadata-only stubs które wlicz
# by się w 2062 dla Aletta Ocean ale w profilu pokazuje tylko 499 oglądalnych.
from sqlalchemy import and_, exists
_scene_count_live_playback = exists().where(
and_(
PlaybackSource.scene_id == ScenePerformer.scene_id,
PlaybackSource.dead_at.is_(None),
)
)
scene_counts: dict = dict(
session.execute(
select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
.where(ScenePerformer.performer_id.in_(perf_ids))
.where(_scene_count_live_playback)
.group_by(ScenePerformer.performer_id)
).all()
)
# scene_count: czytamy zdenormalizowany Performer.scene_count (refresh w tle przez
# _job_refresh_taxonomy_counts) — ta sama definicja co przed (sceny z żywym
# playback). Wcześniej grouped count z EXISTS playback per-request. Migracja 0019.
scene_counts: dict = {perf.id: perf.scene_count for _, perf in rows}
# Batch: new_count per performer — sceny z created_at > last_seen_at favorite'a.
# Każda performerka ma INNY last_seen_at, więc warunek per-row. Trick: GREATEST jest
@ -240,22 +225,8 @@ def list_favorite_studios(
studio_ids = [st.id for _, st in rows]
last_seen_by_studio = {fav.studio_id: fav.last_seen_at for fav, _ in rows}
# has_live_playback filter — patrz `list_favorites` (performers) wyżej.
from sqlalchemy import and_, exists
_studio_count_live_playback = exists().where(
and_(
PlaybackSource.scene_id == Scene.id,
PlaybackSource.dead_at.is_(None),
)
)
scene_counts: dict = dict(
session.execute(
select(Scene.studio_id, func.count(Scene.id))
.where(Scene.studio_id.in_(studio_ids))
.where(_studio_count_live_playback)
.group_by(Scene.studio_id)
).all()
)
# scene_count: zdenormalizowany Studio.scene_count (refresh w tle, migracja 0019).
scene_counts: dict = {st.id: st.scene_count for _, st in rows}
new_counts: dict = {}
if studio_ids:

View file

@ -40,6 +40,33 @@ router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(requ
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}
# TTL-cache dla count'u scen-z-żywym-playback (default lista bez filtra). Full-scan
# 1.69M scen + EXISTS ~950ms; liczba zmienia się wolno i jest przybliżona (header
# paginacji), więc 10-min cache w pamięci procesu API jest akceptowalny trade-off.
_DEFAULT_COUNT_CACHE: dict = {"ts": 0.0, "val": 0}
_DEFAULT_COUNT_TTL = 600.0
def _default_scene_count(session: Session) -> int:
import time as _time
now = _time.monotonic()
if _DEFAULT_COUNT_CACHE["val"] and (now - _DEFAULT_COUNT_CACHE["ts"]) < _DEFAULT_COUNT_TTL:
return _DEFAULT_COUNT_CACHE["val"]
count_query = select(func.count()).select_from(
select(Scene.id).where(
exists(
select(1).where(
PlaybackSource.scene_id == Scene.id,
PlaybackSource.dead_at.is_(None),
)
)
).subquery()
)
total = session.execute(count_query).scalar_one()
_DEFAULT_COUNT_CACHE["ts"] = now
_DEFAULT_COUNT_CACHE["val"] = total
return total
def _split_csv(raw: str | None) -> list[str]:
if not raw:
@ -283,18 +310,11 @@ def list_scenes(
# przybliżoną górną granicą — co dla 400k scen i tak nie ma sensu reading dokładnie.
if not include_stubs and not q and not studio_slug_list and not tags and not perf_id_strings:
# Fast path: typowy default request (lista bez filtra) — count tylko po
# has_playback (single EXISTS, dobrze zindeksowany).
count_query = select(func.count()).select_from(
select(Scene.id).where(
exists(
select(1).where(
PlaybackSource.scene_id == Scene.id,
PlaybackSource.dead_at.is_(None),
)
)
).subquery()
)
total = session.execute(count_query).scalar_one()
# has_playback (single EXISTS). Mimo to przy 1.69M scen full-scan z EXISTS
# bierze ~950ms (zmierzone 2026-05-31), a liczba zmienia się wolno (ingest
# ~kilkadziesiąt scen/h) i jest z definicji przybliżona. TTL-cache 10 min:
# pierwszy request po wygaśnięciu płaci ~950ms, reszta czyta z pamięci.
total = _default_scene_count(session)
else:
total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()

View file

@ -22,8 +22,7 @@ from app.db import get_session
from app.models.movie import Movie, MovieTag
from app.models.movie_playback_source import MoviePlaybackSource
from app.models.performer import Performer
from app.models.playback_source import PlaybackSource
from app.models.scene import ScenePerformer, SceneTag
from app.models.scene import ScenePerformer
from app.models.studio import Studio
from app.models.tag import Tag
@ -108,7 +107,8 @@ def list_tags(
if for_movies:
# Movie tag count — zliczamy tylko Movies z ≥1 live MoviePlaybackSource.
# Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce).
# Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce). Movies są
# małe (~41k) więc live-aggregat OK; scenes path niżej używa denormalizacji.
_movie_live = exists().where(
and_(
MoviePlaybackSource.movie_id == MovieTag.movie_id,
@ -121,21 +121,6 @@ def list_tags(
.group_by(MovieTag.tag_id)
.subquery()
)
else:
# has_live_playback filter — zliczamy tylko sceny które user faktycznie zobaczy
# (TPDB/StashDB metadata-only stubs są do mergowania, nie do oglądania).
_live_playback = exists().where(
and_(
PlaybackSource.scene_id == SceneTag.scene_id,
PlaybackSource.dead_at.is_(None),
)
)
count_sub = (
select(SceneTag.tag_id, func.count(SceneTag.scene_id).label("c"))
.where(_live_playback)
.group_by(SceneTag.tag_id)
.subquery()
)
base = (
select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count"))
.outerjoin(count_sub, count_sub.c.tag_id == Tag.id)
@ -143,26 +128,48 @@ def list_tags(
if q:
base = base.where(Tag.name.ilike(f"%{q}%"))
if only_with_content:
# exists() w outerjoin nie inner-joinowałby pustych tagów. Dlatego osobny
# exists check: pasują tylko tagi z ≥1 w subquery.
base = base.where(count_sub.c.tag_id.is_not(None))
total = session.execute(
select(func.count()).select_from(base.subquery())
).scalar_one()
if order in ("popular", "scene_count"):
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc())
else:
ordered = base.order_by(Tag.name.asc())
rows = session.execute(
ordered.offset((page - 1) * per_page).limit(per_page)
).all()
items = [
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
for t, c in rows
]
return TagListOut(items=items, total=total, page=page, per_page=per_page)
# Scenes path — czyta zdenormalizowany Tag.scene_count (refresh w tle przez
# _job_refresh_taxonomy_counts). Wcześniej liczył agregat 6.3M scene_tags +
# EXISTS playback per-request (~4.3s ×2). Patrz migracja 0019.
base = select(Tag)
if q:
base = base.where(Tag.name.ilike(f"%{q}%"))
if only_with_content:
base = base.where(Tag.scene_count > 0)
total = session.execute(
select(func.count()).select_from(base.subquery())
).scalar_one()
if order in ("popular", "scene_count"):
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc())
ordered = base.order_by(Tag.scene_count.desc(), Tag.name.asc())
else:
ordered = base.order_by(Tag.name.asc())
rows = session.execute(
tags_page = session.execute(
ordered.offset((page - 1) * per_page).limit(per_page)
).all()
).scalars().all()
items = [
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
for t, c in rows
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=t.scene_count)
for t in tags_page
]
return TagListOut(items=items, total=total, page=page, per_page=per_page)
@ -178,23 +185,9 @@ def list_performers(
if order not in ("scene_count", "popular", "name"):
raise HTTPException(status_code=400, detail="order must be 'scene_count' or 'name'")
# has_live_playback filter — patrz list_tags wyżej.
_perf_live_playback = exists().where(
and_(
PlaybackSource.scene_id == ScenePerformer.scene_id,
PlaybackSource.dead_at.is_(None),
)
)
count_sub = (
select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id).label("c"))
.where(_perf_live_playback)
.group_by(ScenePerformer.performer_id)
.subquery()
)
base = (
select(Performer, func.coalesce(count_sub.c.c, 0).label("scene_count"))
.outerjoin(count_sub, count_sub.c.performer_id == Performer.id)
)
# Czyta zdenormalizowany Performer.scene_count (refresh w tle). Wcześniej agregat
# 3M scene_performers + EXISTS playback per-request. Patrz migracja 0019.
base = select(Performer)
if q:
base = base.where(Performer.name_normalized.ilike(f"%{q.lower()}%"))
@ -204,14 +197,14 @@ def list_performers(
if order in ("scene_count", "popular"):
ordered = base.order_by(
func.coalesce(count_sub.c.c, 0).desc(), Performer.canonical_name.asc()
Performer.scene_count.desc(), Performer.canonical_name.asc()
)
else:
ordered = base.order_by(Performer.canonical_name.asc())
rows = session.execute(
perfs_page = session.execute(
ordered.offset((page - 1) * per_page).limit(per_page)
).all()
).scalars().all()
items = [
PerformerCount(
@ -219,9 +212,9 @@ def list_performers(
canonical_name=p.canonical_name,
slug=p.slug,
gender=p.gender.value if p.gender else None,
scene_count=int(c),
scene_count=p.scene_count,
)
for p, c in rows
for p in perfs_page
]
return PerformerListOut(items=items, total=total, page=page, per_page=per_page)
@ -248,6 +241,7 @@ def list_studios(
raise HTTPException(status_code=400, detail="order must be 'name' or 'scene_count'")
if for_movies:
# Movies małe (~41k) — live aggregat OK. Scenes path niżej = denormalizacja.
_movie_live = exists().where(
and_(
MoviePlaybackSource.movie_id == Movie.id,
@ -261,21 +255,6 @@ def list_studios(
.group_by(Movie.studio_id)
.subquery()
)
else:
# has_live_playback filter — patrz list_tags wyżej.
_studio_live_playback = exists().where(
and_(
PlaybackSource.scene_id == Scene.id,
PlaybackSource.dead_at.is_(None),
)
)
count_sub = (
select(Scene.studio_id, func.count(Scene.id).label("c"))
.where(Scene.studio_id.is_not(None))
.where(_studio_live_playback)
.group_by(Scene.studio_id)
.subquery()
)
base = (
select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count"))
.outerjoin(count_sub, count_sub.c.studio_id == Studio.id)
@ -284,19 +263,42 @@ def list_studios(
base = base.where(Studio.name.ilike(f"%{q}%"))
if only_with_content:
base = base.where(count_sub.c.studio_id.is_not(None))
total = session.execute(
select(func.count()).select_from(base.subquery())
).scalar_one()
if order in ("scene_count", "popular"):
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc())
else:
ordered = base.order_by(Studio.name_normalized.asc())
rows = session.execute(
ordered.offset((page - 1) * per_page).limit(per_page)
).all()
items = [
StudioCount(id=s.id, name=s.name, slug=s.slug, network=s.network, scene_count=int(c))
for s, c in rows
]
return StudioListOut(items=items, total=total, page=page, per_page=per_page)
# Scenes path — czyta zdenormalizowany Studio.scene_count (refresh w tle).
# Wcześniej agregat 1.69M scenes + EXISTS playback per-request. Patrz migracja 0019.
base = select(Studio)
if q:
base = base.where(Studio.name.ilike(f"%{q}%"))
if only_with_content:
base = base.where(Studio.scene_count > 0)
total = session.execute(
select(func.count()).select_from(base.subquery())
).scalar_one()
if order in ("scene_count", "popular"):
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc())
ordered = base.order_by(Studio.scene_count.desc(), Studio.name.asc())
else:
ordered = base.order_by(Studio.name_normalized.asc())
rows = session.execute(
studios_page = session.execute(
ordered.offset((page - 1) * per_page).limit(per_page)
).all()
).scalars().all()
items = [
StudioCount(
@ -304,9 +306,9 @@ def list_studios(
name=s.name,
slug=s.slug,
network=s.network,
scene_count=int(c),
scene_count=s.scene_count,
)
for s, c in rows
for s in studios_page
]
return StudioListOut(items=items, total=total, page=page, per_page=per_page)

View file

@ -88,6 +88,13 @@ class Settings(BaseSettings):
sched_bulk_dedup_hours: int = Field(
default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
)
# Taxonomy scene_count refresh — przelicza denormalizowane liczniki scen na
# tags/performers/studios (hot-path /tags|/performers|/studios|/favorites czyta
# gotową kolumnę zamiast agregować 6.3M scene_tags per-request). 3h cadence —
# counts do tego stale, dla sortu "popular" + badge "(N)" bez znaczenia. 0 = off.
sched_taxonomy_counts_hours: int = Field(
default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
)
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).

View file

@ -37,6 +37,11 @@ class Performer(UUIDPKMixin, TimestampMixin, Base):
search_run_count: Mapped[int] = mapped_column(
Integer, nullable=False, default=0, server_default="0"
)
# Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja
# 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites.
scene_count: Mapped[int] = mapped_column(
Integer, nullable=False, default=0, server_default="0"
)
class PerformerAlias(Base):

View file

@ -1,7 +1,7 @@
import uuid
from datetime import datetime
from sqlalchemy import DateTime, Float, ForeignKey, String, UniqueConstraint, func
from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, UniqueConstraint, func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column
@ -19,6 +19,11 @@ class Studio(UUIDPKMixin, TimestampMixin, Base):
)
network: Mapped[str | None] = mapped_column(String(256))
homepage_url: Mapped[str | None] = mapped_column(String(512))
# Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja
# 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites.
scene_count: Mapped[int] = mapped_column(
Integer, nullable=False, default=0, server_default="0"
)
class StudioAlias(Base):

View file

@ -1,6 +1,6 @@
import uuid
from sqlalchemy import ForeignKey, String
from sqlalchemy import ForeignKey, Integer, String
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column
@ -16,3 +16,9 @@ class Tag(UUIDPKMixin, TimestampMixin, Base):
UUID(as_uuid=True), ForeignKey("tags.id", ondelete="SET NULL")
)
description: Mapped[str | None] = mapped_column(String(1024))
# Denormalizowany licznik scen z żywym playback (refresh w tle przez
# _job_refresh_taxonomy_counts). Patrz migracja 0019. NIE źródło prawdy —
# do sortu "popular" + badge "(N)" w filtrach.
scene_count: Mapped[int] = mapped_column(
Integer, nullable=False, default=0, server_default="0"
)

View file

@ -130,6 +130,22 @@ def _job_movie_ingest() -> None:
log.exception("[scheduler] movie ingest %s failed", name)
def _job_refresh_taxonomy_counts() -> None:
"""Przelicza denormalizowane scene_count na tags/performers/studios.
Hot-path /tags|/performers|/studios|/favorites czyta gotową kolumnę zamiast
agregować 6.3M scene_tags per-request (~4.3s <20ms). Patrz migracja 0019 +
app/scheduler/taxonomy_counts.py.
"""
log.info("[scheduler] taxonomy counts refresh starting")
try:
from app.scheduler.taxonomy_counts import refresh_taxonomy_counts
changed = refresh_taxonomy_counts()
log.info("[scheduler] taxonomy counts refresh done: %s", changed)
except Exception:
log.exception("[scheduler] taxonomy counts refresh failed")
def _job_bulk_dedup_performers() -> None:
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
resolver-time scoring nie złapał.
@ -257,6 +273,17 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
)
log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"])
if cfg.get("taxonomy_counts_hours"):
sched.add_job(
_job_refresh_taxonomy_counts,
IntervalTrigger(hours=cfg["taxonomy_counts_hours"], start_date=INTERVAL_ANCHOR),
id="taxonomy_counts",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: taxonomy-counts refresh every %dh", cfg["taxonomy_counts_hours"])
if cfg.get("performer_continuous_seconds"):
refresh_days = cfg.get("performer_continuous_refresh_days") or 30
seconds = cfg["performer_continuous_seconds"]
@ -296,4 +323,7 @@ DEFAULT_CONFIG: dict[str, Any] = {
# każdego co 30 dni.
"performer_continuous_seconds": 15,
"performer_continuous_refresh_days": 30,
# Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers|
# /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia.
"taxonomy_counts_hours": 3,
}

View file

@ -0,0 +1,89 @@
"""Refresh denormalizowanych `scene_count` na tags / performers / studios.
Liczniki utrzymywane w tle (zamiast liczone per-request) bo agregacja po 6.3M
scene_tags / 3M scene_performers z EXISTS do 1.15M playback_sources zajmuje ~4.3s
nie do zaakceptowania w hot-path UI (/tags, /performers, /studios, /favorites).
Definicja (identyczna z dotychczasowym has_live_playback filtrem w taxonomies.py):
scene_count = liczba scen z danym tagiem/performerem/studiem mających 1
playback_source z dead_at IS NULL.
Każdy UPDATE robi pełny LEFT JOIN (tag/performer/studio) agregat ustawia 0 dla
sierot. `IS DISTINCT FROM` pomija przepisywanie niezmienionych wierszy (mniej WAL/bloat).
Całość ~5-10s, leci co kilka godzin counts do tego stale, co dla sortu "popular" i
badge "(N)" jest bez znaczenia.
"""
from __future__ import annotations
import logging
from sqlalchemy import text
from app.db import session_scope
log = logging.getLogger(__name__)
# Wspólny predykat: scena ma ≥1 żywy playback_source.
_LIVE = (
"EXISTS (SELECT 1 FROM playback_sources ps "
"WHERE ps.scene_id = {scene_col} AND ps.dead_at IS NULL)"
)
_TAGS_SQL = text(
f"""
UPDATE tags t SET scene_count = COALESCE(a.c, 0)
FROM tags base
LEFT JOIN (
SELECT st.tag_id, count(*) AS c
FROM scene_tags st
WHERE {_LIVE.format(scene_col="st.scene_id")}
GROUP BY st.tag_id
) a ON a.tag_id = base.id
WHERE t.id = base.id AND t.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
"""
)
_PERFORMERS_SQL = text(
f"""
UPDATE performers p SET scene_count = COALESCE(a.c, 0)
FROM performers base
LEFT JOIN (
SELECT sp.performer_id, count(*) AS c
FROM scene_performers sp
WHERE {_LIVE.format(scene_col="sp.scene_id")}
GROUP BY sp.performer_id
) a ON a.performer_id = base.id
WHERE p.id = base.id AND p.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
"""
)
_STUDIOS_SQL = text(
f"""
UPDATE studios s SET scene_count = COALESCE(a.c, 0)
FROM studios base
LEFT JOIN (
SELECT sc.studio_id, count(*) AS c
FROM scenes sc
WHERE sc.studio_id IS NOT NULL AND {_LIVE.format(scene_col="sc.id")}
GROUP BY sc.studio_id
) a ON a.studio_id = base.id
WHERE s.id = base.id AND s.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
"""
)
def refresh_taxonomy_counts() -> dict[str, int]:
"""Przelicza scene_count dla tags/performers/studios. Zwraca rowcount per tabela
(ile wierszy faktycznie się zmieniło)."""
out: dict[str, int] = {}
with session_scope() as session:
for name, stmt in (
("tags", _TAGS_SQL),
("performers", _PERFORMERS_SQL),
("studios", _STUDIOS_SQL),
):
res = session.execute(stmt)
out[name] = res.rowcount or 0
# Commit per-tabela — długie transakcje trzymałyby locki na hot tables.
session.commit()
return out

View file

@ -164,6 +164,8 @@ def run_forever() -> int:
# Bulk-dedup performers — safety net dla duplikatów które resolver
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
"bulk_dedup_hours": getattr(settings, "sched_bulk_dedup_hours", 12) or None,
# Taxonomy scene_count refresh — denormalizacja liczników (perf fix 0019).
"taxonomy_counts_hours": getattr(settings, "sched_taxonomy_counts_hours", 3) or None,
}
sched = build_scheduler(cfg)
log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))