perf(taxonomy): denormalize scene_count for tags/performers/studios
Counts for /tags, /performers, /studios and /favorites were computed live per-request by aggregating scene_tags / scene_performers with an EXISTS to playback_sources. As the catalog grew to ~1.7M scenes (6.3M scene_tags) this ran ~4.3s for /tags?order=popular (x2 incl. the total count) and ~950ms for the default /scenes count, making those screens load in several seconds. - migration 0019: add scene_count (+ DESC index) to tags/performers/studios - background job _job_refresh_taxonomy_counts (every 3h) recomputes the counts in one UPDATE..FROM each (IS DISTINCT FROM to skip unchanged rows) - /tags, /performers, /studios scenes path now read the column + ORDER BY the indexed scene_count; for_movies paths keep live aggregation (small tables) - favorites read denormalized scene_count instead of a grouped EXISTS aggregate - /scenes default count: 10-min in-process TTL cache (header is approximate) Measured: /tags?order=popular&per_page=500 ~8s -> 66ms incl. serialization. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9c49a69a66
commit
2163fee245
11 changed files with 316 additions and 125 deletions
54
alembic/versions/20260531_0019_taxonomy_scene_counts.py
Normal file
54
alembic/versions/20260531_0019_taxonomy_scene_counts.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""taxonomy scene_count denormalization — tags / performers / studios
|
||||
|
||||
Revision ID: 0019_taxonomy_scene_counts
|
||||
Revises: 0018_movie_play_progress
|
||||
Create Date: 2026-05-31
|
||||
|
||||
Perf fix (user-report 2026-05-31 "wolne ładowanie scen/favorites/tags"): baza urosła
|
||||
do 1.69M scen / 6.3M scene_tags, a /tags?order=popular liczył scene_count dla KAŻDEGO
|
||||
tagu na żywo (agregacja 6.3M scene_tags + EXISTS playback, external-merge sort 22MB) —
|
||||
~4.3s, i to razy 2 (total + items). Analogicznie performers/studios + favorites.
|
||||
|
||||
Denormalizujemy `scene_count` na tags/performers/studios. Worker przelicza je w tle
|
||||
(`_job_refresh_taxonomy_counts`, co `GOON_SCHED_TAXONOMY_COUNTS_HOURS`=3h jednym
|
||||
UPDATE...FROM). Endpointy czytają gotową kolumnę + ORDER BY indexed DESC → <20ms.
|
||||
|
||||
scene_count = liczba scen z danym tagiem/performerem/studiem mających ≥1 ŻYWY
|
||||
playback_source (dead_at IS NULL) — dokładnie ta sama definicja co dotychczasowe
|
||||
live-aggregaty (has_live_playback filter w taxonomies.py / favorites.py).
|
||||
|
||||
Counts są do ~3h nieświeże — dla "(123)" przy filtrze i sortu "popular" bez znaczenia.
|
||||
"""
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
|
||||
revision: str = "0019_taxonomy_scene_counts"
|
||||
down_revision: str | None = "0018_movie_play_progress"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
_TABLES = ("tags", "performers", "studios")
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
for tbl in _TABLES:
|
||||
op.add_column(
|
||||
tbl,
|
||||
sa.Column(
|
||||
"scene_count", sa.Integer(), nullable=False, server_default="0"
|
||||
),
|
||||
)
|
||||
# DESC index — ORDER BY scene_count DESC (sortowanie "popular").
|
||||
op.create_index(
|
||||
f"ix_{tbl}_scene_count",
|
||||
tbl,
|
||||
[sa.text("scene_count DESC")],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
for tbl in _TABLES:
|
||||
op.drop_index(f"ix_{tbl}_scene_count", table_name=tbl)
|
||||
op.drop_column(tbl, "scene_count")
|
||||
|
|
@ -27,7 +27,7 @@ from typing import Annotated
|
|||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.auth import require_api_key
|
||||
|
|
@ -77,25 +77,10 @@ def list_favorites(
|
|||
perf_ids = [perf.id for _, perf in rows]
|
||||
last_seen_by_perf = {fav.performer_id: fav.last_seen_at for fav, _ in rows}
|
||||
|
||||
# Batch: scene_count per performer — filtrujemy `has_live_playback` żeby badge
|
||||
# `N scenes` zgadzał się z tym co widać w PerformerScenes (mobile filtruje
|
||||
# `has_playback=true`). TPDB/StashDB sync wstawia metadata-only stubs które wlicz
|
||||
# by się w 2062 dla Aletta Ocean ale w profilu pokazuje tylko 499 oglądalnych.
|
||||
from sqlalchemy import and_, exists
|
||||
_scene_count_live_playback = exists().where(
|
||||
and_(
|
||||
PlaybackSource.scene_id == ScenePerformer.scene_id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
)
|
||||
scene_counts: dict = dict(
|
||||
session.execute(
|
||||
select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id))
|
||||
.where(ScenePerformer.performer_id.in_(perf_ids))
|
||||
.where(_scene_count_live_playback)
|
||||
.group_by(ScenePerformer.performer_id)
|
||||
).all()
|
||||
)
|
||||
# scene_count: czytamy zdenormalizowany Performer.scene_count (refresh w tle przez
|
||||
# _job_refresh_taxonomy_counts) — ta sama definicja co przed (sceny z żywym
|
||||
# playback). Wcześniej grouped count z EXISTS playback per-request. Migracja 0019.
|
||||
scene_counts: dict = {perf.id: perf.scene_count for _, perf in rows}
|
||||
|
||||
# Batch: new_count per performer — sceny z created_at > last_seen_at favorite'a.
|
||||
# Każda performerka ma INNY last_seen_at, więc warunek per-row. Trick: GREATEST jest
|
||||
|
|
@ -240,22 +225,8 @@ def list_favorite_studios(
|
|||
studio_ids = [st.id for _, st in rows]
|
||||
last_seen_by_studio = {fav.studio_id: fav.last_seen_at for fav, _ in rows}
|
||||
|
||||
# has_live_playback filter — patrz `list_favorites` (performers) wyżej.
|
||||
from sqlalchemy import and_, exists
|
||||
_studio_count_live_playback = exists().where(
|
||||
and_(
|
||||
PlaybackSource.scene_id == Scene.id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
)
|
||||
scene_counts: dict = dict(
|
||||
session.execute(
|
||||
select(Scene.studio_id, func.count(Scene.id))
|
||||
.where(Scene.studio_id.in_(studio_ids))
|
||||
.where(_studio_count_live_playback)
|
||||
.group_by(Scene.studio_id)
|
||||
).all()
|
||||
)
|
||||
# scene_count: zdenormalizowany Studio.scene_count (refresh w tle, migracja 0019).
|
||||
scene_counts: dict = {st.id: st.scene_count for _, st in rows}
|
||||
|
||||
new_counts: dict = {}
|
||||
if studio_ids:
|
||||
|
|
|
|||
|
|
@ -40,6 +40,33 @@ router = APIRouter(prefix="/scenes", tags=["scenes"], dependencies=[Depends(requ
|
|||
|
||||
_VALID_SORTS = {"created_at", "release_date", "title", "studio"}
|
||||
|
||||
# TTL-cache dla count'u scen-z-żywym-playback (default lista bez filtra). Full-scan
|
||||
# 1.69M scen + EXISTS ~950ms; liczba zmienia się wolno i jest przybliżona (header
|
||||
# paginacji), więc 10-min cache w pamięci procesu API jest akceptowalny trade-off.
|
||||
_DEFAULT_COUNT_CACHE: dict = {"ts": 0.0, "val": 0}
|
||||
_DEFAULT_COUNT_TTL = 600.0
|
||||
|
||||
|
||||
def _default_scene_count(session: Session) -> int:
|
||||
import time as _time
|
||||
now = _time.monotonic()
|
||||
if _DEFAULT_COUNT_CACHE["val"] and (now - _DEFAULT_COUNT_CACHE["ts"]) < _DEFAULT_COUNT_TTL:
|
||||
return _DEFAULT_COUNT_CACHE["val"]
|
||||
count_query = select(func.count()).select_from(
|
||||
select(Scene.id).where(
|
||||
exists(
|
||||
select(1).where(
|
||||
PlaybackSource.scene_id == Scene.id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
)
|
||||
).subquery()
|
||||
)
|
||||
total = session.execute(count_query).scalar_one()
|
||||
_DEFAULT_COUNT_CACHE["ts"] = now
|
||||
_DEFAULT_COUNT_CACHE["val"] = total
|
||||
return total
|
||||
|
||||
|
||||
def _split_csv(raw: str | None) -> list[str]:
|
||||
if not raw:
|
||||
|
|
@ -283,18 +310,11 @@ def list_scenes(
|
|||
# przybliżoną górną granicą — co dla 400k scen i tak nie ma sensu reading dokładnie.
|
||||
if not include_stubs and not q and not studio_slug_list and not tags and not perf_id_strings:
|
||||
# Fast path: typowy default request (lista bez filtra) — count tylko po
|
||||
# has_playback (single EXISTS, dobrze zindeksowany).
|
||||
count_query = select(func.count()).select_from(
|
||||
select(Scene.id).where(
|
||||
exists(
|
||||
select(1).where(
|
||||
PlaybackSource.scene_id == Scene.id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
)
|
||||
).subquery()
|
||||
)
|
||||
total = session.execute(count_query).scalar_one()
|
||||
# has_playback (single EXISTS). Mimo to przy 1.69M scen full-scan z EXISTS
|
||||
# bierze ~950ms (zmierzone 2026-05-31), a liczba zmienia się wolno (ingest
|
||||
# ~kilkadziesiąt scen/h) i jest z definicji przybliżona. TTL-cache 10 min:
|
||||
# pierwszy request po wygaśnięciu płaci ~950ms, reszta czyta z pamięci.
|
||||
total = _default_scene_count(session)
|
||||
else:
|
||||
total = session.execute(select(func.count()).select_from(base.subquery())).scalar_one()
|
||||
|
||||
|
|
|
|||
|
|
@ -22,8 +22,7 @@ from app.db import get_session
|
|||
from app.models.movie import Movie, MovieTag
|
||||
from app.models.movie_playback_source import MoviePlaybackSource
|
||||
from app.models.performer import Performer
|
||||
from app.models.playback_source import PlaybackSource
|
||||
from app.models.scene import ScenePerformer, SceneTag
|
||||
from app.models.scene import ScenePerformer
|
||||
from app.models.studio import Studio
|
||||
from app.models.tag import Tag
|
||||
|
||||
|
|
@ -108,7 +107,8 @@ def list_tags(
|
|||
|
||||
if for_movies:
|
||||
# Movie tag count — zliczamy tylko Movies z ≥1 live MoviePlaybackSource.
|
||||
# Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce).
|
||||
# Tag-bez-żadnego-movie zwraca 0 (LEFT OUTER JOIN przez coalesce). Movies są
|
||||
# małe (~41k) więc live-aggregat OK; scenes path niżej używa denormalizacji.
|
||||
_movie_live = exists().where(
|
||||
and_(
|
||||
MoviePlaybackSource.movie_id == MovieTag.movie_id,
|
||||
|
|
@ -121,48 +121,55 @@ def list_tags(
|
|||
.group_by(MovieTag.tag_id)
|
||||
.subquery()
|
||||
)
|
||||
else:
|
||||
# has_live_playback filter — zliczamy tylko sceny które user faktycznie zobaczy
|
||||
# (TPDB/StashDB metadata-only stubs są do mergowania, nie do oglądania).
|
||||
_live_playback = exists().where(
|
||||
and_(
|
||||
PlaybackSource.scene_id == SceneTag.scene_id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
base = (
|
||||
select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count"))
|
||||
.outerjoin(count_sub, count_sub.c.tag_id == Tag.id)
|
||||
)
|
||||
count_sub = (
|
||||
select(SceneTag.tag_id, func.count(SceneTag.scene_id).label("c"))
|
||||
.where(_live_playback)
|
||||
.group_by(SceneTag.tag_id)
|
||||
.subquery()
|
||||
)
|
||||
base = (
|
||||
select(Tag, func.coalesce(count_sub.c.c, 0).label("scene_count"))
|
||||
.outerjoin(count_sub, count_sub.c.tag_id == Tag.id)
|
||||
)
|
||||
if q:
|
||||
base = base.where(Tag.name.ilike(f"%{q}%"))
|
||||
if only_with_content:
|
||||
base = base.where(count_sub.c.tag_id.is_not(None))
|
||||
total = session.execute(
|
||||
select(func.count()).select_from(base.subquery())
|
||||
).scalar_one()
|
||||
if order in ("popular", "scene_count"):
|
||||
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc())
|
||||
else:
|
||||
ordered = base.order_by(Tag.name.asc())
|
||||
rows = session.execute(
|
||||
ordered.offset((page - 1) * per_page).limit(per_page)
|
||||
).all()
|
||||
items = [
|
||||
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
|
||||
for t, c in rows
|
||||
]
|
||||
return TagListOut(items=items, total=total, page=page, per_page=per_page)
|
||||
|
||||
# Scenes path — czyta zdenormalizowany Tag.scene_count (refresh w tle przez
|
||||
# _job_refresh_taxonomy_counts). Wcześniej liczył agregat 6.3M scene_tags +
|
||||
# EXISTS playback per-request (~4.3s ×2). Patrz migracja 0019.
|
||||
base = select(Tag)
|
||||
if q:
|
||||
base = base.where(Tag.name.ilike(f"%{q}%"))
|
||||
if only_with_content:
|
||||
# exists() w outerjoin nie inner-joinowałby pustych tagów. Dlatego osobny
|
||||
# exists check: pasują tylko tagi z ≥1 w subquery.
|
||||
base = base.where(count_sub.c.tag_id.is_not(None))
|
||||
base = base.where(Tag.scene_count > 0)
|
||||
|
||||
total = session.execute(
|
||||
select(func.count()).select_from(base.subquery())
|
||||
).scalar_one()
|
||||
|
||||
if order in ("popular", "scene_count"):
|
||||
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Tag.name.asc())
|
||||
ordered = base.order_by(Tag.scene_count.desc(), Tag.name.asc())
|
||||
else:
|
||||
ordered = base.order_by(Tag.name.asc())
|
||||
|
||||
rows = session.execute(
|
||||
tags_page = session.execute(
|
||||
ordered.offset((page - 1) * per_page).limit(per_page)
|
||||
).all()
|
||||
).scalars().all()
|
||||
|
||||
items = [
|
||||
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=int(c))
|
||||
for t, c in rows
|
||||
TagCount(id=t.id, name=t.name, slug=t.slug, scene_count=t.scene_count)
|
||||
for t in tags_page
|
||||
]
|
||||
return TagListOut(items=items, total=total, page=page, per_page=per_page)
|
||||
|
||||
|
|
@ -178,23 +185,9 @@ def list_performers(
|
|||
if order not in ("scene_count", "popular", "name"):
|
||||
raise HTTPException(status_code=400, detail="order must be 'scene_count' or 'name'")
|
||||
|
||||
# has_live_playback filter — patrz list_tags wyżej.
|
||||
_perf_live_playback = exists().where(
|
||||
and_(
|
||||
PlaybackSource.scene_id == ScenePerformer.scene_id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
)
|
||||
count_sub = (
|
||||
select(ScenePerformer.performer_id, func.count(ScenePerformer.scene_id).label("c"))
|
||||
.where(_perf_live_playback)
|
||||
.group_by(ScenePerformer.performer_id)
|
||||
.subquery()
|
||||
)
|
||||
base = (
|
||||
select(Performer, func.coalesce(count_sub.c.c, 0).label("scene_count"))
|
||||
.outerjoin(count_sub, count_sub.c.performer_id == Performer.id)
|
||||
)
|
||||
# Czyta zdenormalizowany Performer.scene_count (refresh w tle). Wcześniej agregat
|
||||
# 3M scene_performers + EXISTS playback per-request. Patrz migracja 0019.
|
||||
base = select(Performer)
|
||||
if q:
|
||||
base = base.where(Performer.name_normalized.ilike(f"%{q.lower()}%"))
|
||||
|
||||
|
|
@ -204,14 +197,14 @@ def list_performers(
|
|||
|
||||
if order in ("scene_count", "popular"):
|
||||
ordered = base.order_by(
|
||||
func.coalesce(count_sub.c.c, 0).desc(), Performer.canonical_name.asc()
|
||||
Performer.scene_count.desc(), Performer.canonical_name.asc()
|
||||
)
|
||||
else:
|
||||
ordered = base.order_by(Performer.canonical_name.asc())
|
||||
|
||||
rows = session.execute(
|
||||
perfs_page = session.execute(
|
||||
ordered.offset((page - 1) * per_page).limit(per_page)
|
||||
).all()
|
||||
).scalars().all()
|
||||
|
||||
items = [
|
||||
PerformerCount(
|
||||
|
|
@ -219,9 +212,9 @@ def list_performers(
|
|||
canonical_name=p.canonical_name,
|
||||
slug=p.slug,
|
||||
gender=p.gender.value if p.gender else None,
|
||||
scene_count=int(c),
|
||||
scene_count=p.scene_count,
|
||||
)
|
||||
for p, c in rows
|
||||
for p in perfs_page
|
||||
]
|
||||
return PerformerListOut(items=items, total=total, page=page, per_page=per_page)
|
||||
|
||||
|
|
@ -248,6 +241,7 @@ def list_studios(
|
|||
raise HTTPException(status_code=400, detail="order must be 'name' or 'scene_count'")
|
||||
|
||||
if for_movies:
|
||||
# Movies małe (~41k) — live aggregat OK. Scenes path niżej = denormalizacja.
|
||||
_movie_live = exists().where(
|
||||
and_(
|
||||
MoviePlaybackSource.movie_id == Movie.id,
|
||||
|
|
@ -261,42 +255,50 @@ def list_studios(
|
|||
.group_by(Movie.studio_id)
|
||||
.subquery()
|
||||
)
|
||||
else:
|
||||
# has_live_playback filter — patrz list_tags wyżej.
|
||||
_studio_live_playback = exists().where(
|
||||
and_(
|
||||
PlaybackSource.scene_id == Scene.id,
|
||||
PlaybackSource.dead_at.is_(None),
|
||||
)
|
||||
base = (
|
||||
select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count"))
|
||||
.outerjoin(count_sub, count_sub.c.studio_id == Studio.id)
|
||||
)
|
||||
count_sub = (
|
||||
select(Scene.studio_id, func.count(Scene.id).label("c"))
|
||||
.where(Scene.studio_id.is_not(None))
|
||||
.where(_studio_live_playback)
|
||||
.group_by(Scene.studio_id)
|
||||
.subquery()
|
||||
)
|
||||
base = (
|
||||
select(Studio, func.coalesce(count_sub.c.c, 0).label("scene_count"))
|
||||
.outerjoin(count_sub, count_sub.c.studio_id == Studio.id)
|
||||
)
|
||||
if q:
|
||||
base = base.where(Studio.name.ilike(f"%{q}%"))
|
||||
if only_with_content:
|
||||
base = base.where(count_sub.c.studio_id.is_not(None))
|
||||
total = session.execute(
|
||||
select(func.count()).select_from(base.subquery())
|
||||
).scalar_one()
|
||||
if order in ("scene_count", "popular"):
|
||||
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc())
|
||||
else:
|
||||
ordered = base.order_by(Studio.name_normalized.asc())
|
||||
rows = session.execute(
|
||||
ordered.offset((page - 1) * per_page).limit(per_page)
|
||||
).all()
|
||||
items = [
|
||||
StudioCount(id=s.id, name=s.name, slug=s.slug, network=s.network, scene_count=int(c))
|
||||
for s, c in rows
|
||||
]
|
||||
return StudioListOut(items=items, total=total, page=page, per_page=per_page)
|
||||
|
||||
# Scenes path — czyta zdenormalizowany Studio.scene_count (refresh w tle).
|
||||
# Wcześniej agregat 1.69M scenes + EXISTS playback per-request. Patrz migracja 0019.
|
||||
base = select(Studio)
|
||||
if q:
|
||||
base = base.where(Studio.name.ilike(f"%{q}%"))
|
||||
if only_with_content:
|
||||
base = base.where(count_sub.c.studio_id.is_not(None))
|
||||
base = base.where(Studio.scene_count > 0)
|
||||
|
||||
total = session.execute(
|
||||
select(func.count()).select_from(base.subquery())
|
||||
).scalar_one()
|
||||
|
||||
if order in ("scene_count", "popular"):
|
||||
ordered = base.order_by(func.coalesce(count_sub.c.c, 0).desc(), Studio.name.asc())
|
||||
ordered = base.order_by(Studio.scene_count.desc(), Studio.name.asc())
|
||||
else:
|
||||
ordered = base.order_by(Studio.name_normalized.asc())
|
||||
|
||||
rows = session.execute(
|
||||
studios_page = session.execute(
|
||||
ordered.offset((page - 1) * per_page).limit(per_page)
|
||||
).all()
|
||||
).scalars().all()
|
||||
|
||||
items = [
|
||||
StudioCount(
|
||||
|
|
@ -304,9 +306,9 @@ def list_studios(
|
|||
name=s.name,
|
||||
slug=s.slug,
|
||||
network=s.network,
|
||||
scene_count=int(c),
|
||||
scene_count=s.scene_count,
|
||||
)
|
||||
for s, c in rows
|
||||
for s in studios_page
|
||||
]
|
||||
return StudioListOut(items=items, total=total, page=page, per_page=per_page)
|
||||
|
||||
|
|
|
|||
|
|
@ -88,6 +88,13 @@ class Settings(BaseSettings):
|
|||
sched_bulk_dedup_hours: int = Field(
|
||||
default=12, validation_alias="GOON_SCHED_BULK_DEDUP_HOURS"
|
||||
)
|
||||
# Taxonomy scene_count refresh — przelicza denormalizowane liczniki scen na
|
||||
# tags/performers/studios (hot-path /tags|/performers|/studios|/favorites czyta
|
||||
# gotową kolumnę zamiast agregować 6.3M scene_tags per-request). 3h cadence —
|
||||
# counts do tego stale, dla sortu "popular" + badge "(N)" bez znaczenia. 0 = off.
|
||||
sched_taxonomy_counts_hours: int = Field(
|
||||
default=3, validation_alias="GOON_SCHED_TAXONOMY_COUNTS_HOURS"
|
||||
)
|
||||
|
||||
# Hetzner Cloud bandwidth monitor — read-only API token (Security → API Tokens
|
||||
# w panelu Hetzner Cloud). Bez tokenu monitor wyłączony (warning w log).
|
||||
|
|
|
|||
|
|
@ -37,6 +37,11 @@ class Performer(UUIDPKMixin, TimestampMixin, Base):
|
|||
search_run_count: Mapped[int] = mapped_column(
|
||||
Integer, nullable=False, default=0, server_default="0"
|
||||
)
|
||||
# Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja
|
||||
# 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites.
|
||||
scene_count: Mapped[int] = mapped_column(
|
||||
Integer, nullable=False, default=0, server_default="0"
|
||||
)
|
||||
|
||||
|
||||
class PerformerAlias(Base):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import DateTime, Float, ForeignKey, String, UniqueConstraint, func
|
||||
from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, UniqueConstraint, func
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
|
|
@ -19,6 +19,11 @@ class Studio(UUIDPKMixin, TimestampMixin, Base):
|
|||
)
|
||||
network: Mapped[str | None] = mapped_column(String(256))
|
||||
homepage_url: Mapped[str | None] = mapped_column(String(512))
|
||||
# Denormalizowany licznik scen z żywym playback (refresh w tle). Patrz migracja
|
||||
# 0019 + _job_refresh_taxonomy_counts. Sortowanie "popular" + badge w favorites.
|
||||
scene_count: Mapped[int] = mapped_column(
|
||||
Integer, nullable=False, default=0, server_default="0"
|
||||
)
|
||||
|
||||
|
||||
class StudioAlias(Base):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import uuid
|
||||
|
||||
from sqlalchemy import ForeignKey, String
|
||||
from sqlalchemy import ForeignKey, Integer, String
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
|
|
@ -16,3 +16,9 @@ class Tag(UUIDPKMixin, TimestampMixin, Base):
|
|||
UUID(as_uuid=True), ForeignKey("tags.id", ondelete="SET NULL")
|
||||
)
|
||||
description: Mapped[str | None] = mapped_column(String(1024))
|
||||
# Denormalizowany licznik scen z żywym playback (refresh w tle przez
|
||||
# _job_refresh_taxonomy_counts). Patrz migracja 0019. NIE źródło prawdy —
|
||||
# do sortu "popular" + badge "(N)" w filtrach.
|
||||
scene_count: Mapped[int] = mapped_column(
|
||||
Integer, nullable=False, default=0, server_default="0"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -130,6 +130,22 @@ def _job_movie_ingest() -> None:
|
|||
log.exception("[scheduler] movie ingest %s failed", name)
|
||||
|
||||
|
||||
def _job_refresh_taxonomy_counts() -> None:
|
||||
"""Przelicza denormalizowane scene_count na tags/performers/studios.
|
||||
|
||||
Hot-path /tags|/performers|/studios|/favorites czyta gotową kolumnę zamiast
|
||||
agregować 6.3M scene_tags per-request (~4.3s → <20ms). Patrz migracja 0019 +
|
||||
app/scheduler/taxonomy_counts.py.
|
||||
"""
|
||||
log.info("[scheduler] taxonomy counts refresh starting")
|
||||
try:
|
||||
from app.scheduler.taxonomy_counts import refresh_taxonomy_counts
|
||||
changed = refresh_taxonomy_counts()
|
||||
log.info("[scheduler] taxonomy counts refresh done: %s", changed)
|
||||
except Exception:
|
||||
log.exception("[scheduler] taxonomy counts refresh failed")
|
||||
|
||||
|
||||
def _job_bulk_dedup_performers() -> None:
|
||||
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
|
||||
resolver-time scoring nie złapał.
|
||||
|
|
@ -257,6 +273,17 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
|
|||
)
|
||||
log.info("scheduler: movie-ingest every %dh", cfg["movie_ingest_hours"])
|
||||
|
||||
if cfg.get("taxonomy_counts_hours"):
|
||||
sched.add_job(
|
||||
_job_refresh_taxonomy_counts,
|
||||
IntervalTrigger(hours=cfg["taxonomy_counts_hours"], start_date=INTERVAL_ANCHOR),
|
||||
id="taxonomy_counts",
|
||||
replace_existing=True,
|
||||
max_instances=1,
|
||||
coalesce=True,
|
||||
)
|
||||
log.info("scheduler: taxonomy-counts refresh every %dh", cfg["taxonomy_counts_hours"])
|
||||
|
||||
if cfg.get("performer_continuous_seconds"):
|
||||
refresh_days = cfg.get("performer_continuous_refresh_days") or 30
|
||||
seconds = cfg["performer_continuous_seconds"]
|
||||
|
|
@ -296,4 +323,7 @@ DEFAULT_CONFIG: dict[str, Any] = {
|
|||
# każdego co 30 dni.
|
||||
"performer_continuous_seconds": 15,
|
||||
"performer_continuous_refresh_days": 30,
|
||||
# Taxonomy scene_count refresh — denormalizacja liczników dla /tags|/performers|
|
||||
# /studios|/favorites. Co 3h: counts do tego stale, dla sortu "popular" bez znaczenia.
|
||||
"taxonomy_counts_hours": 3,
|
||||
}
|
||||
|
|
|
|||
89
app/scheduler/taxonomy_counts.py
Normal file
89
app/scheduler/taxonomy_counts.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
"""Refresh denormalizowanych `scene_count` na tags / performers / studios.
|
||||
|
||||
Liczniki są utrzymywane w tle (zamiast liczone per-request) bo agregacja po 6.3M
|
||||
scene_tags / 3M scene_performers z EXISTS do 1.15M playback_sources zajmuje ~4.3s —
|
||||
nie do zaakceptowania w hot-path UI (/tags, /performers, /studios, /favorites).
|
||||
|
||||
Definicja (identyczna z dotychczasowym has_live_playback filtrem w taxonomies.py):
|
||||
scene_count = liczba scen z danym tagiem/performerem/studiem mających ≥1
|
||||
playback_source z dead_at IS NULL.
|
||||
|
||||
Każdy UPDATE robi pełny LEFT JOIN (tag/performer/studio) ⨝ agregat → ustawia 0 dla
|
||||
sierot. `IS DISTINCT FROM` pomija przepisywanie niezmienionych wierszy (mniej WAL/bloat).
|
||||
Całość ~5-10s, leci co kilka godzin — counts do tego stale, co dla sortu "popular" i
|
||||
badge "(N)" jest bez znaczenia.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.db import session_scope
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Wspólny predykat: scena ma ≥1 żywy playback_source.
|
||||
_LIVE = (
|
||||
"EXISTS (SELECT 1 FROM playback_sources ps "
|
||||
"WHERE ps.scene_id = {scene_col} AND ps.dead_at IS NULL)"
|
||||
)
|
||||
|
||||
_TAGS_SQL = text(
|
||||
f"""
|
||||
UPDATE tags t SET scene_count = COALESCE(a.c, 0)
|
||||
FROM tags base
|
||||
LEFT JOIN (
|
||||
SELECT st.tag_id, count(*) AS c
|
||||
FROM scene_tags st
|
||||
WHERE {_LIVE.format(scene_col="st.scene_id")}
|
||||
GROUP BY st.tag_id
|
||||
) a ON a.tag_id = base.id
|
||||
WHERE t.id = base.id AND t.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
|
||||
"""
|
||||
)
|
||||
|
||||
_PERFORMERS_SQL = text(
|
||||
f"""
|
||||
UPDATE performers p SET scene_count = COALESCE(a.c, 0)
|
||||
FROM performers base
|
||||
LEFT JOIN (
|
||||
SELECT sp.performer_id, count(*) AS c
|
||||
FROM scene_performers sp
|
||||
WHERE {_LIVE.format(scene_col="sp.scene_id")}
|
||||
GROUP BY sp.performer_id
|
||||
) a ON a.performer_id = base.id
|
||||
WHERE p.id = base.id AND p.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
|
||||
"""
|
||||
)
|
||||
|
||||
_STUDIOS_SQL = text(
|
||||
f"""
|
||||
UPDATE studios s SET scene_count = COALESCE(a.c, 0)
|
||||
FROM studios base
|
||||
LEFT JOIN (
|
||||
SELECT sc.studio_id, count(*) AS c
|
||||
FROM scenes sc
|
||||
WHERE sc.studio_id IS NOT NULL AND {_LIVE.format(scene_col="sc.id")}
|
||||
GROUP BY sc.studio_id
|
||||
) a ON a.studio_id = base.id
|
||||
WHERE s.id = base.id AND s.scene_count IS DISTINCT FROM COALESCE(a.c, 0)
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def refresh_taxonomy_counts() -> dict[str, int]:
|
||||
"""Przelicza scene_count dla tags/performers/studios. Zwraca rowcount per tabela
|
||||
(ile wierszy faktycznie się zmieniło)."""
|
||||
out: dict[str, int] = {}
|
||||
with session_scope() as session:
|
||||
for name, stmt in (
|
||||
("tags", _TAGS_SQL),
|
||||
("performers", _PERFORMERS_SQL),
|
||||
("studios", _STUDIOS_SQL),
|
||||
):
|
||||
res = session.execute(stmt)
|
||||
out[name] = res.rowcount or 0
|
||||
# Commit per-tabela — długie transakcje trzymałyby locki na hot tables.
|
||||
session.commit()
|
||||
return out
|
||||
|
|
@ -164,6 +164,8 @@ def run_forever() -> int:
|
|||
# Bulk-dedup performers — safety net dla duplikatów które resolver
|
||||
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
|
||||
"bulk_dedup_hours": getattr(settings, "sched_bulk_dedup_hours", 12) or None,
|
||||
# Taxonomy scene_count refresh — denormalizacja liczników (perf fix 0019).
|
||||
"taxonomy_counts_hours": getattr(settings, "sched_taxonomy_counts_hours", 3) or None,
|
||||
}
|
||||
sched = build_scheduler(cfg)
|
||||
log.info("worker scheduled mode starting (jobs=%d)", len(sched.get_jobs()))
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue