From 43f7e1f7b247b715e2f724e4cea4aeb627d5bb02 Mon Sep 17 00:00:00 2001
From: jtrzupek <jtrzupek@gmail.com>
Date: Sun, 7 Jun 2026 21:10:31 +0200
Subject: [PATCH] =?UTF-8?q?perf(scenes):=20literal=20tag=5Fid=20in=20filte?=
 =?UTF-8?q?r=20=E2=80=94=204-12s=20tag=20lists=20->=20~20ms?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tag-filtered scene lists (e.g. blowjob + has_playback) took 4-12s. Root cause:
the filter joined scene_tags->tags on slug, so the actual tag_id was opaque to
the planner at plan time. It fell back to the average per-tag cardinality
(8.4M/11541 ≈ 726) instead of the real 273k, so it chose to materialize ALL
matching scene_tags rows, check playback per row, and only then top-N sort.

Fix: resolve slug->tag_id in the app and filter on a LITERAL tag_id (no slug
join). With a constant, the planner uses MCV stats, knows the tag is huge, and
walks ix_scenes_created_at_desc probing scene_tags/playback per scene, stopping
at the page limit. Verified: blowjob list 3300ms -> 18ms (EXPLAIN), HTTP 4-12s ->
47ms. An unknown slug short-circuits to an empty result. (Pairs with the raised
tag_id statistics target so mid-tier tags also get correct estimates.)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 app/api/scenes.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/app/api/scenes.py b/app/api/scenes.py
index 7bf1ad9..6c025fb 100644
--- a/app/api/scenes.py
+++ b/app/api/scenes.py
@@ -8,7 +8,7 @@ from typing import Annotated
 
 from fastapi import APIRouter, Depends, HTTPException, Query, status
 from pydantic import BaseModel
-from sqlalchemy import distinct, exists, func, literal_column, select
+from sqlalchemy import distinct, exists, false, func, literal_column, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 
@@ -182,15 +182,31 @@ def list_scenes(
     tag_slug_list = _split_csv(tags)
     # AND między tagami: scena musi mieć WSZYSTKIE zaznaczone tagi. Każdy slug → osobny
     # exists() — zaznaczanie kolejnych filtrów zawęża wyniki, jak intuicja użytkownika.
-    for slug in tag_slug_list:
-        base = base.where(
-            exists(
-                select(1)
-                .select_from(SceneTag)
-                .join(Tag, Tag.id == SceneTag.tag_id)
-                .where(SceneTag.scene_id == Scene.id, Tag.slug == slug)
-            )
+    #
+    # PERF (2026-06-07): resolvujemy slug→tag_id w aplikacji i filtrujemy po LITERALNYM
+    # tag_id (NIE JOIN po Tag.slug). Z literałem planner zna kardynalność tagu ze
+    # statystyk (MCV) → dla popularnych tagów (blowjob ~273k scen) wybiera index-walk po
+    # ix_scenes_created_at_desc zamiast materializować wszystkie scene_tags. Slug-JOIN
+    # ukrywał tag_id przed plannerem → używał średniej (8.4M/11541≈726) → zły plan
+    # (4-12s). Z literałem: ~20ms. Zob. też _build... light mode.
+    if tag_slug_list:
+        id_by_slug = dict(
+            session.execute(
+                select(Tag.slug, Tag.id).where(Tag.slug.in_(tag_slug_list))
+            ).all()
         )
+        for slug in tag_slug_list:
+            tag_id = id_by_slug.get(slug)
+            if tag_id is None:
+                base = base.where(false())  # nieznany slug → brak wyników
+                break
+            base = base.where(
+                exists(
+                    select(1)
+                    .select_from(SceneTag)
+                    .where(SceneTag.scene_id == Scene.id, SceneTag.tag_id == tag_id)
+                )
+            )
 
     perf_id_strings = _split_csv(performer_ids)
     if perf_id_strings: