fix(tags): merge <base>2 numbered-duplicate tags + prevent regeneration

TPDB taxonomy emits numbered-duplicate tags (name "Bubble Butt2"); slugify
yields "bubble-butt2" (no separator before digit), so resolve_tag created a
separate tag alongside "bubble-butt". Tube scenes inherited the dup via
scene-merge → 75 pairs, ~10k scene_tags on the wrong tag.

- resolve_tag: canonicalize "<base>2" -> "<base>" when base exists (handles
  current + future; trailing-"2"+alpha guard leaves milf-30/teen18 intact)
- scripts/merge_dup2_tags.py: one-off bulk merge (scene_tags + movie_tags +
  blacklist) and taxonomy-count refresh

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-06 23:18:44 +02:00
parent 3cbfb1d490
commit fad72e9cd6
2 changed files with 126 additions and 0 deletions

View file

@ -12,6 +12,22 @@ from app.normalize.scenes import NormalizedTag
from app.normalize.text import slugify from app.normalize.text import slugify
def _canonical_dup2_slug(session: Session, slug: str) -> str:
"""Kanonizuje numbered-duplicate slug `<base>2` → `<base>`.
TPDB taxonomy emituje zduplikowane tagi z suffixem `2` (name "Bubble Butt2").
`slugify` daje `bubble-butt2` (bez separatora przed cyfrą) bez tego osobny tag
obok `bubble-butt` z tysiącami scen (dup-tag bug 2026-06-06; backfill
scripts/merge_dup2_tags.py). Gdy slug kończy się literą+"2" i baza istnieje jako
tag, używamy bazy. Warunek `[-2].isalpha()` wyklucza legit sufiksy gdzie cyfra jest
znacząca (milf-30, teen18 nie kończą się "2"; chroni też przed "...22")."""
if len(slug) > 1 and slug[-1] == "2" and slug[-2].isalpha():
base = slug[:-1]
if session.execute(select(Tag.id).where(Tag.slug == base)).first():
return base
return slug
def resolve_tag(session: Session, *, norm: NormalizedTag) -> Tag | None: def resolve_tag(session: Session, *, norm: NormalizedTag) -> Tag | None:
slug = norm.slug or slugify(norm.name) slug = norm.slug or slugify(norm.name)
# DB columns: name VARCHAR(128), slug VARCHAR(128). Scraper occasionally # DB columns: name VARCHAR(128), slug VARCHAR(128). Scraper occasionally
@ -20,6 +36,7 @@ def resolve_tag(session: Session, *, norm: NormalizedTag) -> Tag | None:
# the whole ingest batch. # the whole ingest batch.
if len(slug) > 120: if len(slug) > 120:
return None return None
slug = _canonical_dup2_slug(session, slug)
tag = session.execute(select(Tag).where(Tag.slug == slug)).scalar_one_or_none() tag = session.execute(select(Tag).where(Tag.slug == slug)).scalar_one_or_none()
if tag is not None: if tag is not None:
return tag return tag

109
scripts/merge_dup2_tags.py Normal file
View file

@ -0,0 +1,109 @@
"""Bulk-merge numbered-duplicate tags: `<base>2` → `<base>`.

Context (2026-06-06): the TPDB taxonomy emits duplicated tags with a `2` suffix
(e.g. name "Bubble Butt2"). `slugify` yields `bubble-butt2` (no separator before
the digit), so `resolve_tag` creates a SEPARATE tag next to `bubble-butt`. Tube
scenes inherit the dup tag through scene-merge — 75 pairs, ~10k scene_tags on the
wrong tag.

This script merges every `<base>2` (when `<base>` exists as a separate tag) into
the base: scene_tags + movie_tags + blacklisted_tags are rewritten (deduplicated
on the PK) and the dup tag is deleted. Finally the denormalized scene_count values
are refreshed.

Prevention of regeneration lives in `app/resolve/tag_resolver.py`
(_canonical_dup2_slug).

Usage:
    python scripts/merge_dup2_tags.py [--dry-run]
"""
from __future__ import annotations
import argparse
import logging
from sqlalchemy import text
from app.db import session_scope
log = logging.getLogger("merge_dup2_tags")

# A pair = a tag whose slug ends in letter+"2" and whose base (the slug without its
# last character) exists as another tag. `[a-z]2$` excludes multi-character suffixes
# (teen18, milf-30, vr11111111) — there the last character is not "2" or the
# second-to-last character is a digit.
# Columns: drop_* describe the duplicate tag to remove, keep_* the base tag that
# survives; rows are ordered by the duplicate's scene_count, largest first.
_DUP_MAP_SQL = """
SELECT d.id AS drop_id, d.slug AS drop_slug, d.scene_count AS drop_cnt,
b.id AS keep_id, b.slug AS keep_slug, b.scene_count AS keep_cnt
FROM tags d
JOIN tags b ON b.slug = left(d.slug, length(d.slug) - 1)
WHERE d.slug ~ '[a-z]2$'
ORDER BY d.scene_count DESC
"""
def main() -> None:
    """Merge every `<base>2` duplicate tag into its `<base>` tag.

    Logs the pairs selected by `_DUP_MAP_SQL` and stops if there are none. With
    `--dry-run` it only reports how many scene_tags / movie_tags rows reference the
    duplicate tags and rolls back. Otherwise it re-points scene_tags and movie_tags
    to the base tag, copies blacklist entries over, deletes the duplicate tags,
    commits, and refreshes the taxonomy counts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    with session_scope() as session:

        def run(sql: str):
            """Execute one raw SQL statement on the open session."""
            return session.execute(text(sql))

        pairs = list(run(_DUP_MAP_SQL))
        pair_count = len(pairs)
        log.info("found %d dup pairs", pair_count)
        for pair in pairs:
            log.info(
                " %-32s (%5d) -> %-30s (%5d)",
                pair.drop_slug,
                pair.drop_cnt,
                pair.keep_slug,
                pair.keep_cnt,
            )
        if pair_count == 0:
            return

        # Temp table mapping drop -> keep, so each later statement handles all
        # pairs in a single set-based pass.
        run("CREATE TEMP TABLE _dup_map ON COMMIT DROP AS " + _DUP_MAP_SQL)

        if args_dry_run := opts.dry_run:
            scene_refs = run(
                "SELECT count(*) FROM scene_tags st JOIN _dup_map m ON st.tag_id=m.drop_id"
            ).scalar()
            movie_refs = run(
                "SELECT count(*) FROM movie_tags mt JOIN _dup_map m ON mt.tag_id=m.drop_id"
            ).scalar()
            log.info(
                "DRY-RUN: would touch %d scene_tags + %d movie_tags across %d pairs",
                scene_refs,
                movie_refs,
                pair_count,
            )
            session.rollback()
            return

        # 1) scene_tags: re-point drop -> keep only where the scene does NOT already
        #    carry the keep tag (that would collide on the PK). Rows for scenes that
        #    have both tags are left for the DELETE in step 4 to clean up
        #    (the original author relies on ON DELETE CASCADE; schema not visible here).
        scene_result = run("""
UPDATE scene_tags st SET tag_id = m.keep_id
FROM _dup_map m
WHERE st.tag_id = m.drop_id
AND NOT EXISTS (SELECT 1 FROM scene_tags k
WHERE k.scene_id = st.scene_id AND k.tag_id = m.keep_id)
""")
        log.info("scene_tags migrated: %d", scene_result.rowcount)

        # 2) movie_tags: same approach as for scene_tags.
        movie_result = run("""
UPDATE movie_tags mt SET tag_id = m.keep_id
FROM _dup_map m
WHERE mt.tag_id = m.drop_id
AND NOT EXISTS (SELECT 1 FROM movie_tags k
WHERE k.movie_id = mt.movie_id AND k.tag_id = m.keep_id)
""")
        log.info("movie_tags migrated: %d", movie_result.rowcount)

        # 3) blacklisted_tags: carry a ban on the dup tag over to the base tag so
        #    deleting the dup does not lose it. ON CONFLICT skips bases that are
        #    already blacklisted.
        blacklist_result = run("""
INSERT INTO blacklisted_tags (tag_id)
SELECT m.keep_id FROM blacklisted_tags bt JOIN _dup_map m ON bt.tag_id = m.drop_id
ON CONFLICT DO NOTHING
""")
        if blacklist_result.rowcount:
            log.info("blacklist refs moved: %d", blacklist_result.rowcount)

        # 4) Delete the dup tags; remaining (colliding) scene_tags / movie_tags /
        #    blacklist rows are expected to go with them via CASCADE.
        delete_result = run("DELETE FROM tags WHERE id IN (SELECT drop_id FROM _dup_map)")
        log.info("dup tags deleted: %d", delete_result.rowcount)
        session.commit()

        # 5) Refresh the denormalized scene_count (the /tags hot path reads the
        #    precomputed column). Imported here, after the commit, as in the original.
        from app.scheduler.taxonomy_counts import refresh_taxonomy_counts

        changed = refresh_taxonomy_counts()
        log.info("taxonomy counts refreshed: %s", changed)


if __name__ == "__main__":
    main()