fix(tags): merge <base>2 numbered-duplicate tags + prevent regeneration
TPDB taxonomy emits numbered-duplicate tags (name "Bubble Butt2"); slugify yields "bubble-butt2" (no separator before the digit), so resolve_tag created a separate tag alongside "bubble-butt". Tube scenes inherited the duplicate via scene-merge → 75 pairs, ~10k scene_tags on the wrong tag.

- resolve_tag: canonicalize "<base>2" -> "<base>" when the base tag exists (handles current + future data; the guard requires a trailing "2" directly preceded by a letter, which leaves milf-30/teen18 intact)
- scripts/merge_dup2_tags.py: one-off bulk merge (scene_tags + movie_tags + blacklist) and taxonomy-count refresh

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
3cbfb1d490
commit
fad72e9cd6
2 changed files with 126 additions and 0 deletions
|
|
@ -12,6 +12,22 @@ from app.normalize.scenes import NormalizedTag
|
|||
from app.normalize.text import slugify
|
||||
|
||||
|
||||
def _canonical_dup2_slug(session: Session, slug: str) -> str:
|
||||
"""Kanonizuje numbered-duplicate slug `<base>2` → `<base>`.
|
||||
|
||||
TPDB taxonomy emituje zduplikowane tagi z suffixem `2` (name "Bubble Butt2").
|
||||
`slugify` daje `bubble-butt2` (bez separatora przed cyfrą) → bez tego osobny tag
|
||||
obok `bubble-butt` z tysiącami scen (dup-tag bug 2026-06-06; backfill
|
||||
scripts/merge_dup2_tags.py). Gdy slug kończy się literą+"2" i baza istnieje jako
|
||||
tag, używamy bazy. Warunek `[-2].isalpha()` wyklucza legit sufiksy gdzie cyfra jest
|
||||
znacząca (milf-30, teen18 — nie kończą się "2"; chroni też przed "...22")."""
|
||||
if len(slug) > 1 and slug[-1] == "2" and slug[-2].isalpha():
|
||||
base = slug[:-1]
|
||||
if session.execute(select(Tag.id).where(Tag.slug == base)).first():
|
||||
return base
|
||||
return slug
|
||||
|
||||
|
||||
def resolve_tag(session: Session, *, norm: NormalizedTag) -> Tag | None:
|
||||
slug = norm.slug or slugify(norm.name)
|
||||
# DB columns: name VARCHAR(128), slug VARCHAR(128). Scraper occasionally
|
||||
|
|
@ -20,6 +36,7 @@ def resolve_tag(session: Session, *, norm: NormalizedTag) -> Tag | None:
|
|||
# the whole ingest batch.
|
||||
if len(slug) > 120:
|
||||
return None
|
||||
slug = _canonical_dup2_slug(session, slug)
|
||||
tag = session.execute(select(Tag).where(Tag.slug == slug)).scalar_one_or_none()
|
||||
if tag is not None:
|
||||
return tag
|
||||
|
|
|
|||
109
scripts/merge_dup2_tags.py
Normal file
109
scripts/merge_dup2_tags.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
"""Bulk-merge numbered-duplicate tagów: `<base>2` → `<base>`.
|
||||
|
||||
Kontekst (2026-06-06): TPDB taxonomy emituje zduplikowane tagi z suffixem `2`
|
||||
(np. name "Bubble Butt2"). `slugify` daje `bubble-butt2` (bez separatora przed
|
||||
cyfrą), więc `resolve_tag` tworzy OSOBNY tag obok `bubble-butt`. Tubowe sceny
|
||||
dziedziczą dup-tag przez scene-merge → 75 par, ~10k scene_tags na złym tagu.
|
||||
|
||||
Ten skrypt scala każdy `<base>2` (gdy `<base>` istnieje jako osobny tag) do bazy:
|
||||
scene_tags + movie_tags + blacklisted_tags przepisane (z deduplikacją na PK),
|
||||
dup-tag skasowany. Na koniec refresh zdenormalizowanych scene_count.
|
||||
|
||||
Prewencja regeneracji żyje w `app/resolve/tag_resolver.py` (_canonical_dup2_slug).
|
||||
|
||||
Użycie:
|
||||
python scripts/merge_dup2_tags.py [--dry-run]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.db import session_scope
|
||||
|
||||
log = logging.getLogger("merge_dup2_tags")

# A pair is a tag whose slug ends in a letter followed by "2" and whose base
# (the slug without its last character) exists as another tag. `[a-z]2$`
# excludes multi-character suffixes (teen18, milf-30, vr11111111): there the
# last character is not "2" or the second-to-last character is a digit.
_DUP_MAP_SQL = """
SELECT d.id AS drop_id, d.slug AS drop_slug, d.scene_count AS drop_cnt,
b.id AS keep_id, b.slug AS keep_slug, b.scene_count AS keep_cnt
FROM tags d
JOIN tags b ON b.slug = left(d.slug, length(d.slug) - 1)
WHERE d.slug ~ '[a-z]2$'
ORDER BY d.scene_count DESC
"""
|
||||
|
||||
|
||||
def main() -> None:
    """Merge every ``<base>2`` duplicate tag into its ``<base>`` tag.

    Reads ``--dry-run`` from the command line and logs each detected
    (duplicate, base) pair. Without ``--dry-run`` it re-points ``scene_tags``
    and ``movie_tags`` rows to the base tag, copies blacklist entries, deletes
    the duplicate tags, commits, and refreshes the taxonomy counts. With
    ``--dry-run`` it only logs how many rows reference the duplicates and
    rolls back. Returns early when no pairs are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    with session_scope() as session:
        dup_pairs = list(session.execute(text(_DUP_MAP_SQL)))
        pair_count = len(dup_pairs)
        log.info("found %d dup pairs", pair_count)
        for pair in dup_pairs:
            log.info(" %-32s (%5d) -> %-30s (%5d)", pair.drop_slug, pair.drop_cnt, pair.keep_slug, pair.keep_cnt)

        if pair_count == 0:
            return

        # Temp table mapping drop -> keep, so every pair is handled in one
        # set-based pass.
        session.execute(text("CREATE TEMP TABLE _dup_map ON COMMIT DROP AS " + _DUP_MAP_SQL))

        if opts.dry_run:
            scene_hits = session.execute(text("SELECT count(*) FROM scene_tags st JOIN _dup_map m ON st.tag_id=m.drop_id")).scalar()
            movie_hits = session.execute(text("SELECT count(*) FROM movie_tags mt JOIN _dup_map m ON mt.tag_id=m.drop_id")).scalar()
            log.info("DRY-RUN: would touch %d scene_tags + %d movie_tags across %d pairs", scene_hits, movie_hits, pair_count)
            # Discard the transaction (including the temp table) explicitly.
            session.rollback()
            return

        # 1) scene_tags: rewrite drop -> keep where the scene does NOT already
        #    carry keep (that would collide on the PK). Rows for scenes holding
        #    both tags are left for the CASCADE on DELETE FROM tags
        #    (NOTE(review): relies on ON DELETE CASCADE; schema not visible here).
        scene_result = session.execute(text("""
            UPDATE scene_tags st SET tag_id = m.keep_id
            FROM _dup_map m
            WHERE st.tag_id = m.drop_id
            AND NOT EXISTS (SELECT 1 FROM scene_tags k
            WHERE k.scene_id = st.scene_id AND k.tag_id = m.keep_id)
        """))
        log.info("scene_tags migrated: %d", scene_result.rowcount)

        # 2) movie_tags: same approach.
        movie_result = session.execute(text("""
            UPDATE movie_tags mt SET tag_id = m.keep_id
            FROM _dup_map m
            WHERE mt.tag_id = m.drop_id
            AND NOT EXISTS (SELECT 1 FROM movie_tags k
            WHERE k.movie_id = mt.movie_id AND k.tag_id = m.keep_id)
        """))
        log.info("movie_tags migrated: %d", movie_result.rowcount)

        # 3) blacklisted_tags: carry a ban from the duplicate over to the base
        #    (in case someone banned the dup tag) so DELETE + CASCADE does not
        #    lose it. ON CONFLICT skips bases that are already banned.
        blacklist_result = session.execute(text("""
            INSERT INTO blacklisted_tags (tag_id)
            SELECT m.keep_id FROM blacklisted_tags bt JOIN _dup_map m ON bt.tag_id = m.drop_id
            ON CONFLICT DO NOTHING
        """))
        if blacklist_result.rowcount:
            log.info("blacklist refs moved: %d", blacklist_result.rowcount)

        # 4) Delete the duplicate tags. CASCADE cleans up the remaining
        #    (colliding) scene_tags / movie_tags / blacklist rows.
        delete_result = session.execute(text("DELETE FROM tags WHERE id IN (SELECT drop_id FROM _dup_map)"))
        log.info("dup tags deleted: %d", delete_result.rowcount)
        session.commit()

        # 5) Refresh the denormalized scene_count (the /tags hot path reads the
        #    precomputed column). Imported lazily so dry runs never load it.
        from app.scheduler.taxonomy_counts import refresh_taxonomy_counts

        refreshed = refresh_taxonomy_counts()
        log.info("taxonomy counts refreshed: %s", refreshed)
|
||||
|
||||
|
||||
# Run the merge only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue