goon/scripts/find_underfilled_performers.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

98 lines
3.7 KiB
Python

"""Znajdź performerów którym brakuje TPDB lub StashDB external_ref, mimo że mają sporo scen.
Use-case: ten sam bug co Kitana Montana — performer ma stashdb ref ale nie tpdb (lub odwrotnie),
więc performer-driven nigdy nie zaciąga z tej drugiej bazy. Skutek: niedoszacowanie sceny per
performer + zerowa szansa na cross-source merge.
Output: lista (canonical_name, scene_count, has_tpdb, has_stashdb) posortowana po scene_count desc.
"""
from __future__ import annotations
import sys
from sqlalchemy import case, func, select
from app.db import session_scope
from app.models.performer import Performer, PerformerExternalRef
from app.models.scene import ScenePerformer
from app.models.source import Source, SourceKind
def main(top_n: int = 50) -> None:
    """Print the top performers by scene count, bucketed by canonical-ref coverage.

    Loads the ``top_n * 5`` performers with the most scenes, splits them by
    whether they have a TPDB and/or StashDB external ref, then prints the
    bucket sizes followed by up to ``top_n`` entries from each incomplete
    bucket (missing TPDB, missing StashDB, missing both).

    The buckets are cut from that fixed-size sample, so a section can hold
    fewer than ``top_n`` entries even if more such performers exist further
    down the ranking.

    Args:
        top_n: Maximum number of performers printed per section.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative value would become a negative LIMIT and a negative slice.
        raise ValueError(f"top_n must be >= 0, got {top_n}")

    # Oversample so each bucket still has entries left after filtering.
    sample_size = top_n * 5

    with session_scope() as session:
        # scalar_one() requires exactly one matching Source row per kind.
        tpdb_src = session.execute(
            select(Source.id).where(Source.kind == SourceKind.tpdb)
        ).scalar_one()
        stashdb_src = session.execute(
            select(Source.id).where(Source.kind == SourceKind.stashdb)
        ).scalar_one()

        # Subqueries: the performers that have a TPDB / StashDB ref. DISTINCT
        # keeps each performer to one row so the outer joins below cannot
        # multiply scene rows and inflate the count.
        has_tpdb = (
            select(PerformerExternalRef.performer_id)
            .where(PerformerExternalRef.source_id == tpdb_src)
            .distinct()
        ).subquery()
        has_stashdb = (
            select(PerformerExternalRef.performer_id)
            .where(PerformerExternalRef.source_id == stashdb_src)
            .distinct()
        ).subquery()

        scene_count = func.count(ScenePerformer.scene_id)

        # Main query: one row per performer with scene count and ref flags.
        rows = session.execute(
            select(
                Performer.id,
                Performer.canonical_name,
                scene_count.label("c"),
                case((has_tpdb.c.performer_id.isnot(None), 1), else_=0).label("has_tpdb"),
                case((has_stashdb.c.performer_id.isnot(None), 1), else_=0).label("has_stashdb"),
            )
            # Anchor the join chain explicitly instead of relying on inference.
            .select_from(Performer)
            .outerjoin(ScenePerformer, ScenePerformer.performer_id == Performer.id)
            .outerjoin(has_tpdb, has_tpdb.c.performer_id == Performer.id)
            .outerjoin(has_stashdb, has_stashdb.c.performer_id == Performer.id)
            .group_by(
                Performer.id,
                Performer.canonical_name,
                has_tpdb.c.performer_id,
                has_stashdb.c.performer_id,
            )
            # Performer.id breaks ties so the LIMIT cut-off is stable across runs.
            .order_by(scene_count.desc(), Performer.id)
            .limit(sample_size)
        ).all()

        # Bucket by which canonical refs are present; rows arrive sorted by
        # scene count, so every bucket keeps that order.
        only_stashdb: list[tuple] = []
        only_tpdb: list[tuple] = []
        no_canonical: list[tuple] = []
        both: list[tuple] = []
        for _pid, name, count, has_t, has_s in rows:
            entry = (name, count)
            if has_t and has_s:
                both.append(entry)
            elif has_s:
                only_stashdb.append(entry)
            elif has_t:
                only_tpdb.append(entry)
            else:
                no_canonical.append(entry)

        print(f"=== Top {sample_size} performers by scene_count ===")
        print(f" both canonical refs: {len(both)}")
        print(f" only stashdb (miss TPDB): {len(only_stashdb)}")
        print(f" only tpdb (miss StashDB): {len(only_tpdb)}")
        print(f" no canonical refs: {len(no_canonical)}")

        sections = (
            (f"\n--- Top {top_n} missing TPDB (have stashdb) ---", only_stashdb),
            (f"\n--- Top {top_n} missing StashDB (have tpdb) ---", only_tpdb),
            (
                f"\n--- Top {top_n} no canonical refs at all (probably scraper-only or alias issue) ---",
                no_canonical,
            ),
        )
        for header, bucket in sections:
            print(header)
            for name, c in bucket[:top_n]:
                print(f" {c:5d} {name}")
if __name__ == "__main__":
    # Optional first CLI argument: entries per section (30 when omitted).
    cli_args = sys.argv[1:]
    main(int(cli_args[0]) if cli_args else 30)