Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
98 lines
3.7 KiB
Python
98 lines
3.7 KiB
Python
"""Znajdź performerów którym brakuje TPDB lub StashDB external_ref, mimo że mają sporo scen.
|
|
|
|
Use-case: ten sam bug co Kitana Montana — performer ma stashdb ref ale nie tpdb (lub odwrotnie),
|
|
więc performer-driven nigdy nie zaciąga z tej drugiej bazy. Skutek: niedoszacowanie sceny per
|
|
performer + zerowa szansa na cross-source merge.
|
|
|
|
Output: lista (canonical_name, scene_count, has_tpdb, has_stashdb) posortowana po scene_count desc.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
|
|
from sqlalchemy import case, func, select
|
|
|
|
from app.db import session_scope
|
|
from app.models.performer import Performer, PerformerExternalRef
|
|
from app.models.scene import ScenePerformer
|
|
from app.models.source import Source, SourceKind
|
|
|
|
|
|
def main(top_n: int = 50) -> None:
    """Report high-scene-count performers lacking a TPDB and/or StashDB external ref.

    The ``top_n * 5`` performers with the most scene links are fetched and split
    into four buckets: both refs, StashDB only, TPDB only, and neither. A summary
    of bucket sizes is printed, followed by the first ``top_n`` entries of each
    incomplete bucket.

    Args:
        top_n: Maximum number of performers printed per bucket. The query
            window is five times this value so that some rows survive bucketing.
    """
    with session_scope() as session:
        # Resolve the Source ids for both canonical databases (TPDB first).
        # scalar_one() requires exactly one matching Source row per kind.
        tpdb_id, stashdb_id = (
            session.execute(select(Source.id).where(Source.kind == kind)).scalar_one()
            for kind in (SourceKind.tpdb, SourceKind.stashdb)
        )

        def performers_with_ref(source_id):
            """Build a subquery of distinct performer ids that have a ref for *source_id*."""
            return (
                select(PerformerExternalRef.performer_id)
                .where(PerformerExternalRef.source_id == source_id)
                .distinct()
                .subquery()
            )

        def presence_flag(refs, label):
            """Return a 1/0 column telling whether the outer-joined *refs* row matched."""
            return case((refs.c.performer_id.isnot(None), 1), else_=0).label(label)

        tpdb_refs = performers_with_ref(tpdb_id)
        stashdb_refs = performers_with_ref(stashdb_id)

        # Number of scene links per performer; also used as the ranking key.
        scene_total = func.count(ScenePerformer.scene_id)

        ranking = (
            select(
                Performer.id,
                Performer.canonical_name,
                scene_total.label("c"),
                presence_flag(tpdb_refs, "has_tpdb"),
                presence_flag(stashdb_refs, "has_stashdb"),
            )
            .outerjoin(ScenePerformer, ScenePerformer.performer_id == Performer.id)
            .outerjoin(tpdb_refs, tpdb_refs.c.performer_id == Performer.id)
            .outerjoin(stashdb_refs, stashdb_refs.c.performer_id == Performer.id)
            .group_by(
                Performer.id,
                Performer.canonical_name,
                tpdb_refs.c.performer_id,
                stashdb_refs.c.performer_id,
            )
            .order_by(scene_total.desc())
            .limit(top_n * 5)  # over-fetch: bucketing below discards part of the window
        )
        rows = session.execute(ranking).all()

        # Distribute rows into buckets keyed by (has TPDB ref, has StashDB ref).
        only_stashdb: list[tuple] = []
        only_tpdb: list[tuple] = []
        no_canonical: list[tuple] = []
        both: list[tuple] = []
        bucket_for = {
            (True, True): both,
            (False, True): only_stashdb,
            (True, False): only_tpdb,
            (False, False): no_canonical,
        }
        for _pid, name, count, has_t, has_s in rows:
            bucket_for[bool(has_t), bool(has_s)].append((name, count))

        print(f"=== Top {top_n*5} performers by scene_count ===")
        print(f" both canonical refs: {len(both)}")
        print(f" only stashdb (miss TPDB): {len(only_stashdb)}")
        print(f" only tpdb (miss StashDB): {len(only_tpdb)}")
        print(f" no canonical refs: {len(no_canonical)}")

        # Detail listings: one section per incomplete bucket, in ranking order.
        sections = (
            (f"\n--- Top {top_n} missing TPDB (have stashdb) ---", only_stashdb),
            (f"\n--- Top {top_n} missing StashDB (have tpdb) ---", only_tpdb),
            (
                f"\n--- Top {top_n} no canonical refs at all (probably scraper-only or alias issue) ---",
                no_canonical,
            ),
        )
        for heading, bucket in sections:
            print(heading)
            for name, c in bucket[:top_n]:
                print(f" {c:5d} {name}")
|
|
|
|
|
|
if __name__ == "__main__":
    # The optional first CLI argument sets the per-bucket listing size; 30 when omitted.
    cli_args = sys.argv[1:]
    main(int(cli_args[0]) if cli_args else 30)
|