Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
60 lines
2.1 KiB
Python
60 lines
2.1 KiB
Python
"""Wyszukiwanie kandydatów do dedup movies — blocking + fuzzy title prefilter.
|
|
|
|
Strategia: nie chcemy O(N) score'ować wszystkich filmów dla każdego nowego.
|
|
Blocking: kandydat musi mieć title trigram similarity ≥0.4 (pg_trgm) z incoming title,
|
|
ALBO (same studio AND year w oknie ±1). Plus pierwsze X (np. 50) filmów z każdego
|
|
zbioru — wystarczająco szeroko żeby nie zgubić matchów, dostatecznie wąsko żeby
|
|
score'ować szybko.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from datetime import date
|
|
|
|
from sqlalchemy import or_, select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.models.movie import Movie
|
|
|
|
|
|
def find_movie_candidates(
    session: Session,
    *,
    title_normalized: str,
    studio_id: uuid.UUID | None,
    release_year: int | None,
    limit: int = 50,
    min_title_similarity: float = 0.4,
) -> list[Movie]:
    """Return candidate movies worth scoring against an incoming movie.

    A row qualifies when EITHER of the two blocking predicates holds:

    * Title block: the stored ``title_normalized`` is trigram-similar to the
      incoming one. The bare pg_trgm ``%`` operator only enforces the session's
      ``pg_trgm.similarity_threshold`` (0.3 by default), so the stricter bound
      is additionally enforced with an explicit ``similarity()`` comparison.
      The effective threshold is therefore the larger of the session setting
      and ``min_title_similarity``.
    * Studio/year block: same studio and release year within ±1. This is a
      safety net for SEO-style titles (e.g. "Watch Cece adult..." on one
      source vs. plain "Cece" on another) in case the trigram match misses.

    Results are ordered by title similarity, best first, so that ``limit``
    keeps the strongest title matches instead of an arbitrary subset. Rows
    admitted only by the studio/year block rank last and are the first to be
    truncated when there are more than ``limit`` matches.

    Args:
        session: SQLAlchemy session used to run the query.
        title_normalized: Normalized title of the incoming movie.
        studio_id: Studio of the incoming movie, or ``None`` if unknown.
        release_year: Release year of the incoming movie, or ``None`` if unknown.
        limit: Maximum number of candidates to return.
        min_title_similarity: Minimum pg_trgm ``similarity()`` for the title
            block.

    Returns:
        Up to ``limit`` ``Movie`` rows, highest title similarity first.
    """
    from sqlalchemy import and_, func, literal

    incoming_title = literal(title_normalized)
    title_similarity = func.similarity(Movie.title_normalized, incoming_title)

    # `%` is kept next to the explicit comparison because it is the form a
    # trigram index can serve (NOTE(review): assumes such an index exists on
    # title_normalized — confirm). The similarity() check is what actually
    # guarantees the documented minimum.
    title_block = and_(
        Movie.title_normalized.op("%")(incoming_title),
        title_similarity >= min_title_similarity,
    )

    stmt = (
        select(Movie)
        .where(or_(title_block, _studio_year_block(studio_id, release_year)))
        # PostgreSQL sorts NULLs first under DESC; push rows without a
        # comparable title behind the real matches.
        .order_by(title_similarity.desc().nulls_last())
        .limit(limit)
    )
    return list(session.execute(stmt).scalars())
|
|
|
|
|
|
def _studio_year_block(studio_id: uuid.UUID | None, release_year: int | None):
    """Build the fallback blocking predicate: same studio, release year ±1.

    When either ``studio_id`` or ``release_year`` is ``None`` the block cannot
    be evaluated, so a constant SQL ``false()`` is returned. Inside the
    caller's ``or_()`` that makes this branch match nothing.
    """
    from sqlalchemy import and_, false

    if studio_id is not None and release_year is not None:
        earliest_year = release_year - 1
        latest_year = release_year + 1
        same_studio = Movie.studio_id == studio_id
        nearby_year = Movie.release_year.between(earliest_year, latest_year)
        return and_(same_studio, nearby_year)

    # At least one of the two keys is missing: contribute no rows.
    return false()
|