Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
99 lines
3.7 KiB
Python
99 lines
3.7 KiB
Python
"""Re-resolve freshporno orphans przeciw świeżej canonical bazie.
|
||
|
||
Dla każdej freshporno scene bez TPDB/StashDB ref:
|
||
- bierze jej phash(e)
|
||
- find_by_phash_within w hamming ≤ 5
|
||
- jeśli match → score scene pair → auto-merge gdy threshold met
|
||
|
||
run_bulk_dedup (strategia 'phash' / O(N²) cross-product) OOM-uje przy 197k phashes
|
||
(7GB RAM w worker container, OOM-killer zabija). Per-orphan approach robi 11k
|
||
queries × 197k seq scan = 2.2 mld comparisons ale stream-wise, ~50 MB RAM peak.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import uuid
|
||
|
||
from sqlalchemy import select
|
||
|
||
from app.db import session_scope
|
||
from app.models.scene import Scene, SceneFingerprint
|
||
from app.resolve.scene_match import find_by_phash_within
|
||
from app.scheduler.bulk_dedup import _process_pair, BulkCounters
|
||
from app.config import get_settings
|
||
|
||
# Configure root logging once for this script: INFO level, with timestamp,
# level, logger name and message on every line.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)

# Module-level logger used by everything below.
log = logging.getLogger(__name__)
|
||
|
||
|
||
# Raw driver-level SQL (positional %s placeholders) selecting the distinct
# scene IDs that have a non-dead playback source of a given origin and no
# external ref pointing at either of two named sources.
# Parameters, in order: (origin, source_name_1, source_name_2).
ORPHANS_QUERY = """
SELECT DISTINCT ps.scene_id
FROM playback_sources ps
WHERE ps.origin = %s AND ps.dead_at IS NULL
  AND NOT EXISTS (
    SELECT 1 FROM scene_external_refs er JOIN sources s ON s.id = er.source_id
    WHERE er.scene_id = ps.scene_id AND s.name IN (%s, %s)
  )
"""
|
||
|
||
|
||
def _find_merge_candidate(
    orphan_id: uuid.UUID,
    max_h: int,
    candidates_seen: set[tuple[uuid.UUID, uuid.UUID]],
) -> tuple[uuid.UUID, uuid.UUID] | None:
    """Look up the first new phash-match pair for one orphan scene.

    Opens its own session, loads the orphan's ``phash`` fingerprints and asks
    ``find_by_phash_within`` for a match for each of them in turn.

    Args:
        orphan_id: ID of the scene to re-resolve.
        max_h: Maximum hamming distance passed to ``find_by_phash_within``.
        candidates_seen: Ordered ``(smaller_id, larger_id)`` pairs that were
            already handled; a newly found pair is added to it in place.

    Returns:
        The ordered pair to process, or ``None`` when the orphan has no
        phashes, nothing matched, or the first matching pair was already seen.
    """
    with session_scope() as sess:
        phashes = sess.execute(
            select(SceneFingerprint.value).where(
                SceneFingerprint.scene_id == orphan_id,
                SceneFingerprint.kind == "phash",
            )
        ).scalars().all()

        for ph in phashes:
            match = find_by_phash_within(sess, phash=ph, max_hamming=max_h)
            if match is None:
                continue
            # The second element (the distance) is not needed here.
            matched_scene, _distance = match
            matched_id = matched_scene.id
            if matched_id == orphan_id:
                # A scene matching its own fingerprint is not a candidate.
                continue

            # Normalise the pair order so (a, b) and (b, a) dedupe to one key.
            if orphan_id < matched_id:
                pair = (orphan_id, matched_id)
            else:
                pair = (matched_id, orphan_id)

            if pair in candidates_seen:
                # Candidate already checked; stop looking for this orphan.
                return None
            candidates_seen.add(pair)
            # One match per orphan is enough.
            return pair
    return None


def main() -> None:
    """Re-resolve freshporno orphan scenes one by one via phash lookup.

    Stage 1 collects the orphan scene IDs with ``ORPHANS_QUERY``. Stage 2
    walks them, finds at most one phash-match pair per orphan and hands it to
    ``_process_pair`` with the thresholds read from settings. A failure on a
    single orphan is logged and does not abort the run.
    """
    settings = get_settings()
    auto_t = settings.auto_merge_threshold
    review_t = settings.review_threshold
    max_h = settings.fingerprint_hamming_max
    counters = BulkCounters()

    # Stage 1: collect the list of orphan scene IDs.
    with session_scope() as sess:
        orphan_ids = [
            row[0]
            for row in sess.connection().exec_driver_sql(
                ORPHANS_QUERY, ("tube:freshpornoorg", "tpdb", "stashdb"),
            )
        ]
    log.info("found %d freshporno orphans to re-resolve", len(orphan_ids))

    # Stage 2: per orphan, query for a phash match.
    candidates_seen: set[tuple[uuid.UUID, uuid.UUID]] = set()
    for i, orphan_id in enumerate(orphan_ids, 1):
        if i % 500 == 0:
            log.info("progress %d/%d, candidates=%d, counters=%s",
                     i, len(orphan_ids), len(candidates_seen), counters)
        try:
            pair = _find_merge_candidate(orphan_id, max_h, candidates_seen)
            if pair is None:
                continue
            # Process the pair only after the lookup session has been closed,
            # so the read transaction is not held open during the merge.
            # NOTE(review): _process_pair receives no session, so it presumably
            # opens its own transaction -- confirm in app.scheduler.bulk_dedup.
            _process_pair(
                pair[0], pair[1],
                auto_t=auto_t, review_t=review_t,
                dry_run=False, counters=counters,
                cross_source_only=False,
            )
        except Exception as e:
            # Per-orphan boundary: log with traceback and move on to the next.
            log.exception("orphan %s re-resolve failed: %s", orphan_id, e)

    log.info("DONE: orphans=%d candidates=%d counters=%s",
             len(orphan_ids), len(candidates_seen), counters)
|
||
|
||
|
||
# Script entry point: run the re-resolve pass only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|