goon/scripts/reresolve_freshporno_orphans.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

99 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Re-resolve freshporno orphans przeciw świeżej canonical bazie.
Dla każdej freshporno scene bez TPDB/StashDB ref:
- bierze jej phash(e)
- find_by_phash_within w hamming ≤ 5
- jeśli match → score scene pair → auto-merge gdy threshold met
run_bulk_dedup (strategia 'phash' / O(N²) cross-product) OOM-uje przy 197k phashes
(7GB RAM w worker container, OOM-killer zabija). Per-orphan approach robi 11k
queries × 197k seq scan = 2.2 mld comparisons ale stream-wise, ~50 MB RAM peak.
"""
from __future__ import annotations
import logging
import uuid
from sqlalchemy import select
from app.db import session_scope
from app.models.scene import Scene, SceneFingerprint
from app.resolve.scene_match import find_by_phash_within
from app.scheduler.bulk_dedup import _process_pair, BulkCounters
from app.config import get_settings
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
log = logging.getLogger(__name__)
ORPHANS_QUERY = """
SELECT DISTINCT ps.scene_id
FROM playback_sources ps
WHERE ps.origin = %s AND ps.dead_at IS NULL
AND NOT EXISTS (
SELECT 1 FROM scene_external_refs er JOIN sources s ON s.id = er.source_id
WHERE er.scene_id = ps.scene_id AND s.name IN (%s, %s)
)
"""
def main() -> None:
settings = get_settings()
auto_t = settings.auto_merge_threshold
review_t = settings.review_threshold
max_h = settings.fingerprint_hamming_max
counters = BulkCounters()
# Etap 1: zbieram listę orphan scene IDs
with session_scope() as sess:
rows = list(sess.connection().exec_driver_sql(
ORPHANS_QUERY, ("tube:freshpornoorg", "tpdb", "stashdb"),
))
orphan_ids = [r[0] for r in rows]
log.info("found %d freshporno orphans to re-resolve", len(orphan_ids))
# Etap 2: per orphan, query phash match
candidates_seen: set[tuple[uuid.UUID, uuid.UUID]] = set()
for i, orphan_id in enumerate(orphan_ids, 1):
if i % 500 == 0:
log.info("progress %d/%d, candidates=%d, counters=%s",
i, len(orphan_ids), len(candidates_seen), counters)
try:
with session_scope() as sess:
# Bierz phashe orphana
phashes = sess.execute(
select(SceneFingerprint.value).where(
SceneFingerprint.scene_id == orphan_id,
SceneFingerprint.kind == "phash",
)
).scalars().all()
if not phashes:
continue
for ph in phashes:
match = find_by_phash_within(sess, phash=ph, max_hamming=max_h)
if match is None:
continue
matched_scene, distance = match
if matched_scene.id == orphan_id:
continue
pair = (orphan_id, matched_scene.id) if orphan_id < matched_scene.id else (matched_scene.id, orphan_id)
if pair in candidates_seen:
break # już sprawdzony candidate
candidates_seen.add(pair)
# Process pair w osobnej transakcji
_process_pair(
pair[0], pair[1],
auto_t=auto_t, review_t=review_t,
dry_run=False, counters=counters,
cross_source_only=False,
)
break # 1 match wystarcza na orphan
except Exception as e:
log.exception("orphan %s re-resolve failed: %s", orphan_id, e)
log.info("DONE: orphans=%d candidates=%d counters=%s",
len(orphan_ids), len(candidates_seen), counters)
if __name__ == "__main__":
main()