Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
121 lines
4.6 KiB
Python
121 lines
4.6 KiB
Python
"""One-shot: re-fetch pandamovies movies and add sub-host playback_sources.

The pandamovies theme changed its markup from `<li class="hosts-buttons-wpx">` to
`<div class="Rtable1-cell">` (around 2026-04). The old regex in `dooplay.py` did not
match the new markup — all 15.9k pandamovies movies in the DB have ONLY the
raw landing entry (origin='pandamovies') and zero sub-hosters.

This script re-fetches every pandamovies movie (or a subset if `--ids` is given)
and upserts the sub-host playback_sources (origin='pandamovies:<host>') into
movie_playback_sources. It does NOT touch the raw landing entry — that entry is
hidden in GET /movies/{id} when the movie has sub-hosters.

Hash-skip bypass: instead of going through `ingest_movies_from_connector` (which
compares raw_hash and skips when nothing changed), we call `_fetch_detail`
directly and upsert into MoviePlaybackSource by hand. ON CONFLICT
(movie_id, origin, page_url) DO NOTHING ensures idempotency.

NOTE(review): the INSERT statement in `main()` declares its conflict target as
(origin, page_url), not (movie_id, origin, page_url) as stated above — confirm
which one matches the unique constraint on movie_playback_sources.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
import time
|
|
import uuid
|
|
|
|
from sqlalchemy import select, text
|
|
|
|
sys.path.insert(0, "/srv")
|
|
from app.connectors.dooplay import PandamoviesConnector
|
|
from app.db import SessionLocal
|
|
from app.models.movie import MovieExternalRef
|
|
from app.models.movie_playback_source import MoviePlaybackSource # noqa: F401 (ORM table registration)
|
|
from app.models.source import Source
|
|
|
|
# Logger used by this script for progress and failure messages.
log = logging.getLogger("reingest_panda")
# Configure root logging at import time: INFO level, one timestamped line per record.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
|
|
|
|
# Idempotent insert of one sub-host playback source (PostgreSQL ON CONFLICT).
# NOTE(review): the module docstring names (movie_id, origin, page_url) as the
# conflict target while this statement uses (origin, page_url). PostgreSQL
# requires the target to match a unique index/constraint — confirm against the
# movie_playback_sources table definition before changing either.
_INSERT_PLAYBACK_SOURCE_SQL = """
    INSERT INTO movie_playback_sources (
        id, movie_id, origin, page_url, embed_url,
        thumbnail_url, duration_sec
    ) VALUES (
        gen_random_uuid(), :movie_id, :origin, :page_url, :embed_url,
        :thumbnail_url, :duration_sec
    )
    ON CONFLICT (origin, page_url) DO NOTHING
"""


def _load_pandamovies_refs(ids: list[str] | None, limit: int | None) -> list:
    """Return the ``(movie_id, url)`` rows of the pandamovies refs to re-fetch.

    Args:
        ids: Movie UUID strings to restrict the selection to; a falsy value
            selects every external ref of the ``pandamovies`` source.
        limit: Maximum number of rows to return; ``None`` means no limit.

    Returns:
        Rows ordered by ``movie_id``, fully loaded before the session closes.

    Raises:
        ValueError: If an entry of ``ids`` is not a valid UUID string.
    """
    # Parse the ids first so a typo fails before any DB connection is opened.
    movie_ids = [uuid.UUID(i) for i in ids] if ids else None

    # Short-lived session: only needed for the selection, not for the
    # (potentially hours-long) fetch loop that follows.
    with SessionLocal() as session:
        # scalar_one() requires exactly one Source row named "pandamovies".
        source = session.execute(
            select(Source).where(Source.name == "pandamovies")
        ).scalar_one()

        q = select(MovieExternalRef.movie_id, MovieExternalRef.url).where(
            MovieExternalRef.source_id == source.id
        )
        if movie_ids:
            q = q.where(MovieExternalRef.movie_id.in_(movie_ids))
        q = q.order_by(MovieExternalRef.movie_id)
        # `is not None` so that `--limit 0` selects nothing instead of
        # silently meaning "no limit".
        if limit is not None:
            q = q.limit(limit)

        return session.execute(q).all()


def _fetch_detail_or_none(conn, url):
    """Fetch one movie page via ``conn._fetch_detail``; log and return None on failure."""
    try:
        raw_movie = conn._fetch_detail(url)
    except Exception as e:  # best-effort: one bad page must not abort the run
        log.warning("fetch fail %s: %s", url, e)
        return None
    if raw_movie is None:
        log.warning("fetch fail %s: no detail returned", url)
    return raw_movie


def _insert_sub_host_sources(movie_id, playback_sources) -> tuple[int, int]:
    """Insert one movie's sub-host playback sources in a single transaction.

    Args:
        movie_id: Movie the playback sources belong to.
        playback_sources: Iterable of objects exposing ``origin``,
            ``page_url``, ``embed_url``, ``thumbnail_url`` and
            ``duration_sec``.

    Returns:
        ``(added, skipped_existing)``: rows actually inserted, and rows
        skipped because the ON CONFLICT clause matched an existing row.
    """
    insert_stmt = text(_INSERT_PLAYBACK_SOURCE_SQL)
    added = 0
    skipped_existing = 0
    with SessionLocal() as session, session.begin():
        for pb in playback_sources:
            if pb.origin == "pandamovies":
                continue  # landing — skip, we already have it from the previous ingest
            result = session.execute(
                insert_stmt,
                {
                    "movie_id": movie_id,
                    "origin": pb.origin,
                    "page_url": pb.page_url,
                    "embed_url": pb.embed_url,
                    "thumbnail_url": pb.thumbnail_url,
                    "duration_sec": pb.duration_sec,
                },
            )
            if result.rowcount > 0:
                added += 1
            else:
                skipped_existing += 1
    return added, skipped_existing


def main() -> None:
    """Re-fetch pandamovies movie pages and insert their sub-host playback sources.

    Command-line options:
        --ids    Restrict the run to the given movie UUIDs.
        --limit  Maximum number of movies to process.
        --sleep  Seconds to pause after every fetch attempt (default 0.5).

    Each movie is fetched with ``PandamoviesConnector._fetch_detail`` and its
    playback sources (except the ``origin == "pandamovies"`` landing entry)
    are inserted in a per-movie transaction. Fetch failures are logged and
    counted, never fatal. Final counters are logged as ``DONE {...}``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--ids", nargs="+", help="Limit do tych movie UUIDs (default: wszystkie pandamovies)")
    ap.add_argument("--limit", type=int, default=None, help="Max movies (testing)")
    ap.add_argument("--sleep", type=float, default=0.5, help="Sleep between fetches (rate-limit)")
    args = ap.parse_args()

    rows = _load_pandamovies_refs(args.ids, args.limit)
    log.info("re-ingesting %d pandamovies refs", len(rows))

    conn = PandamoviesConnector()
    stats = {"fetched": 0, "ok": 0, "fail": 0, "added": 0, "skipped_existing": 0}

    for movie_id, url in rows:
        stats["fetched"] += 1

        raw_movie = _fetch_detail_or_none(conn, url)
        if raw_movie is None:
            stats["fail"] += 1
        else:
            added, skipped_existing = _insert_sub_host_sources(
                movie_id, raw_movie.playback_sources
            )
            stats["ok"] += 1
            stats["added"] += added
            stats["skipped_existing"] += skipped_existing

        # Progress and rate-limit pause run for failed fetches too: a failing
        # remote host is exactly when we must not hammer it back-to-back.
        if stats["fetched"] % 50 == 0:
            log.info("progress %d/%d: %s", stats["fetched"], len(rows), stats)
        time.sleep(args.sleep)

    log.info("DONE %s", stats)
|
|
|
|
|
|
# Script entry point: run the re-ingest only when executed directly, not on import.
if __name__ == "__main__":
    main()
|