"""One-shot: re-fetch pandamovies movies + dorzuć sub-host playback_sources. Theme pandamovies zmienił markup z `
  • ` na `
    ` (data ~2026-04). Stary regex w `dooplay.py` nie łapał nowego markupu — wszystkie 15.9k pandamovies movies w DB mają TYLKO raw landing entry (origin='pandamovies'), zero sub-hosters. Ten skrypt re-fetcha każde movie pandamovies (lub subset jeśli `--ids`) i upsertuje sub-host playback_sources (origin='pandamovies:') do movie_playback_sources. NIE dotyka raw landing entry — to ukrywane jest w GET /movies/{id} jeśli movie ma sub-hosters. Hash-skip bypass: zamiast iść przez `ingest_movies_from_connector` (które porównuje raw_hash i pomija jeśli no change), wywołujemy `_fetch_detail` bezpośrednio i upsertujemy do MoviePlaybackSource ręcznie. ON CONFLICT (movie_id, origin, page_url) DO NOTHING zapewnia idempotencję. """ from __future__ import annotations import argparse import logging import sys import time import uuid from sqlalchemy import select, text sys.path.insert(0, "/srv") from app.connectors.dooplay import PandamoviesConnector from app.db import SessionLocal from app.models.movie import MovieExternalRef from app.models.movie_playback_source import MoviePlaybackSource # noqa: F401 (ORM table registration) from app.models.source import Source log = logging.getLogger("reingest_panda") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--ids", nargs="+", help="Limit do tych movie UUIDs (default: wszystkie pandamovies)") ap.add_argument("--limit", type=int, default=None, help="Max movies (testing)") ap.add_argument("--sleep", type=float, default=0.5, help="Sleep between fetches (rate-limit)") args = ap.parse_args() with SessionLocal() as session: source = session.execute(select(Source).where(Source.name == "pandamovies")).scalar_one() q = select(MovieExternalRef.movie_id, MovieExternalRef.url).where( MovieExternalRef.source_id == source.id ) if args.ids: ids = [uuid.UUID(i) for i in args.ids] q = q.where(MovieExternalRef.movie_id.in_(ids)) q = q.order_by(MovieExternalRef.movie_id) if args.limit: q = q.limit(args.limit) rows = session.execute(q).all() log.info("re-ingesting %d pandamovies refs", len(rows)) conn = PandamoviesConnector() stats = {"fetched": 0, "ok": 0, "fail": 0, "added": 0, "skipped_existing": 0} for movie_id, url in rows: stats["fetched"] += 1 try: raw_movie = conn._fetch_detail(url) except Exception as e: log.warning("fetch fail %s: %s", url, e) stats["fail"] += 1 continue if raw_movie is None: stats["fail"] += 1 continue added = 0 with SessionLocal() as session, session.begin(): for pb in raw_movie.playback_sources: if pb.origin == "pandamovies": continue # landing — skip, mamy go już z poprzedniego ingest # Idempotent insert via PostgreSQL ON CONFLICT result = session.execute( text( """ INSERT INTO movie_playback_sources ( id, movie_id, origin, page_url, embed_url, thumbnail_url, duration_sec ) VALUES ( gen_random_uuid(), :movie_id, :origin, :page_url, :embed_url, :thumbnail_url, :duration_sec ) ON CONFLICT (origin, page_url) DO NOTHING """ ), { "movie_id": movie_id, "origin": pb.origin, "page_url": pb.page_url, "embed_url": pb.embed_url, "thumbnail_url": pb.thumbnail_url, "duration_sec": pb.duration_sec, }, ) if result.rowcount > 0: added += 1 else: stats["skipped_existing"] += 1 stats["ok"] += 1 stats["added"] += added if stats["fetched"] % 50 == 0: log.info("progress %d/%d: %s", stats["fetched"], len(rows), stats) time.sleep(args.sleep) log.info("DONE %s", stats) if __name__ == "__main__": main()