"""One-shot: re-fetch pandamovies movies + dorzuć sub-host playback_sources.

The pandamovies theme changed its markup around 2026-04 (the old and new
markup snippets originally quoted here did not survive in this docstring —
TODO restore them). The old regex in `dooplay.py` did not match the new
markup, so all 15.9k pandamovies movies in the DB have ONLY the raw landing
entry (origin='pandamovies') and zero sub-hosters.

This script re-fetches every pandamovies movie (or a subset when `--ids` is
given) and upserts the sub-host playback_sources (origin='pandamovies:' plus,
presumably, the sub-host name — TODO confirm) into movie_playback_sources.
It does NOT touch the raw landing entry — that one is hidden in
GET /movies/{id} when the movie has sub-hosters.

Hash-skip bypass: instead of going through `ingest_movies_from_connector`
(which compares raw_hash and skips when nothing changed), we call
`_fetch_detail` directly and upsert into MoviePlaybackSource by hand.
ON CONFLICT (movie_id, origin, page_url) DO NOTHING provides idempotency.
NOTE(review): the INSERT in main() names (origin, page_url) as the conflict
target — confirm which column set matches the table's unique constraint.
"""
from __future__ import annotations
import argparse
import logging
import sys
import time
import uuid
from sqlalchemy import select, text
sys.path.insert(0, "/srv")
from app.connectors.dooplay import PandamoviesConnector
from app.db import SessionLocal
from app.models.movie import MovieExternalRef
from app.models.movie_playback_source import MoviePlaybackSource # noqa: F401 (ORM table registration)
from app.models.source import Source
# Configure root logging once, at import time: INFO threshold, and every
# record is rendered as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Logger used by every function in this script.
log = logging.getLogger("reingest_panda")
# Insert statement shared by every sub-host row. It is kept as a plain string
# and wrapped with text() at call time. ON CONFLICT ... DO NOTHING turns a
# repeated insert of the same row into a no-op, so re-running the script is safe.
# NOTE(review): the module docstring lists (movie_id, origin, page_url) as the
# conflict target, but this statement uses (origin, page_url) — confirm which
# one matches a unique constraint on movie_playback_sources.
_INSERT_SUB_HOST_SQL = """
INSERT INTO movie_playback_sources (
id, movie_id, origin, page_url, embed_url,
thumbnail_url, duration_sec
) VALUES (
gen_random_uuid(), :movie_id, :origin, :page_url, :embed_url,
:thumbnail_url, :duration_sec
)
ON CONFLICT (origin, page_url) DO NOTHING
"""


def _parse_args() -> argparse.Namespace:
    """Parse the command-line flags: --ids, --limit and --sleep."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--ids",
        nargs="+",
        # Let argparse validate the UUIDs so a typo yields a usage error
        # instead of a ValueError traceback.
        type=uuid.UUID,
        help="Limit do tych movie UUIDs (default: wszystkie pandamovies)",
    )
    ap.add_argument("--limit", type=int, default=None, help="Max movies (testing)")
    ap.add_argument("--sleep", type=float, default=0.5, help="Sleep between fetches (rate-limit)")
    return ap.parse_args()


def _load_refs(args: argparse.Namespace) -> list:
    """Return the (movie_id, url) rows of the pandamovies external refs.

    The rows are ordered by movie_id, optionally restricted to ``args.ids``
    and capped at ``args.limit``. The Source row named "pandamovies" is looked
    up with ``scalar_one()``, so the call raises when there is not exactly one
    such row.
    """
    with SessionLocal() as session:
        source = session.execute(
            select(Source).where(Source.name == "pandamovies")
        ).scalar_one()
        query = select(MovieExternalRef.movie_id, MovieExternalRef.url).where(
            MovieExternalRef.source_id == source.id
        )
        if args.ids:
            query = query.where(MovieExternalRef.movie_id.in_(args.ids))
        query = query.order_by(MovieExternalRef.movie_id)
        # Compare against None so an explicit `--limit 0` is honoured instead
        # of silently meaning "no limit".
        if args.limit is not None:
            query = query.limit(args.limit)
        return session.execute(query).all()


def _fetch_detail_or_none(conn, url):
    """Fetch one movie detail page; return None (after logging) on failure.

    Calls the connector's private ``_fetch_detail`` directly, as described in
    the module docstring.
    """
    try:
        raw_movie = conn._fetch_detail(url)
    except Exception as exc:
        # Deliberately broad: one bad page must not abort the whole batch.
        log.warning("fetch fail %s: %s", url, exc)
        return None
    if raw_movie is None:
        log.warning("fetch returned no movie for %s", url)
    return raw_movie


def _store_sub_hosts(movie_id, playback_sources) -> tuple[int, int]:
    """Insert the sub-host playback sources of one movie in one transaction.

    Entries whose origin is exactly "pandamovies" (the landing entry) are
    skipped. Returns ``(added, skipped_existing)``, where the second number
    counts inserts that affected no row because of ON CONFLICT DO NOTHING.
    Database errors propagate to the caller.
    """
    statement = text(_INSERT_SUB_HOST_SQL)
    added = 0
    skipped_existing = 0
    with SessionLocal() as session, session.begin():
        for pb in playback_sources:
            if pb.origin == "pandamovies":
                continue  # landing entry — already stored by the earlier ingest
            result = session.execute(
                statement,
                {
                    "movie_id": movie_id,
                    "origin": pb.origin,
                    "page_url": pb.page_url,
                    "embed_url": pb.embed_url,
                    "thumbnail_url": pb.thumbnail_url,
                    "duration_sec": pb.duration_sec,
                },
            )
            if result.rowcount > 0:
                added += 1
            else:
                skipped_existing += 1
    return added, skipped_existing


def main() -> None:
    """Re-fetch pandamovies detail pages and insert their sub-host sources.

    For every selected external ref the detail page is fetched through
    ``PandamoviesConnector._fetch_detail`` and each non-landing playback
    source is inserted into movie_playback_sources. Counters are logged every
    50 movies and once more at the end.
    """
    args = _parse_args()
    rows = _load_refs(args)
    total = len(rows)
    log.info("re-ingesting %d pandamovies refs", total)

    conn = PandamoviesConnector()
    stats = {"fetched": 0, "ok": 0, "fail": 0, "added": 0, "skipped_existing": 0}
    for index, (movie_id, url) in enumerate(rows, start=1):
        stats["fetched"] += 1
        raw_movie = _fetch_detail_or_none(conn, url)
        if raw_movie is None:
            stats["fail"] += 1
        else:
            added, skipped_existing = _store_sub_hosts(movie_id, raw_movie.playback_sources)
            # Counters are merged only after the transaction has committed.
            stats["ok"] += 1
            stats["added"] += added
            stats["skipped_existing"] += skipped_existing

        # Progress and rate-limiting run for failed fetches too; skipping the
        # sleep after a failure would hit the remote host back-to-back.
        if stats["fetched"] % 50 == 0:
            log.info("progress %d/%d: %s", stats["fetched"], total, stats)
        if index < total:
            time.sleep(args.sleep)
    log.info("DONE %s", stats)


if __name__ == "__main__":
    main()