goon/scripts/backfill_durations.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

96 lines
3.6 KiB
Python

"""Backfill duration_sec dla playback_sources gdzie NULL i origin w setupie.
Uzasadnienie: 2026-05-17 zmieniliśmy `duration_extract.py` (dodanie `itemprop`
+ `PnDTnHnMnS` ISO format). Sceny ingested wcześniej mają `duration_sec=NULL`
mimo że detail page faktycznie ma duration. Ten skrypt re-fetchuje i updatuje.
Bezpieczny do uruchomienia wielokrotnie — `WHERE duration_sec IS NULL` filter
sprawia że już zbackfillowane są pomijane.
Uruchomienie: `python -m scripts.backfill_durations --origin tube:xvideoscom --limit 500`
Albo wszystkie problematyczne: `--origins tube:xvideoscom,tube:xnxxcom,...`
"""
from __future__ import annotations
import argparse
import logging
import time
from collections.abc import Sequence
from sqlalchemy import select, update
from app.db import session_scope
from app.extractors._fetch import browser_get
from app.extractors.duration_extract import extract_duration_sec
from app.models.playback_source import PlaybackSource
# Module-level logger named after this module; handlers are configured in main().
log = logging.getLogger(__name__)
def backfill(origins: Sequence[str], limit: int, throttle_sec: float) -> dict[str, int]:
    """Re-fetch detail pages and fill in missing ``duration_sec`` values.

    Selects up to ``limit`` playback sources whose origin is in ``origins``,
    whose ``duration_sec`` is NULL and whose ``dead_at`` is NULL, fetches each
    ``page_url`` and stores the duration extracted from the response body.

    Rows whose page cannot be fetched or parsed keep ``duration_sec`` NULL, so
    a later run will select them again.

    Args:
        origins: Origin identifiers to process (e.g. ``"tube:xvideoscom"``).
        limit: Maximum number of rows handled in this run.
        throttle_sec: Seconds to sleep between consecutive requests.

    Returns:
        Counters keyed by ``checked``, ``updated``, ``extract_fail`` and
        ``fetch_fail``.
    """
    counters = {"checked": 0, "updated": 0, "extract_fail": 0, "fetch_fail": 0}
    if not origins:
        # Nothing to match against; skip the database round-trip entirely.
        log.warning("backfill skipped: no origins given")
        return counters

    commit_every = 25  # rows between checkpoint commits / progress lines

    with session_scope() as session:
        # .all() materialises the work list, so the loop walks a plain list
        # while the session is committed periodically below.
        rows = session.execute(
            select(PlaybackSource.id, PlaybackSource.origin, PlaybackSource.page_url)
            .where(PlaybackSource.origin.in_(origins))
            .where(PlaybackSource.duration_sec.is_(None))
            .where(PlaybackSource.dead_at.is_(None))
            .limit(limit)
        ).all()
        total = len(rows)
        log.info("backfill start: origins=%s rows=%d", list(origins), total)

        for i, (pb_id, _origin, url) in enumerate(rows, 1):
            counters["checked"] += 1

            # Best effort: a single unreachable page must not abort the run.
            try:
                r = browser_get(url, timeout=10.0, follow_redirects=True)
            except Exception as e:
                fetched = False
                log.debug("fetch fail %s: %s", url, e)
            else:
                fetched = r.status_code < 400
                if not fetched:
                    log.debug("fetch fail %s: HTTP %s", url, r.status_code)

            if not fetched:
                counters["fetch_fail"] += 1
            else:
                dur = extract_duration_sec(r.text)
                if dur is None:
                    counters["extract_fail"] += 1
                else:
                    session.execute(
                        update(PlaybackSource)
                        .where(PlaybackSource.id == pb_id)
                        .values(duration_sec=dur)
                    )
                    counters["updated"] += 1

            # Checkpoint on every 25th row regardless of the outcome above, so
            # a failed fetch cannot postpone the commit or the progress line.
            if i % commit_every == 0:
                session.commit()
                log.info("progress %d/%d updated=%d", i, total, counters["updated"])

            # Throttle only between requests; no need to wait after the last.
            if i < total:
                time.sleep(throttle_sec)

        # Flush whatever was updated since the last checkpoint.
        session.commit()
    return counters
def main() -> int:
    """Command-line entry point: parse options, run the backfill, log totals.

    Returns:
        Process exit status; ``0`` once the run has completed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--origins",
        help="CSV originów do backfillu",
        default="tube:xvideoscom,tube:xnxxcom,tube:pornhatcom,tube:sxylandcom,tube:0dayxxcom",
    )
    cli.add_argument("--limit", help="Max rows per run", default=2000, type=int)
    cli.add_argument("--throttle", help="Sekundy między requestami", default=0.35, type=float)
    cli.add_argument("--verbose", action="store_true")
    opts = cli.parse_args()

    # --verbose switches the root logger from INFO to DEBUG.
    if opts.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(message)s")

    # Split the CSV value, trimming whitespace and dropping empty entries.
    origin_list = []
    for chunk in opts.origins.split(","):
        name = chunk.strip()
        if name:
            origin_list.append(name)

    totals = backfill(origin_list, opts.limit, opts.throttle)
    log.info("backfill done: %s", totals)
    return 0
if __name__ == "__main__":
    # Run the CLI and propagate its return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)