Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
96 lines
3.6 KiB
Python
96 lines
3.6 KiB
Python
"""Backfill duration_sec dla playback_sources gdzie NULL i origin w setupie.
|
|
|
|
Uzasadnienie: 2026-05-17 zmieniliśmy `duration_extract.py` (dodanie `itemprop`
|
|
+ `PnDTnHnMnS` ISO format). Sceny ingested wcześniej mają `duration_sec=NULL`
|
|
mimo że detail page faktycznie ma duration. Ten skrypt re-fetchuje i updatuje.
|
|
|
|
Bezpieczny do uruchomienia wielokrotnie — `WHERE duration_sec IS NULL` filter
|
|
sprawia że już zbackfillowane są pomijane.
|
|
|
|
Uruchomienie: `python -m scripts.backfill_durations --origins tube:xvideoscom --limit 500`
|
|
Albo wszystkie problematyczne: `--origins tube:xvideoscom,tube:xnxxcom,...`
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import time
|
|
from collections.abc import Sequence
|
|
|
|
from sqlalchemy import select, update
|
|
|
|
from app.db import session_scope
|
|
from app.extractors._fetch import browser_get
|
|
from app.extractors.duration_extract import extract_duration_sec
|
|
from app.models.playback_source import PlaybackSource
|
|
|
|
# Module-level logger named after this module; its level and format are
# configured by main() through logging.basicConfig.
log = logging.getLogger(__name__)
|
|
|
|
|
|
def backfill(origins: Sequence[str], limit: int, throttle_sec: float) -> dict:
    """Re-fetch detail pages and fill in missing ``duration_sec`` values.

    Selects up to ``limit`` ``PlaybackSource`` rows whose ``origin`` is one of
    ``origins`` and whose ``duration_sec`` and ``dead_at`` are both NULL,
    fetches each row's ``page_url`` and, when ``extract_duration_sec`` finds a
    duration in the response body, writes it back to that row.

    Every row takes the same path through the periodic commit, the progress
    log and the throttle sleep, whether or not its fetch succeeded. Progress
    therefore stays visible even when an origin rejects every request.

    Args:
        origins: Origin identifiers to match against ``PlaybackSource.origin``.
        limit: Maximum number of rows selected for this run.
        throttle_sec: Seconds to sleep after each processed row.

    Returns:
        A dict of counters with the keys ``checked``, ``updated``,
        ``extract_fail`` and ``fetch_fail``.
    """
    counters = {"checked": 0, "updated": 0, "extract_fail": 0, "fetch_fail": 0}
    with session_scope() as session:
        rows = session.execute(
            select(PlaybackSource.id, PlaybackSource.origin, PlaybackSource.page_url)
            .where(PlaybackSource.origin.in_(origins))
            .where(PlaybackSource.duration_sec.is_(None))
            .where(PlaybackSource.dead_at.is_(None))
            .limit(limit)
        ).all()

        total = len(rows)
        log.info("backfill start: origins=%s rows=%d", list(origins), total)
        for i, (pb_id, origin, url) in enumerate(rows, 1):
            counters["checked"] += 1
            dur = None
            try:
                r = browser_get(url, timeout=10.0, follow_redirects=True)
            except Exception as e:
                # Best-effort batch job: one unreachable page must not abort
                # the run, so the failure is counted and the loop moves on.
                counters["fetch_fail"] += 1
                log.debug("fetch fail %s: %s", url, e)
            else:
                if r.status_code >= 400:
                    counters["fetch_fail"] += 1
                    log.debug("fetch fail %s (%s): HTTP %s", url, origin, r.status_code)
                else:
                    dur = extract_duration_sec(r.text)
                    if dur is None:
                        counters["extract_fail"] += 1
                        log.debug("extract fail %s (%s)", url, origin)

            if dur is not None:
                session.execute(
                    update(PlaybackSource)
                    .where(PlaybackSource.id == pb_id)
                    .values(duration_sec=dur)
                )
                counters["updated"] += 1

            # Runs for failed rows too: pending updates are flushed every 25
            # rows and the progress line appears regardless of row outcome.
            if i % 25 == 0:
                session.commit()
                log.info("progress %d/%d updated=%d", i, total, counters["updated"])
            time.sleep(throttle_sec)

        # Persist whatever was updated since the last batch commit.
        session.commit()
    return counters
|
|
|
|
|
|
def main() -> int:
    """Parse command-line options, run the backfill and log its counters.

    Options:
        --origins: Comma-separated origin identifiers; blank entries are
            dropped and surrounding whitespace is stripped.
        --limit: Maximum number of rows to process in this run.
        --throttle: Seconds to sleep between requests.
        --verbose: Log at DEBUG instead of INFO.

    Returns:
        0 once the backfill has completed.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) when ``--origins``
            contains no usable entry, ``--limit`` is below 1, or
            ``--throttle`` is negative, NaN or infinite.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--origins",
        default="tube:xvideoscom,tube:xnxxcom,tube:pornhatcom,tube:sxylandcom,tube:0dayxxcom",
        help="CSV originów do backfillu",
    )
    parser.add_argument("--limit", type=int, default=2000, help="Max rows per run")
    parser.add_argument("--throttle", type=float, default=0.35, help="Sekundy między requestami")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Validate before any work starts, so a bad value fails immediately
    # instead of surfacing after the first page fetch.
    origins = [o.strip() for o in args.origins.split(",") if o.strip()]
    if not origins:
        parser.error("--origins must name at least one origin")
    if args.limit < 1:
        # A negative LIMIT is an error on some databases and means
        # "no limit" on others (e.g. SQLite), so it is never passed through.
        parser.error("--limit must be a positive integer")
    if not 0 <= args.throttle < float("inf"):
        # The chained comparison is also False for NaN; time.sleep() would
        # raise on negative, NaN or infinite values.
        parser.error("--throttle must be a finite, non-negative number of seconds")

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
    counters = backfill(origins, args.limit, args.throttle)
    log.info("backfill done: %s", counters)
    return 0
|
|
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|