feat(sxyprn): refresh rotting thumbnails from live post pages + scheduled job

CORRECTION to earlier "unrecoverable" call: the /post/<id> page is alive (200) and
DOES expose the scene's own fresh-signed poster via og:image / <video poster>
(post-id embedded, current timestamp) — only the STORED thumbnail URL had rotted.
Search/listings don't re-surface old posts (0 overlap), but per-post fetch works.

scripts/refresh_sxyprn_thumbs.py: iterate live sxyprn sources, fetch post page,
extract fresh og:image, UPDATE thumbnail_url (verified: refreshed URLs return 200).
_job_refresh_sxyprn_thumbs: every 12h refresh the 1200 least-recently-updated sources
(cycles the ~19k catalog within the expiry window). Pairs with the scene_resolver
overwrite fix so refreshed thumbnails stick.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-10 10:36:30 +02:00
parent bb9e1afc31
commit fef28ae56b
2 changed files with 154 additions and 0 deletions

View file

@ -193,6 +193,48 @@ def _job_refresh_taxonomy_counts() -> None:
log.exception("[scheduler] taxonomy counts refresh failed") log.exception("[scheduler] taxonomy counts refresh failed")
def _job_refresh_sxyprn_thumbs(batch: int = 1200) -> None:
    """Refresh expired sxyprn thumbnails from the live /post/<id> pages (bug 2026-06-10).

    sxyprn/trafficdeposit thumbnail URLs are time-signed and rot (the token expires
    after ~weeks and the URL then 404s), but the post page stays alive and carries a
    fresh poster (``og:image``). Search/listings do NOT re-surface old posts, so a
    per-post page fetch is the only route.

    Each run takes the ``batch`` least-recently-updated live sources, so repeated runs
    are meant to cycle through the whole catalog within the expiry window. For that
    rotation to actually advance, every processed row gets its ``updated_at`` bumped
    here explicitly (the raw-SQL UPDATE does not go through any ORM ``onupdate`` hook):

    * refreshed rows get the new ``thumbnail_url`` and a new ``updated_at``;
    * rows that yielded no thumbnail only get ``updated_at`` touched, so a block of
      permanently failing posts cannot pin itself to the head of the queue and
      starve the rest of the catalog. The trade-off is that a transient fetch
      failure is retried only after a full cycle.

    See scripts/refresh_sxyprn_thumbs.py for the extraction logic.

    Args:
        batch: maximum number of sources processed in one run.
    """
    log.info("[scheduler] sxyprn thumb refresh starting (batch=%d)", batch)

    def _run() -> None:
        # Imported lazily, inside the job body, as in the original implementation.
        from sqlalchemy import text

        from app.db import session_scope
        from scripts.refresh_sxyprn_thumbs import _fresh_thumb

        with session_scope() as session:
            rows = session.execute(
                text(
                    "SELECT id, page_url FROM playback_sources "
                    "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
                    "ORDER BY updated_at ASC LIMIT :n"
                ).bindparams(n=batch)
            ).all()

        refresh_stmt = text(
            "UPDATE playback_sources "
            "SET thumbnail_url=:t, updated_at=CURRENT_TIMESTAMP WHERE id=:i"
        )
        touch_stmt = text(
            "UPDATE playback_sources SET updated_at=CURRENT_TIMESTAMP WHERE id=:i"
        )

        updated = 0
        for pbid, page_url in rows:
            thumb = _fresh_thumb(page_url)
            if thumb:
                stmt = refresh_stmt.bindparams(t=thumb, i=pbid)
            else:
                # Nothing to store, but still advance the rotation cursor.
                stmt = touch_stmt.bindparams(i=pbid)
            # One short transaction per row: the page fetch above is slow, so no
            # transaction is ever held open across network I/O.
            with session_scope() as session:
                session.execute(stmt)
                session.commit()
            if thumb:
                updated += 1
        log.info("[scheduler] sxyprn thumb refresh done: %d/%d", updated, len(rows))

    _run_with_timeout(_run, label="sxyprn-thumb-refresh")
def _job_bulk_dedup_performers() -> None: def _job_bulk_dedup_performers() -> None:
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które """Pair-wise dedup po performer overlap — safety net dla duplikatów które
resolver-time scoring nie złapał. resolver-time scoring nie złapał.
@ -381,12 +423,30 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
) )
log.info("scheduler: reap-stuck every %dh", reap_hours) log.info("scheduler: reap-stuck every %dh", reap_hours)
# sxyprn thumbnail refresh — sxyprn miniaturki rotują (signed CDN, 404 po ~tygodniach).
# Domyślnie ZAWSZE on co 12h, batch najdawniej-aktualizowanych → cykl po katalogu w
# ~tydzień (mieści się w oknie wygaśnięcia). Bug 2026-06-10.
sxyprn_hours = cfg.get("sxyprn_thumb_refresh_hours", 12)
if sxyprn_hours:
batch = cfg.get("sxyprn_thumb_refresh_batch", 1200)
sched.add_job(
lambda: _job_refresh_sxyprn_thumbs(batch),
IntervalTrigger(hours=sxyprn_hours, start_date=INTERVAL_ANCHOR),
id="sxyprn_thumb_refresh",
replace_existing=True,
max_instances=1,
coalesce=True,
)
log.info("scheduler: sxyprn-thumb-refresh every %dh (batch=%d)", sxyprn_hours, batch)
return sched return sched
DEFAULT_CONFIG: dict[str, Any] = { DEFAULT_CONFIG: dict[str, Any] = {
"tpdb_hours": 6, "tpdb_hours": 6,
"stashdb_hours": 6, "stashdb_hours": 6,
"sxyprn_thumb_refresh_hours": 12,
"sxyprn_thumb_refresh_batch": 1200,
"performer_driven_hours": 12, "performer_driven_hours": 12,
"performer_driven_top_n": 20, "performer_driven_top_n": 20,
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100 # Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100

View file

@ -0,0 +1,94 @@
"""Refresh expired sxyprn thumbnails from their (still live) post pages (bug 2026-06-10).

sxyprn/trafficdeposit thumbnail URLs are time-signed and ROT (the token expires after
~weeks → 404), BUT the /post/<id>.html page itself is ALIVE (200) and contains the
scene's fresh poster in `og:image` / `<video poster=>` (post id in the path, current
timestamp). Search/listings do NOT re-surface old posts (0 overlap), so the only route
is fetching the per-post page.

Strategy: iterate live sxyprn playback_sources, fetch the post page, extract og:image
(fresh signed thumbnail), UPDATE thumbnail_url. Resumable (--offset), idempotent.
Thumbnails keep rotting → run periodically (scheduled job _job_refresh_sxyprn_thumbs).

Usage (worker container):
    python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
"""
from __future__ import annotations
import re
import sys
import time
from sqlalchemy import text
from app.db import session_scope
from app.extractors import browser_get
# <meta property="og:image" content="..."> with the property attribute first.
_OG_RE = re.compile(r"og:image[\"'][^>]*content=[\"']([^\"']+)", re.IGNORECASE)
# Same tag with the attributes in the opposite order (content before property).
_OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image", re.IGNORECASE)
# Fallback: the poster attribute of the <video> element.
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)


def _extract_thumb(page_html: str) -> str | None:
    """Extract the poster URL from post-page HTML, or None when there is none.

    Returns None when the page is the "Post Not Found" placeholder, when neither
    an og:image tag nor a <video poster> is present, or when the candidate URL
    mentions neither trafficdeposit.com nor sxyprn.
    """
    if "Post Not Found" in page_html:
        return None
    match = (
        _OG_RE.search(page_html)
        or _OG_RE2.search(page_html)
        or _POSTER_RE.search(page_html)
    )
    if not match:
        return None
    url = match.group(1).strip()
    # The value comes straight out of an HTML attribute, where "&" is written as
    # "&amp;". Decode only that entity: html.unescape() would also rewrite legacy
    # semicolon-less entities and corrupt query parameters such as "&timestamp=".
    url = url.replace("&amp;", "&")
    if url.startswith("//"):
        # Protocol-relative URL: pin it to https.
        url = "https:" + url
    # Sanity check: only accept URLs that mention one of the expected hosts.
    if "trafficdeposit.com" not in url and "sxyprn" not in url:
        return None
    return url


def _fresh_thumb(page_url: str) -> str | None:
    """Fetch the post page and return its fresh poster (og:image / video poster).

    Best-effort: returns None when the fetch raises, when the page is dead
    ("Post Not Found"), or when no acceptable poster URL is found.
    """
    try:
        page_html = browser_get(page_url, timeout=25).text
    except Exception:
        # Deliberate best-effort: any fetch failure just means "no thumbnail now".
        return None
    return _extract_thumb(page_html)
def main() -> None:
    """Refresh thumbnails for live sxyprn playback sources from the command line.

    Options:
        --limit N   maximum number of sources to process (default 1_000_000).
        --offset M  number of sources (ordered by id) to skip, to resume a run
                    (default 0).

    Progress is printed every 200 sources and a summary at the end.
    """
    # Local import keeps the new dependency inside the script entry point.
    import argparse

    parser = argparse.ArgumentParser(
        description="Refresh expired sxyprn thumbnails from live post pages."
    )
    parser.add_argument("--limit", type=int, default=1_000_000, metavar="N")
    parser.add_argument("--offset", type=int, default=0, metavar="M")
    args = parser.parse_args()
    limit = args.limit
    offset = args.offset

    with session_scope() as s:
        rows = s.execute(
            text(
                "SELECT id, page_url FROM playback_sources "
                "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
                "ORDER BY id OFFSET :off LIMIT :lim"
            ).bindparams(off=offset, lim=limit)
        ).all()
    print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)

    # updated_at is bumped together with the thumbnail so that the scheduled job,
    # which picks the least-recently-updated sources, does not immediately redo
    # the rows this script has just refreshed.
    refresh_stmt = text(
        "UPDATE playback_sources "
        "SET thumbnail_url=:t, updated_at=CURRENT_TIMESTAMP WHERE id=:i"
    )

    updated = 0
    nothumb = 0
    for done, (pbid, page_url) in enumerate(rows, start=1):
        thumb = _fresh_thumb(page_url)
        if thumb:
            # One short transaction per row; nothing is held open across fetches.
            with session_scope() as s:
                s.execute(refresh_stmt.bindparams(t=thumb, i=pbid))
                s.commit()
            updated += 1
        else:
            nothumb += 1
        if done % 200 == 0:
            print(f" {done}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
        # Throttle requests to the site.
        time.sleep(0.25)
    print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)


if __name__ == "__main__":
    main()