feat(sxyprn): refresh rotting thumbnails from live post pages + scheduled job
CORRECTION to earlier "unrecoverable" call: the /post/<id> page is alive (200) and DOES expose the scene's own fresh-signed poster via og:image / <video poster> (post-id embedded, current timestamp) — only the STORED thumbnail URL had rotted. Search/listings don't re-surface old posts (0 overlap), but per-post fetch works. scripts/refresh_sxyprn_thumbs.py: iterate live sxyprn sources, fetch post page, extract fresh og:image, UPDATE thumbnail_url (verified: refreshed URLs return 200). _job_refresh_sxyprn_thumbs: every 12h refresh the 1200 least-recently-updated sources (cycles the ~19k catalog within the expiry window). Pairs with the scene_resolver overwrite fix so refreshed thumbnails stick. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
bb9e1afc31
commit
fef28ae56b
2 changed files with 154 additions and 0 deletions
|
|
@ -193,6 +193,48 @@ def _job_refresh_taxonomy_counts() -> None:
|
||||||
log.exception("[scheduler] taxonomy counts refresh failed")
|
log.exception("[scheduler] taxonomy counts refresh failed")
|
||||||
|
|
||||||
|
|
||||||
|
def _job_refresh_sxyprn_thumbs(batch: int = 1200) -> None:
    """Refresh expired sxyprn thumbnails from the live /post/<id> pages (bug 2026-06-10).

    sxyprn/trafficdeposit thumbnails are time-signed and rot (the token expires
    after ~weeks -> 404), but the post page stays alive and carries a fresh
    poster (`og:image`). Search/listings do NOT re-surface old posts, so a
    per-post page fetch is the only way to get a fresh URL.

    Each run takes the `batch` least-recently-updated live sources. For the
    selection to actually rotate through the catalog, a refreshed row must move
    to the back of the `updated_at` ordering, so the UPDATE bumps `updated_at`
    together with `thumbnail_url`. See scripts/refresh_sxyprn_thumbs.py.

    Args:
        batch: maximum number of playback sources to process in one run.
    """
    log.info("[scheduler] sxyprn thumb refresh starting (batch=%d)", batch)

    def _run() -> None:
        # Imported lazily so merely loading the scheduler module does not pull
        # in the DB layer or the scraping script.
        from sqlalchemy import text

        from app.db import session_scope
        from scripts.refresh_sxyprn_thumbs import _fresh_thumb

        select_stmt = text(
            "SELECT id, page_url FROM playback_sources "
            "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
            "ORDER BY updated_at ASC LIMIT :n"
        )
        # Bump updated_at explicitly: a raw-SQL UPDATE does not go through any
        # ORM-level "onupdate" hook, and without it the ORDER BY above would
        # keep returning the same rows on every run.
        update_stmt = text(
            "UPDATE playback_sources "
            "SET thumbnail_url=:t, updated_at=CURRENT_TIMESTAMP "
            "WHERE id=:i"
        )

        with session_scope() as session:
            rows = session.execute(select_stmt.bindparams(n=batch)).all()

        updated = 0
        for pbid, page_url in rows:
            thumb = _fresh_thumb(page_url)
            if not thumb:
                # NOTE(review): rows whose page yields no thumbnail are left
                # untouched, so they stay at the head of the updated_at
                # ordering and will be retried on the next run — confirm this
                # is acceptable or mark them separately.
                continue
            # One short transaction per row: a slow page fetch never holds a
            # DB transaction open, and an abort keeps rows already refreshed.
            with session_scope() as session:
                session.execute(update_stmt.bindparams(t=thumb, i=pbid))
                session.commit()
            updated += 1
        log.info("[scheduler] sxyprn thumb refresh done: %d/%d", updated, len(rows))

    _run_with_timeout(_run, label="sxyprn-thumb-refresh")
|
||||||
|
|
||||||
|
|
||||||
def _job_bulk_dedup_performers() -> None:
|
def _job_bulk_dedup_performers() -> None:
|
||||||
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
|
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
|
||||||
resolver-time scoring nie złapał.
|
resolver-time scoring nie złapał.
|
||||||
|
|
@ -381,12 +423,30 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
|
||||||
)
|
)
|
||||||
log.info("scheduler: reap-stuck every %dh", reap_hours)
|
log.info("scheduler: reap-stuck every %dh", reap_hours)
|
||||||
|
|
||||||
|
# sxyprn thumbnail refresh — sxyprn miniaturki rotują (signed CDN, 404 po ~tygodniach).
|
||||||
|
# Domyślnie ZAWSZE on co 12h, batch najdawniej-aktualizowanych → cykl po katalogu w
|
||||||
|
# ~tydzień (mieści się w oknie wygaśnięcia). Bug 2026-06-10.
|
||||||
|
sxyprn_hours = cfg.get("sxyprn_thumb_refresh_hours", 12)
|
||||||
|
if sxyprn_hours:
|
||||||
|
batch = cfg.get("sxyprn_thumb_refresh_batch", 1200)
|
||||||
|
sched.add_job(
|
||||||
|
lambda: _job_refresh_sxyprn_thumbs(batch),
|
||||||
|
IntervalTrigger(hours=sxyprn_hours, start_date=INTERVAL_ANCHOR),
|
||||||
|
id="sxyprn_thumb_refresh",
|
||||||
|
replace_existing=True,
|
||||||
|
max_instances=1,
|
||||||
|
coalesce=True,
|
||||||
|
)
|
||||||
|
log.info("scheduler: sxyprn-thumb-refresh every %dh (batch=%d)", sxyprn_hours, batch)
|
||||||
|
|
||||||
return sched
|
return sched
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG: dict[str, Any] = {
|
DEFAULT_CONFIG: dict[str, Any] = {
|
||||||
"tpdb_hours": 6,
|
"tpdb_hours": 6,
|
||||||
"stashdb_hours": 6,
|
"stashdb_hours": 6,
|
||||||
|
"sxyprn_thumb_refresh_hours": 12,
|
||||||
|
"sxyprn_thumb_refresh_batch": 1200,
|
||||||
"performer_driven_hours": 12,
|
"performer_driven_hours": 12,
|
||||||
"performer_driven_top_n": 20,
|
"performer_driven_top_n": 20,
|
||||||
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100
|
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100
|
||||||
|
|
|
||||||
94
scripts/refresh_sxyprn_thumbs.py
Normal file
94
scripts/refresh_sxyprn_thumbs.py
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10).
|
||||||
|
|
||||||
|
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
||||||
|
~tygodniach → 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży
|
||||||
|
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp).
|
||||||
|
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to
|
||||||
|
pobranie per-post page.
|
||||||
|
|
||||||
|
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image
|
||||||
|
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent.
|
||||||
|
Thumbnaile dalej rotują → odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs).
|
||||||
|
|
||||||
|
Użycie (kontener worker):
|
||||||
|
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from app.db import session_scope
|
||||||
|
from app.extractors import browser_get
|
||||||
|
|
||||||
|
_OG_RE = re.compile(r"og:image[\"'][^>]*content=[\"']([^\"']+)", re.IGNORECASE)
|
||||||
|
_OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image", re.IGNORECASE)
|
||||||
|
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_thumb(raw: str) -> str | None:
    """Turn a raw HTML attribute value into a usable thumbnail URL, or None if it is not a CDN image."""
    # Only &amp; is decoded: it is the one entity expected inside a URL
    # attribute, and a blanket entity decode could corrupt query parameters
    # whose names happen to look like legacy entities.
    url = raw.strip().replace("&amp;", "&")
    if url.startswith("//"):
        # Protocol-relative URL -> force https.
        url = "https:" + url
    if "trafficdeposit.com" not in url and "sxyprn" not in url:
        return None
    return url


def _fresh_thumb(page_url: str) -> str | None:
    """Fetch a post page and return its fresh poster URL.

    The poster is looked up in `og:image` (either attribute order) and then in
    `<video poster=...>`. Candidates are tried in that order and the first one
    that points at the thumbnail CDN wins, so an unusable og:image value does
    not hide a valid video poster.

    Args:
        page_url: URL of the post page to fetch.

    Returns:
        The thumbnail URL, or None when the fetch fails, the page is dead
        ("Post Not Found"), the body is empty, or no usable poster is present.
    """
    try:
        html = browser_get(page_url, timeout=25).text
    except Exception:
        # Best-effort scrape: any fetch failure just means "no thumbnail".
        return None
    if not html or "Post Not Found" in html:
        return None
    for pattern in (_OG_RE, _OG_RE2, _POSTER_RE):
        match = pattern.search(html)
        if not match:
            continue
        thumb = _normalize_thumb(match.group(1))
        if thumb:
            return thumb
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: refresh thumbnails for live sxyprn playback sources.

    Usage:
        python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]

    Sources are processed in `id` order, so an interrupted run can be resumed
    with --offset. Progress is printed every 200 rows and a summary at the end.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Refresh expired sxyprn thumbnails from their live post pages."
    )
    parser.add_argument(
        "--limit", type=int, default=1_000_000, metavar="N",
        help="maximum number of sources to process (default: effectively all)",
    )
    parser.add_argument(
        "--offset", type=int, default=0, metavar="M",
        help="number of sources (ordered by id) to skip; used to resume a run",
    )
    args = parser.parse_args()
    if args.limit < 0 or args.offset < 0:
        parser.error("--limit and --offset must be non-negative")
    limit, offset = args.limit, args.offset

    with session_scope() as s:
        rows = s.execute(
            text(
                "SELECT id, page_url FROM playback_sources "
                "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
                "ORDER BY id OFFSET :off LIMIT :lim"
            ).bindparams(off=offset, lim=limit)
        ).all()

    print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)
    updated = 0
    nothumb = 0
    for idx, (pbid, page_url) in enumerate(rows):
        thumb = _fresh_thumb(page_url)
        if thumb:
            # One short transaction per row so an interrupted run keeps
            # everything refreshed so far.
            with session_scope() as s:
                s.execute(
                    text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
                        t=thumb, i=pbid
                    )
                )
                s.commit()
            updated += 1
        else:
            # Dead page, failed fetch, or no usable poster — _fresh_thumb does
            # not distinguish these cases.
            nothumb += 1
        if (idx + 1) % 200 == 0:
            print(f"  {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
        # Throttle requests to the site.
        time.sleep(0.25)
    print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the refresh only when executed as a script. The scheduler job imports
    # _fresh_thumb from this module, and that import must not start a run.
    main()
|
||||||
Loading…
Add table
Reference in a new issue