feat(sxyprn): refresh rotting thumbnails from live post pages + scheduled job
CORRECTION to earlier "unrecoverable" call: the /post/<id> page is alive (200) and DOES expose the scene's own fresh-signed poster via og:image / <video poster> (post-id embedded, current timestamp) — only the STORED thumbnail URL had rotted. Search/listings don't re-surface old posts (0 overlap), but per-post fetch works. scripts/refresh_sxyprn_thumbs.py: iterate live sxyprn sources, fetch post page, extract fresh og:image, UPDATE thumbnail_url (verified: refreshed URLs return 200). _job_refresh_sxyprn_thumbs: every 12h refresh the 1200 least-recently-updated sources (cycles the ~19k catalog within the expiry window). Pairs with the scene_resolver overwrite fix so refreshed thumbnails stick. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
bb9e1afc31
commit
fef28ae56b
2 changed files with 154 additions and 0 deletions
|
|
@ -193,6 +193,48 @@ def _job_refresh_taxonomy_counts() -> None:
|
|||
log.exception("[scheduler] taxonomy counts refresh failed")
|
||||
|
||||
|
||||
def _job_refresh_sxyprn_thumbs(batch: int = 1200) -> None:
|
||||
"""Odświeża wygasłe sxyprn miniaturki z żywych stron /post/<id> (bug 2026-06-10).
|
||||
|
||||
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i rotują (token wygasa po
|
||||
~tygodniach → 404), ale strona post żyje i ma świeży poster (`og:image`). Search/
|
||||
listingi NIE re-surfaceują starych postów, więc jedyna droga to per-post page fetch.
|
||||
Bierzemy `batch` najdawniej-aktualizowanych źródeł — cykl po całym katalogu co kilka
|
||||
dni (mieści się w oknie wygaśnięcia). Patrz scripts/refresh_sxyprn_thumbs.py.
|
||||
"""
|
||||
log.info("[scheduler] sxyprn thumb refresh starting (batch=%d)", batch)
|
||||
|
||||
def _run() -> None:
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.db import session_scope
|
||||
from scripts.refresh_sxyprn_thumbs import _fresh_thumb
|
||||
|
||||
with session_scope() as session:
|
||||
rows = session.execute(
|
||||
text(
|
||||
"SELECT id, page_url FROM playback_sources "
|
||||
"WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
|
||||
"ORDER BY updated_at ASC LIMIT :n"
|
||||
).bindparams(n=batch)
|
||||
).all()
|
||||
updated = 0
|
||||
for pbid, page_url in rows:
|
||||
thumb = _fresh_thumb(page_url)
|
||||
if thumb:
|
||||
with session_scope() as session:
|
||||
session.execute(
|
||||
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
|
||||
t=thumb, i=pbid
|
||||
)
|
||||
)
|
||||
session.commit()
|
||||
updated += 1
|
||||
log.info("[scheduler] sxyprn thumb refresh done: %d/%d", updated, len(rows))
|
||||
|
||||
_run_with_timeout(_run, label="sxyprn-thumb-refresh")
|
||||
|
||||
|
||||
def _job_bulk_dedup_performers() -> None:
|
||||
"""Pair-wise dedup po performer overlap — safety net dla duplikatów które
|
||||
resolver-time scoring nie złapał.
|
||||
|
|
@ -381,12 +423,30 @@ def build_scheduler(cfg: dict[str, Any]) -> BlockingScheduler:
|
|||
)
|
||||
log.info("scheduler: reap-stuck every %dh", reap_hours)
|
||||
|
||||
# sxyprn thumbnail refresh — sxyprn miniaturki rotują (signed CDN, 404 po ~tygodniach).
|
||||
# Domyślnie ZAWSZE on co 12h, batch najdawniej-aktualizowanych → cykl po katalogu w
|
||||
# ~tydzień (mieści się w oknie wygaśnięcia). Bug 2026-06-10.
|
||||
sxyprn_hours = cfg.get("sxyprn_thumb_refresh_hours", 12)
|
||||
if sxyprn_hours:
|
||||
batch = cfg.get("sxyprn_thumb_refresh_batch", 1200)
|
||||
sched.add_job(
|
||||
lambda: _job_refresh_sxyprn_thumbs(batch),
|
||||
IntervalTrigger(hours=sxyprn_hours, start_date=INTERVAL_ANCHOR),
|
||||
id="sxyprn_thumb_refresh",
|
||||
replace_existing=True,
|
||||
max_instances=1,
|
||||
coalesce=True,
|
||||
)
|
||||
log.info("scheduler: sxyprn-thumb-refresh every %dh (batch=%d)", sxyprn_hours, batch)
|
||||
|
||||
return sched
|
||||
|
||||
|
||||
DEFAULT_CONFIG: dict[str, Any] = {
|
||||
"tpdb_hours": 6,
|
||||
"stashdb_hours": 6,
|
||||
"sxyprn_thumb_refresh_hours": 12,
|
||||
"sxyprn_thumb_refresh_batch": 1200,
|
||||
"performer_driven_hours": 12,
|
||||
"performer_driven_top_n": 20,
|
||||
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100
|
||||
|
|
|
|||
94
scripts/refresh_sxyprn_thumbs.py
Normal file
94
scripts/refresh_sxyprn_thumbs.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10).
|
||||
|
||||
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
||||
~tygodniach → 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży
|
||||
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp).
|
||||
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to
|
||||
pobranie per-post page.
|
||||
|
||||
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image
|
||||
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent.
|
||||
Thumbnaile dalej rotują → odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs).
|
||||
|
||||
Użycie (kontener worker):
|
||||
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.db import session_scope
|
||||
from app.extractors import browser_get
|
||||
|
||||
_OG_RE = re.compile(r"og:image[\"'][^>]*content=[\"']([^\"']+)", re.IGNORECASE)
|
||||
_OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image", re.IGNORECASE)
|
||||
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
|
||||
|
||||
|
||||
def _fresh_thumb(page_url: str) -> str | None:
|
||||
"""Pobierz post page → świeży poster (og:image / video poster). None gdy strona
|
||||
martwa (Post Not Found) albo brak postera."""
|
||||
try:
|
||||
html = browser_get(page_url, timeout=25).text
|
||||
except Exception:
|
||||
return None
|
||||
if "Post Not Found" in html:
|
||||
return None
|
||||
m = _OG_RE.search(html) or _OG_RE2.search(html) or _POSTER_RE.search(html)
|
||||
if not m:
|
||||
return None
|
||||
u = m.group(1).strip()
|
||||
if u.startswith("//"):
|
||||
u = "https:" + u
|
||||
if "trafficdeposit.com" not in u and "sxyprn" not in u:
|
||||
return None
|
||||
return u
|
||||
|
||||
|
||||
def main() -> None:
|
||||
limit = 1_000_000
|
||||
offset = 0
|
||||
for i, a in enumerate(sys.argv):
|
||||
if a == "--limit" and i + 1 < len(sys.argv):
|
||||
limit = int(sys.argv[i + 1])
|
||||
if a == "--offset" and i + 1 < len(sys.argv):
|
||||
offset = int(sys.argv[i + 1])
|
||||
|
||||
with session_scope() as s:
|
||||
rows = s.execute(
|
||||
text(
|
||||
"SELECT id, page_url FROM playback_sources "
|
||||
"WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
|
||||
"ORDER BY id OFFSET :off LIMIT :lim"
|
||||
).bindparams(off=offset, lim=limit)
|
||||
).all()
|
||||
|
||||
print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)
|
||||
updated = 0
|
||||
dead = 0
|
||||
nothumb = 0
|
||||
for idx, (pbid, page_url) in enumerate(rows):
|
||||
thumb = _fresh_thumb(page_url)
|
||||
if thumb:
|
||||
with session_scope() as s:
|
||||
s.execute(
|
||||
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
|
||||
t=thumb, i=pbid
|
||||
)
|
||||
)
|
||||
s.commit()
|
||||
updated += 1
|
||||
else:
|
||||
nothumb += 1
|
||||
if (idx + 1) % 200 == 0:
|
||||
print(f" {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
|
||||
time.sleep(0.25)
|
||||
print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Reference in a new issue