feat(sxyprn): mark dead posts during thumbnail refresh sweep

resolve_post() now distinguishes "Post Not Found" (mark dead_at — the
link wouldn't play anyway) from a live page with no fresh poster (leave
untouched), on top of the existing thumbnail refresh. Batched into
refresh_batch() with refreshed/dead/untouched counters.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-10 19:20:28 +02:00
parent 956a0feb22
commit a9f0f94321

View file

@ -1,14 +1,18 @@
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10).
"""Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
sxyprn/trafficdeposit thumbnaile podpisane czasowo i ROTUJĄ (token wygasa po
~tygodniach 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp).
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to
pobranie per-post page.
~tygodniach 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
`og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent.
Thumbnaile dalej rotują odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs).
Trzy wyniki per źródło:
- 'thumb': strona żyje + ma świeży poster → UPDATE thumbnail_url
- 'dead': "Post Not Found" (post skasowany na sxyprn) → mark dead_at (martwy link,
  nie tylko brak miniatury — i tak by nie zagrał)
- 'none': fetch fail / brak postera, ale strona żyje → zostaw bez zmian
Wznawialne (--offset), idempotent. Thumbnaile rotują → odpalać periodycznie
(_job_refresh_sxyprn_thumbs co 12h).
Użycie (kontener worker):
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
@ -29,24 +33,53 @@ _OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image",
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
def _fresh_thumb(page_url: str) -> str | None:
"""Pobierz post page → świeży poster (og:image / video poster). None gdy strona
martwa (Post Not Found) albo brak postera."""
def resolve_post(page_url: str) -> tuple[str, str | None]:
"""Zwraca ('thumb', url) | ('dead', None) | ('none', None)."""
try:
html = browser_get(page_url, timeout=25).text
except Exception:
return None
return ("none", None)
if "Post Not Found" in html:
return None
return ("dead", None)
m = _OG_RE.search(html) or _OG_RE2.search(html) or _POSTER_RE.search(html)
if not m:
return None
return ("none", None)
u = m.group(1).strip()
if u.startswith("//"):
u = "https:" + u
if "trafficdeposit.com" not in u and "sxyprn" not in u:
return None
return u
return ("none", None)
return ("thumb", u)
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Resolve every source in ``rows`` and persist the outcome per row.

    Args:
        rows: ``(pb_id, page_url)`` pairs; ``pb_id`` is matched against
            ``playback_sources.id`` in the UPDATE statements.

    Returns:
        ``(refreshed, marked_dead, untouched)`` counters for this batch.
    """
    tally = {"thumb": 0, "dead": 0, "none": 0}

    for source_id, post_url in rows:
        outcome, fresh_url = resolve_post(post_url)

        # Pick the statement and its bind parameters for this outcome.
        if outcome == "thumb":
            sql = "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
            params = {"t": fresh_url, "i": source_id}
        elif outcome == "dead":
            sql = (
                "UPDATE playback_sources SET dead_at=now(), "
                "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
            )
            params = {"i": source_id}
        else:
            # Any other status counts as untouched; no write is issued.
            outcome = "none"
            sql = None
            params = {}

        if sql is not None:
            # One session (and one commit) per written row.
            with session_scope() as db:
                db.execute(text(sql).bindparams(**params))
                db.commit()

        tally[outcome] += 1
        # Pause after every row, whatever the outcome.
        time.sleep(0.25)

    return tally["thumb"], tally["dead"], tally["none"]
def main() -> None:
@ -67,27 +100,19 @@ def main() -> None:
).bindparams(off=offset, lim=limit)
).all()
print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)
updated = 0
dead = 0
nothumb = 0
for idx, (pbid, page_url) in enumerate(rows):
thumb = _fresh_thumb(page_url)
if thumb:
with session_scope() as s:
s.execute(
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
t=thumb, i=pbid
)
)
s.commit()
updated += 1
else:
nothumb += 1
if (idx + 1) % 200 == 0:
print(f" {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
time.sleep(0.25)
print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
print(f"sxyprn sources: {len(rows)} (offset={offset})", flush=True)
refreshed = marked_dead = untouched = 0
CHUNK = 200
for i in range(0, len(rows), CHUNK):
r, d, u = refresh_batch(rows[i : i + CHUNK])
refreshed += r
marked_dead += d
untouched += u
print(
f" {min(i+CHUNK, len(rows))}/{len(rows)} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
flush=True,
)
print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)
if __name__ == "__main__":