feat(sxyprn): mark dead posts during thumbnail refresh sweep

resolve_post() now distinguishes a "Post Not Found" page (mark dead_at —
the link wouldn't play anyway) from a live page with no fresh poster or a
failed fetch (leave the row untouched), on top of the existing thumbnail
refresh. Per-row processing is batched into refresh_batch(), which returns
refreshed/dead/untouched counters.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-10 19:20:28 +02:00
parent 956a0feb22
commit a9f0f94321

View file

@ -1,14 +1,18 @@
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10). """Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
sxyprn/trafficdeposit thumbnaile podpisane czasowo i ROTUJĄ (token wygasa po sxyprn/trafficdeposit thumbnaile podpisane czasowo i ROTUJĄ (token wygasa po
~tygodniach 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży ~tygodniach 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp). `og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
pobranie per-post page.
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image Trzy wyniki per źródło:
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent. - 'thumb' strona żyje + ma świeży poster UPDATE thumbnail_url
Thumbnaile dalej rotują odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs). - 'dead' "Post Not Found" (post skasowany na sxyprn) mark dead_at (martwy link,
nie tylko brak miniatury i tak by nie zagrał)
- 'none' fetch fail / brak postera ale strona żyje zostaw bez zmian
Wznawialne (--offset), idempotent. Thumbnaile rotują odpalać periodycznie
(_job_refresh_sxyprn_thumbs co 12h).
Użycie (kontener worker): Użycie (kontener worker):
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M] python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
@ -29,24 +33,53 @@ _OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image",
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE) _POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
def resolve_post(page_url: str) -> tuple[str, str | None]:
    """Fetch an sxyprn post page and classify what it yields.

    Returns one of:
      ("thumb", url) - a poster URL was extracted and its host is accepted;
      ("dead", None) - the page body contains the text "Post Not Found";
      ("none", None) - the fetch raised, no poster pattern matched, or the
                       poster URL mentions neither trafficdeposit.com nor
                       sxyprn.
    """
    nothing = ("none", None)

    try:
        body = browser_get(page_url, timeout=25).text
    except Exception:
        # Best effort: any fetch error is reported as "none", never raised.
        return nothing

    if "Post Not Found" in body:
        return ("dead", None)

    # Patterns are tried in priority order; the first one that matches wins.
    for pattern in (_OG_RE, _OG_RE2, _POSTER_RE):
        found = pattern.search(body)
        if found:
            break
    else:
        return nothing

    candidate = found.group(1).strip()
    if candidate.startswith("//"):
        # Protocol-relative URL: pin it to https.
        candidate = "https:" + candidate

    on_known_host = "trafficdeposit.com" in candidate or "sxyprn" in candidate
    return ("thumb", candidate) if on_known_host else nothing
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Resolve every post in *rows* and persist the outcome.

    rows: [(pb_id, page_url), ...].

    For each row, resolve_post() decides what happens:
      - "thumb": thumbnail_url of the playback source is overwritten;
      - "dead":  dead_at / dead_reason are set on the playback source;
      - anything else: the row is left unchanged.
    Each write runs in its own session and is committed immediately. The
    loop sleeps 0.25 s after every row (presumably to throttle requests).

    Returns (refreshed, marked_dead, untouched).
    """
    refreshed = marked_dead = untouched = 0

    for source_id, post_url in rows:
        outcome, poster_url = resolve_post(post_url)

        if outcome not in ("thumb", "dead"):
            untouched += 1
        else:
            if outcome == "thumb":
                stmt = text(
                    "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
                ).bindparams(t=poster_url, i=source_id)
            else:
                stmt = text(
                    "UPDATE playback_sources SET dead_at=now(), "
                    "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
                ).bindparams(i=source_id)

            with session_scope() as session:
                session.execute(stmt)
                session.commit()

            if outcome == "thumb":
                refreshed += 1
            else:
                marked_dead += 1

        time.sleep(0.25)

    return refreshed, marked_dead, untouched
def main() -> None: def main() -> None:
@ -67,27 +100,19 @@ def main() -> None:
).bindparams(off=offset, lim=limit) ).bindparams(off=offset, lim=limit)
).all() ).all()
print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True) print(f"sxyprn sources: {len(rows)} (offset={offset})", flush=True)
updated = 0 refreshed = marked_dead = untouched = 0
dead = 0 CHUNK = 200
nothumb = 0 for i in range(0, len(rows), CHUNK):
for idx, (pbid, page_url) in enumerate(rows): r, d, u = refresh_batch(rows[i : i + CHUNK])
thumb = _fresh_thumb(page_url) refreshed += r
if thumb: marked_dead += d
with session_scope() as s: untouched += u
s.execute( print(
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams( f" {min(i+CHUNK, len(rows))}/{len(rows)} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
t=thumb, i=pbid flush=True,
) )
) print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)
s.commit()
updated += 1
else:
nothumb += 1
if (idx + 1) % 200 == 0:
print(f" {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
time.sleep(0.25)
print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
if __name__ == "__main__": if __name__ == "__main__":