goon/scripts/refresh_sxyprn_thumbs.py
jtrzupek a9f0f94321 feat(sxyprn): mark dead posts during thumbnail refresh sweep
resolve_post() now distinguishes "Post Not Found" (mark dead_at — the
link wouldn't play anyway) from a live page with no fresh poster (leave
untouched), on top of the existing thumbnail refresh. Batched into
refresh_batch() with refreshed/dead/untouched counters.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 19:20:28 +02:00

119 lines
4.3 KiB
Python

"""Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
~tygodniach → 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
`og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
Trzy wyniki per źródło:
- 'thumb' → strona żyje + ma świeży poster → UPDATE thumbnail_url
- 'dead' → "Post Not Found" (post skasowany na sxyprn) → mark dead_at (martwy link,
nie tylko brak miniatury — i tak by nie zagrał)
- 'none' → fetch fail / brak postera ale strona żyje → zostaw bez zmian
Wznawialne (--offset), idempotent. Thumbnaile rotują → odpalać periodycznie
(_job_refresh_sxyprn_thumbs co 12h).
Użycie (kontener worker):
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
"""
from __future__ import annotations
import re
import sys
import time
from sqlalchemy import text
from app.db import session_scope
from app.extractors import browser_get
# Patterns used to pull a poster URL out of a post page's HTML. Each one captures
# the URL in group 1 and matches case-insensitively.
# og:image marker followed, inside the same tag, by the content attribute.
_OG_RE = re.compile(r"og:image[\"'][^>]*content=[\"']([^\"']+)", re.IGNORECASE)
# Same tag with the attributes reversed: content first, then property="og:image".
_OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image", re.IGNORECASE)
# poster attribute of a <video> tag.
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
def resolve_post(page_url: str) -> tuple[str, str | None]:
"""Zwraca ('thumb', url) | ('dead', None) | ('none', None)."""
try:
html = browser_get(page_url, timeout=25).text
except Exception:
return ("none", None)
if "Post Not Found" in html:
return ("dead", None)
m = _OG_RE.search(html) or _OG_RE2.search(html) or _POSTER_RE.search(html)
if not m:
return ("none", None)
u = m.group(1).strip()
if u.startswith("//"):
u = "https:" + u
if "trafficdeposit.com" not in u and "sxyprn" not in u:
return ("none", None)
return ("thumb", u)
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Resolve every row of a batch and persist the outcome.

    Args:
        rows: ``(pb_id, page_url)`` pairs; one page fetch is made per pair.

    Returns:
        ``(refreshed, marked_dead, untouched)`` counts for this batch.
    """
    tally = {"thumb": 0, "dead": 0, "none": 0}
    for source_id, url in rows:
        outcome, poster = resolve_post(url)

        # Pick the UPDATE that matches the outcome; no statement means no write.
        if outcome == "thumb":
            statement = text(
                "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
            ).bindparams(t=poster, i=source_id)
        elif outcome == "dead":
            statement = text(
                "UPDATE playback_sources SET dead_at=now(), "
                "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
            ).bindparams(i=source_id)
        else:
            # Anything that is neither 'thumb' nor 'dead' counts as untouched.
            outcome, statement = "none", None

        if statement is not None:
            # One session per written row.
            with session_scope() as session:
                session.execute(statement)
                session.commit()

        tally[outcome] += 1
        # Brief pause after each row before the next fetch.
        time.sleep(0.25)
    return tally["thumb"], tally["dead"], tally["none"]
def main() -> None:
    """Command-line entry point: sweep live sxyprn sources and print progress.

    Options:
        --limit N   maximum number of sources to load (default 1_000_000)
        --offset M  number of sources to skip, ordered by id (default 0)

    Arguments are parsed strictly: an unknown flag, a missing value, or a
    non-numeric or negative number stops the script with a usage error instead
    of silently starting a full sweep.

    NOTE(review): the query filters on ``dead_at IS NULL``, so rows marked dead
    by an earlier partial run no longer count toward OFFSET. Resuming with the
    previous offset may therefore skip some live rows. Confirm this is acceptable.
    """
    # Local import: only the CLI entry point needs argparse.
    import argparse

    def non_negative_int(raw: str) -> int:
        """Parse *raw* as an int and reject negative values."""
        value = int(raw)
        if value < 0:
            raise argparse.ArgumentTypeError(f"must be >= 0, got {raw!r}")
        return value

    parser = argparse.ArgumentParser(
        description="Refresh sxyprn thumbnails from live post pages and mark dead posts.",
        allow_abbrev=False,
    )
    parser.add_argument(
        "--limit",
        type=non_negative_int,
        default=1_000_000,
        metavar="N",
        help="maximum number of sources to process (default: %(default)s)",
    )
    parser.add_argument(
        "--offset",
        type=non_negative_int,
        default=0,
        metavar="M",
        help="number of sources to skip, ordered by id (default: %(default)s)",
    )
    args = parser.parse_args()
    limit, offset = args.limit, args.offset

    # Load the whole work list up front; the per-row writes happen in refresh_batch.
    with session_scope() as s:
        rows = s.execute(
            text(
                "SELECT id, page_url FROM playback_sources "
                "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
                "ORDER BY id OFFSET :off LIMIT :lim"
            ).bindparams(off=offset, lim=limit)
        ).all()

    total = len(rows)
    print(f"sxyprn sources: {total} (offset={offset})", flush=True)

    refreshed = marked_dead = untouched = 0
    chunk_size = 200
    for start in range(0, total, chunk_size):
        r, d, u = refresh_batch(rows[start : start + chunk_size])
        refreshed += r
        marked_dead += d
        untouched += u
        # Running totals after each chunk.
        done = min(start + chunk_size, total)
        print(
            f" {done}/{total} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
            flush=True,
        )
    print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)


if __name__ == "__main__":
    main()