feat(sxyprn): mark dead posts during thumbnail refresh sweep
resolve_post() now distinguishes "Post Not Found" (mark dead_at — the link wouldn't play anyway) from a live page with no fresh poster (leave untouched), on top of the existing thumbnail refresh. Batched into refresh_batch() with refreshed/dead/untouched counters. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
956a0feb22
commit
a9f0f94321
1 changed files with 62 additions and 37 deletions
|
|
@ -1,14 +1,18 @@
|
|||
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10).
|
||||
"""Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
|
||||
|
||||
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
||||
~tygodniach → 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży
|
||||
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp).
|
||||
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to
|
||||
pobranie per-post page.
|
||||
~tygodniach → 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
|
||||
`og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
|
||||
NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
|
||||
|
||||
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image
|
||||
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent.
|
||||
Thumbnaile dalej rotują → odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs).
|
||||
Trzy wyniki per źródło:
|
||||
- 'thumb' → strona żyje + ma świeży poster → UPDATE thumbnail_url
|
||||
- 'dead' → "Post Not Found" (post skasowany na sxyprn) → mark dead_at (martwy link,
|
||||
nie tylko brak miniatury — i tak by nie zagrał)
|
||||
- 'none' → fetch fail / brak postera ale strona żyje → zostaw bez zmian
|
||||
|
||||
Wznawialne (--offset), idempotent. Thumbnaile rotują → odpalać periodycznie
|
||||
(_job_refresh_sxyprn_thumbs co 12h).
|
||||
|
||||
Użycie (kontener worker):
|
||||
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
||||
|
|
@ -29,24 +33,53 @@ _OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image",
|
|||
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
|
||||
|
||||
|
||||
def resolve_post(page_url: str) -> tuple[str, str | None]:
    """Fetch a post page and classify what can be done with its source row.

    Args:
        page_url: URL of the post page to fetch.

    Returns:
        ``("thumb", url)`` when a poster URL was extracted and it mentions an
        expected host; ``("dead", None)`` when the page text contains
        "Post Not Found"; ``("none", None)`` when the fetch raised, no poster
        pattern matched, or the extracted URL mentions neither expected host.
    """
    untouched = ("none", None)

    try:
        page = browser_get(page_url, timeout=25).text
    except Exception:
        # Best-effort sweep: any fetch failure leaves the row untouched.
        return untouched

    if "Post Not Found" in page:
        return ("dead", None)

    # Try the patterns in priority order; the first hit wins.
    for pattern in (_OG_RE, _OG_RE2, _POSTER_RE):
        found = pattern.search(page)
        if found:
            break
    else:
        return untouched

    poster = found.group(1).strip()
    if poster.startswith("//"):
        # Protocol-relative URL: pin it to https.
        poster = "https:" + poster

    if "trafficdeposit.com" in poster or "sxyprn" in poster:
        return ("thumb", poster)
    return untouched
|
||||
|
||||
|
||||
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Resolve every row's post page and persist the outcome.

    Args:
        rows: ``(pb_id, page_url)`` pairs.

    Returns:
        ``(refreshed, marked_dead, untouched)`` counts for this batch.
    """
    tally = {"thumb": 0, "dead": 0, "none": 0}

    for source_id, post_url in rows:
        outcome, fresh_url = resolve_post(post_url)

        if outcome == "thumb":
            stmt = text(
                "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
            ).bindparams(t=fresh_url, i=source_id)
        elif outcome == "dead":
            stmt = text(
                "UPDATE playback_sources SET dead_at=now(), "
                "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
            ).bindparams(i=source_id)
        else:
            # Any other status leaves the row as it is.
            stmt = None
            outcome = "none"

        if stmt is not None:
            # One session per row, committed immediately.
            with session_scope() as db:
                db.execute(stmt)
                db.commit()

        tally[outcome] += 1
        # Pause after every row (presumably to throttle page fetches).
        time.sleep(0.25)

    return tally["thumb"], tally["dead"], tally["none"]
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -67,27 +100,19 @@ def main() -> None:
|
|||
).bindparams(off=offset, lim=limit)
|
||||
).all()
|
||||
|
||||
print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)
|
||||
updated = 0
|
||||
dead = 0
|
||||
nothumb = 0
|
||||
for idx, (pbid, page_url) in enumerate(rows):
|
||||
thumb = _fresh_thumb(page_url)
|
||||
if thumb:
|
||||
with session_scope() as s:
|
||||
s.execute(
|
||||
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
|
||||
t=thumb, i=pbid
|
||||
)
|
||||
)
|
||||
s.commit()
|
||||
updated += 1
|
||||
else:
|
||||
nothumb += 1
|
||||
if (idx + 1) % 200 == 0:
|
||||
print(f" {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
|
||||
time.sleep(0.25)
|
||||
print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
|
||||
print(f"sxyprn sources: {len(rows)} (offset={offset})", flush=True)
|
||||
refreshed = marked_dead = untouched = 0
|
||||
CHUNK = 200
|
||||
for i in range(0, len(rows), CHUNK):
|
||||
r, d, u = refresh_batch(rows[i : i + CHUNK])
|
||||
refreshed += r
|
||||
marked_dead += d
|
||||
untouched += u
|
||||
print(
|
||||
f" {min(i+CHUNK, len(rows))}/{len(rows)} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
|
||||
flush=True,
|
||||
)
|
||||
print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue