resolve_post() now distinguishes "Post Not Found" (mark dead_at — the link wouldn't play anyway) from a live page with no fresh poster (leave untouched), on top of the existing thumbnail refresh. Batched into refresh_batch() with refreshed/dead/untouched counters. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
119 lines
4.3 KiB
Python
119 lines
4.3 KiB
Python
"""Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
|
|
|
|
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
|
~tygodniach → 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
|
|
`og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
|
|
NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
|
|
|
|
Trzy wyniki per źródło:
|
|
- 'thumb' → strona żyje + ma świeży poster → UPDATE thumbnail_url
|
|
- 'dead' → "Post Not Found" (post skasowany na sxyprn) → mark dead_at (martwy link,
|
|
nie tylko brak miniatury — i tak by nie zagrał)
|
|
- 'none' → fetch fail / brak postera ale strona żyje → zostaw bez zmian
|
|
|
|
Wznawialne (--offset), idempotent. Thumbnaile rotują → odpalać periodycznie
|
|
(_job_refresh_sxyprn_thumbs co 12h).
|
|
|
|
Użycie (kontener worker):
|
|
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
from sqlalchemy import text
|
|
|
|
from app.db import session_scope
|
|
from app.extractors import browser_get
|
|
|
|
# Patterns that pull the poster URL out of a post page. resolve_post() tries
# them in the order listed and keeps the first hit:
#   _OG_RE     - "og:image" followed by content="..." inside the same tag
#   _OG_RE2    - content="..." followed by property="og:image" (attributes swapped)
#   _POSTER_RE - the poster="..." attribute of a <video> tag
_OG_RE = re.compile(r"og:image[\"'][^>]*content=[\"']([^\"']+)", re.IGNORECASE)
_OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image", re.IGNORECASE)
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)


def resolve_post(page_url: str) -> tuple[str, str | None]:
    """Classify a post page and extract its poster URL when there is one.

    Args:
        page_url: address of the post page; passed to ``browser_get``.

    Returns:
        ``("thumb", url)`` when a poster URL was found and it mentions
        ``trafficdeposit.com`` or ``sxyprn``;
        ``("dead", None)`` when the page text contains ``Post Not Found``;
        ``("none", None)`` otherwise (the fetch raised, no pattern matched,
        or the extracted URL points somewhere else).
    """
    miss = ("none", None)

    # Any fetch problem is deliberately treated as "nothing to do" so that a
    # flaky request never changes stored data.
    try:
        markup = browser_get(page_url, timeout=25).text
    except Exception:
        return miss

    if "Post Not Found" in markup:
        return ("dead", None)

    # First matching pattern wins; later ones are not evaluated.
    found = None
    for pattern in (_OG_RE, _OG_RE2, _POSTER_RE):
        found = pattern.search(markup)
        if found:
            break
    if found is None:
        return miss

    poster = found.group(1).strip()
    if poster.startswith("//"):
        # Scheme-relative URL: pin it to https.
        poster = "https:" + poster

    # Only accept posters whose URL mentions one of the expected hosts.
    if any(marker in poster for marker in ("trafficdeposit.com", "sxyprn")):
        return ("thumb", poster)
    return miss
|
|
|
|
|
|
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Resolve each row's post page and persist the outcome.

    Args:
        rows: ``(pb_id, page_url)`` pairs.

    Returns:
        ``(refreshed, marked_dead, untouched)`` counts for this batch.
    """
    thumb_sql = "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
    dead_sql = (
        "UPDATE playback_sources SET dead_at=now(), "
        "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
    )

    tally = {"thumb": 0, "dead": 0, "none": 0}
    for source_id, url in rows:
        outcome, poster = resolve_post(url)

        # Pick the write (if any) that matches the outcome.
        if outcome == "thumb":
            statement = text(thumb_sql).bindparams(t=poster, i=source_id)
        elif outcome == "dead":
            statement = text(dead_sql).bindparams(i=source_id)
        else:
            statement = None
            outcome = "none"  # anything unrecognised counts as untouched

        if statement is not None:
            # One short session per row, committed before the row is counted.
            with session_scope() as db:
                db.execute(statement)
                db.commit()

        tally[outcome] += 1
        # Brief pause after every row before the next page is fetched.
        time.sleep(0.25)

    return tally["thumb"], tally["dead"], tally["none"]
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: select live sxyprn sources and refresh them.

    Options (read from ``sys.argv``):
        --limit N   maximum number of rows to select (default 1_000_000)
        --offset M  number of rows to skip in ``id`` order (default 0)

    Arguments other than these two are ignored. A missing, non-integer or
    negative value aborts with a usage message instead of being silently
    dropped or surfacing later as a traceback / SQL error.

    NOTE(review): the query filters on ``dead_at IS NULL`` and pages with
    OFFSET, while the sweep itself sets ``dead_at``; rows marked dead by an
    earlier partial run therefore shift the positions a later ``--offset``
    refers to. Confirm this is acceptable for resumed runs.
    """
    import argparse  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(
        description="Refresh sxyprn thumbnails and mark deleted posts as dead.",
        allow_abbrev=False,  # only the exact flag names are recognised
    )
    parser.add_argument("--limit", type=int, default=1_000_000, help="max rows to process")
    parser.add_argument("--offset", type=int, default=0, help="rows to skip (ORDER BY id)")
    # parse_known_args keeps the previous tolerance for unrelated arguments.
    opts, _ignored = parser.parse_known_args(sys.argv[1:])
    if opts.limit < 0 or opts.offset < 0:
        parser.error("--limit and --offset must be non-negative")
    limit, offset = opts.limit, opts.offset

    with session_scope() as s:
        rows = s.execute(
            text(
                "SELECT id, page_url FROM playback_sources "
                "WHERE origin='tube:sxyprncom' AND dead_at IS NULL "
                "ORDER BY id OFFSET :off LIMIT :lim"
            ).bindparams(off=offset, lim=limit)
        ).all()

    total = len(rows)
    print(f"sxyprn sources: {total} (offset={offset})", flush=True)

    refreshed = marked_dead = untouched = 0
    chunk_size = 200  # rows per refresh_batch() call / progress line
    for start in range(0, total, chunk_size):
        r, d, u = refresh_batch(rows[start : start + chunk_size])
        refreshed += r
        marked_dead += d
        untouched += u
        done = min(start + chunk_size, total)
        print(
            f" {done}/{total} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
            flush=True,
        )
    print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)


if __name__ == "__main__":
    main()
|