feat(sxyprn): mark dead posts during thumbnail refresh sweep
resolve_post() now distinguishes "Post Not Found" (set dead_at — the link would not play anyway) from a live page with no fresh poster (left untouched), in addition to the existing thumbnail refresh. The per-row work is batched into refresh_batch(), which returns refreshed/dead/untouched counters.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
956a0feb22
commit
a9f0f94321
1 changed file with 62 additions and 37 deletions
|
|
@ -1,14 +1,18 @@
|
||||||
"""Odśwież wygasłe miniaturki sxyprn z ich (żywych) stron post (bug 2026-06-10).
|
"""Odśwież wygasłe miniaturki sxyprn z (żywych) stron post + oznacz martwe (bug 2026-06-10).
|
||||||
|
|
||||||
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
sxyprn/trafficdeposit thumbnaile są podpisane czasowo i ROTUJĄ (token wygasa po
|
||||||
~tygodniach → 404), ALE sama strona /post/<id>.html ŻYJE (200) i zawiera świeży
|
~tygodniach → 404), ALE żywa strona /post/<id>.html zawiera świeży poster sceny w
|
||||||
poster sceny w `og:image` / `<video poster=>` (post-id w ścieżce, current timestamp).
|
`og:image` / `<video poster>` (post-id w ścieżce, current timestamp). Search/listingi
|
||||||
Search/listingi NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to
|
NIE re-surfaceują starych postów (0 overlap), więc jedyna droga to per-post page fetch.
|
||||||
pobranie per-post page.
|
|
||||||
|
|
||||||
Strategia: iteruj żywe sxyprn playback_sources, fetch post page, wyłuskaj og:image
|
Trzy wyniki per źródło:
|
||||||
(fresh signed thumbnail), UPDATE thumbnail_url. Wznawialne (--offset), idempotent.
|
- 'thumb' → strona żyje + ma świeży poster → UPDATE thumbnail_url
|
||||||
Thumbnaile dalej rotują → odpalać periodycznie (scheduled job _job_refresh_sxyprn_thumbs).
|
- 'dead' → "Post Not Found" (post skasowany na sxyprn) → mark dead_at (martwy link,
|
||||||
|
nie tylko brak miniatury — i tak by nie zagrał)
|
||||||
|
- 'none' → fetch fail / brak postera ale strona żyje → zostaw bez zmian
|
||||||
|
|
||||||
|
Wznawialne (--offset), idempotent. Thumbnaile rotują → odpalać periodycznie
|
||||||
|
(_job_refresh_sxyprn_thumbs co 12h).
|
||||||
|
|
||||||
Użycie (kontener worker):
|
Użycie (kontener worker):
|
||||||
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
python scripts/refresh_sxyprn_thumbs.py [--limit N] [--offset M]
|
||||||
|
|
@ -29,24 +33,53 @@ _OG_RE2 = re.compile(r"content=[\"']([^\"']+)[\"'][^>]*property=[\"']og:image",
|
||||||
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
|
_POSTER_RE = re.compile(r"<video[^>]*poster=[\"']([^\"']+)", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def _fresh_thumb(page_url: str) -> str | None:
|
def resolve_post(page_url: str) -> tuple[str, str | None]:
|
||||||
"""Pobierz post page → świeży poster (og:image / video poster). None gdy strona
|
"""Zwraca ('thumb', url) | ('dead', None) | ('none', None)."""
|
||||||
martwa (Post Not Found) albo brak postera."""
|
|
||||||
try:
|
try:
|
||||||
html = browser_get(page_url, timeout=25).text
|
html = browser_get(page_url, timeout=25).text
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return ("none", None)
|
||||||
if "Post Not Found" in html:
|
if "Post Not Found" in html:
|
||||||
return None
|
return ("dead", None)
|
||||||
m = _OG_RE.search(html) or _OG_RE2.search(html) or _POSTER_RE.search(html)
|
m = _OG_RE.search(html) or _OG_RE2.search(html) or _POSTER_RE.search(html)
|
||||||
if not m:
|
if not m:
|
||||||
return None
|
return ("none", None)
|
||||||
u = m.group(1).strip()
|
u = m.group(1).strip()
|
||||||
if u.startswith("//"):
|
if u.startswith("//"):
|
||||||
u = "https:" + u
|
u = "https:" + u
|
||||||
if "trafficdeposit.com" not in u and "sxyprn" not in u:
|
if "trafficdeposit.com" not in u and "sxyprn" not in u:
|
||||||
return None
|
return ("none", None)
|
||||||
return u
|
return ("thumb", u)
|
||||||
|
|
||||||
|
|
||||||
|
def refresh_batch(rows: list[tuple]) -> tuple[int, int, int]:
    """Process one batch of sources: refresh thumbnails and flag deleted posts.

    Each row is resolved with ``resolve_post`` and handled by its status:

    * ``"thumb"`` - store the returned URL in ``thumbnail_url``.
    * ``"dead"``  - set ``dead_at`` / ``dead_reason`` on the source.
    * anything else - leave the row unchanged.

    Args:
        rows: ``(pb_id, page_url)`` pairs, one per playback source.

    Returns:
        ``(refreshed, marked_dead, untouched)`` counts for this batch.
    """
    tally = {"thumb": 0, "dead": 0, "none": 0}

    for source_id, post_url in rows:
        outcome, fresh_url = resolve_post(post_url)

        # Pick the statement (if any) for this outcome; execution is shared below.
        if outcome == "thumb":
            stmt = text(
                "UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i"
            ).bindparams(t=fresh_url, i=source_id)
        elif outcome == "dead":
            stmt = text(
                "UPDATE playback_sources SET dead_at=now(), "
                "dead_reason='sxyprn Post Not Found (refresh sweep)' WHERE id=:i"
            ).bindparams(i=source_id)
        else:
            # Any status other than thumb/dead counts as untouched.
            stmt = None
            outcome = "none"

        if stmt is not None:
            # One short-lived session per row, committed immediately.
            with session_scope() as s:
                s.execute(stmt)
                s.commit()

        tally[outcome] += 1
        # Fixed pause after every row, whatever the outcome.
        time.sleep(0.25)

    return tally["thumb"], tally["dead"], tally["none"]
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
|
@ -67,27 +100,19 @@ def main() -> None:
|
||||||
).bindparams(off=offset, lim=limit)
|
).bindparams(off=offset, lim=limit)
|
||||||
).all()
|
).all()
|
||||||
|
|
||||||
print(f"sxyprn sources to refresh: {len(rows)} (offset={offset})", flush=True)
|
print(f"sxyprn sources: {len(rows)} (offset={offset})", flush=True)
|
||||||
updated = 0
|
refreshed = marked_dead = untouched = 0
|
||||||
dead = 0
|
CHUNK = 200
|
||||||
nothumb = 0
|
for i in range(0, len(rows), CHUNK):
|
||||||
for idx, (pbid, page_url) in enumerate(rows):
|
r, d, u = refresh_batch(rows[i : i + CHUNK])
|
||||||
thumb = _fresh_thumb(page_url)
|
refreshed += r
|
||||||
if thumb:
|
marked_dead += d
|
||||||
with session_scope() as s:
|
untouched += u
|
||||||
s.execute(
|
print(
|
||||||
text("UPDATE playback_sources SET thumbnail_url=:t WHERE id=:i").bindparams(
|
f" {min(i+CHUNK, len(rows))}/{len(rows)} refreshed={refreshed} dead={marked_dead} untouched={untouched}",
|
||||||
t=thumb, i=pbid
|
flush=True,
|
||||||
)
|
)
|
||||||
)
|
print(f"DONE refreshed={refreshed} dead={marked_dead} untouched={untouched}", flush=True)
|
||||||
s.commit()
|
|
||||||
updated += 1
|
|
||||||
else:
|
|
||||||
nothumb += 1
|
|
||||||
if (idx + 1) % 200 == 0:
|
|
||||||
print(f" {idx+1}/{len(rows)} updated={updated} no_thumb={nothumb}", flush=True)
|
|
||||||
time.sleep(0.25)
|
|
||||||
print(f"DONE refreshed={updated}/{len(rows)} no_thumb={nothumb}", flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue