User-facing bugs resolved (per bug_reports table 2026-05-25): - 40cd28aa (short-scene filter): mobile api.ts default min_duration_sec=60 hides 6519 sub-60s scenes across all list endpoints (Performer/Site/Tag/ Browse). Caller may override with explicit 0. - 5e89ef7e (porndoe needs cookies/play click): INJECTED_JS in PlayerScreen now auto-clicks player-poster overlay (player-poster-play, big-play-button, vjs-big-play-button, jw-icon-display, btn-big-play, mejs__overlay-button, play-button, btn-play, videoPlayButton). Triggered same interval as consent-dismiss + ad-iframe removal. - b1b5e1a2 (Mixdrop czarny ekran): re-enable mixdrop direct stream via VPS curl_cffi proxy (was: skip → WebView fallback → blank screen). Backend pipeline (mixdrop.py extract + stream_proxy._curl_cffi_stream with JA3 + auto-refetch on token expire) was already complete; just removed the skip in app/api/playback.py. Plus ongoing WIP (paradisehill multi-part extraction, stream_proxy refetch logic, gesture race fix for long-press 2x speed, anti-adblock INJECTED_JS defenses, scripts for freshporno backfill, new sources API).
134 lines
4.7 KiB
Python
134 lines
4.7 KiB
Python
"""One-shot: re-extract titles dla freshporno scen z pre-fix truncation bug.
|
|
|
|
Tło: `meta_content` regex sprzed 2026-05-20 obcinał title na pierwszym apostrofie
|
|
(`<meta content="She's So Insatiable" />` → `She`). Fix wszedł 2026-05-20,
|
|
ale scenes scrapped przed fixem mają broken titles w DB. Delta-ingest skipuje
|
|
je przez external_id match — bez backfill nigdy się nie naprawią.
|
|
|
|
Bug-report `2fbf1c73` 2026-05-23 (kontekstowo, brak BE scen): część
|
|
brakujących Brazzers Exxtra scen to faktycznie pre-fix victims które nie
|
|
zmergowały z canonical TPDB record bo title się nie zgadzał.
|
|
|
|
Heurystyka:
|
|
- origin = tube:freshpornoorg
|
|
- created_at < 2026-05-20 (pre-fix)
|
|
- title length < 15
|
|
- slug freshporno URL ma więcej tokenów niż title (sygnał obcięcia)
|
|
|
|
Idempotent: po update tylko jeśli nowy title różni się od bieżącego.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from datetime import UTC, datetime
|
|
|
|
import httpx
|
|
from sqlalchemy import select
|
|
|
|
from app.connectors.direct_scrapers._browse_base import meta_content
|
|
from app.db import session_scope
|
|
from app.models import Scene
|
|
from app.models.playback_source import PlaybackSource
|
|
from app.normalize.text import normalize, slugify
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
CUTOFF_DATE = datetime(2026, 5, 20, tzinfo=UTC)
|
|
TITLE_MAX_LEN = 15
|
|
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/140.0.0.0"
|
|
|
|
|
|
def _slug_token_count(url: str) -> int:
|
|
"""Liczy ile tokenów ma URL slug (np. `/videos/girls-night-gets-girth/` → 4)."""
|
|
m = re.search(r"/videos/([^/]+)/?", url)
|
|
if not m:
|
|
return 0
|
|
return sum(1 for tok in m.group(1).split("-") if tok and tok != "s")
|
|
|
|
|
|
def _is_truncation_candidate(title: str | None, page_url: str | None) -> bool:
    """Return True when a stored title looks cut off relative to its URL slug.

    A row qualifies when the title is shorter than TITLE_MAX_LEN characters and
    the page URL slug has strictly more tokens than the whitespace-split title.
    Rows without a title or without a page URL never qualify (a missing URL
    cannot be re-fetched and would make the slug regex raise TypeError).
    """
    if title is None or not page_url:
        return False
    if len(title) >= TITLE_MAX_LEN:
        return False
    # str.split() with no separator never yields empty tokens.
    return _slug_token_count(page_url) > len(title.split())


def _extract_title(page_html: str) -> str | None:
    """Extract the scene title from fetched page HTML.

    Tries the `og:title` meta tag first, then falls back to the text of an
    `<h1 itemprop="name">` element. Returns None when neither yields a
    non-empty title.
    """
    from html import unescape

    title = meta_content(page_html, property="og:title")
    if title:
        return title
    m = re.search(r"<h1[^>]*itemprop=\"name\"[^>]*>([^<]+)</h1>", page_html)
    if m is None:
        return None
    # The <h1> text is raw HTML, so entities such as &#039; must be decoded;
    # otherwise the stored title would contain the literal entity text.
    return unescape(m.group(1)).strip() or None


def main() -> int:
    """Re-fetch and repair truncated titles of pre-fix freshporno scenes.

    Steps:
      1. Select (scene id, title, page URL) for scenes with a
         `tube:freshpornoorg` playback source created before CUTOFF_DATE.
      2. Keep rows whose title looks truncated (see _is_truncation_candidate).
      3. Fetch each candidate page, extract the title, and update the scene's
         `title`, `title_normalized` and `slug` when the fetched title differs
         from the stored one and is not shorter than it.

    Per-scene failures are counted and (for the first five) logged; they do not
    abort the run.

    Returns:
        Always 0, used as the process exit code.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Only plain column values are selected, so the rows remain usable after
    # the read session is closed; no DB session is held open during the
    # HTTP fetches below.
    with session_scope() as session:
        rows = session.execute(
            select(Scene.id, Scene.title, PlaybackSource.page_url)
            .join(PlaybackSource, PlaybackSource.scene_id == Scene.id)
            .where(PlaybackSource.origin == "tube:freshpornoorg")
            .where(Scene.created_at < CUTOFF_DATE)
        ).all()
    log.info("pre-fix freshporno scenes: %d", len(rows))

    # Filter: short title + slug has more tokens than the title (truncation signal).
    candidates = [
        (scene_id, title, page_url)
        for scene_id, title, page_url in rows
        if _is_truncation_candidate(title, page_url)
    ]
    log.info("candidates with slug>>title heurystyka: %d", len(candidates))

    updated = 0
    skipped = 0
    errors = 0

    # The context manager closes the HTTP client even if the loop is
    # interrupted (e.g. KeyboardInterrupt), which a trailing close() would miss.
    with httpx.Client(
        timeout=15.0,
        follow_redirects=True,
        headers={"User-Agent": USER_AGENT},
    ) as client:
        for scene_id, old_title, page_url in candidates:
            try:
                r = client.get(page_url)
                if r.status_code != 200:
                    errors += 1
                    continue

                new_title = _extract_title(r.text)
                if not new_title or new_title == old_title:
                    skipped += 1
                    continue
                # A shorter title than the stored one cannot be the un-truncated form.
                if len(new_title) < len(old_title):
                    skipped += 1
                    continue

                changed = False
                with session_scope() as s:
                    scene = s.get(Scene, scene_id)
                    # Compare against the live row, not the snapshot from the
                    # initial query: a scene joined to several freshporno
                    # playback sources appears more than once in `candidates`
                    # and may already have been repaired earlier in this run.
                    if scene is not None and scene.title != new_title:
                        log.info("update %s: %r -> %r", scene_id, scene.title, new_title)
                        scene.title = new_title
                        scene.title_normalized = normalize(new_title)
                        scene.slug = slugify(new_title)[:200]
                        changed = True

                if not changed:
                    # Scene vanished or already carries the new title.
                    skipped += 1
                    continue

                # Counted only after the session block exited without raising
                # (presumably where session_scope commits - confirm), so a
                # failed write is recorded as an error only, not as both.
                updated += 1
                if updated % 25 == 0:
                    log.info(
                        "progress: updated=%d skipped=%d errors=%d (%d/%d)",
                        updated, skipped, errors,
                        updated + skipped + errors, len(candidates),
                    )
            except Exception as e:
                # Best-effort backfill: one bad scene must not stop the run.
                errors += 1
                if errors <= 5:  # cap the noise; later failures are only counted
                    log.warning("scene=%s url=%s failed: %s", scene_id, page_url, e)

    log.info(
        "DONE: candidates=%d updated=%d skipped=%d errors=%d",
        len(candidates), updated, skipped, errors,
    )
    return 0
|
|
|
|
|
|
# Run the backfill only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
|