feat(ingest): disable hqfap/4k69 (broken playback), latestpornvideo → browse

- hqfap + 4k69: both were still ingesting fresh scenes and looked alive, but
  playback is dead (hqfap serves the same fixed ~3MB "server down" stub for every
  scene; 4k69 no longer resolves a playable URL). Both are removed from
  ALL_BROWSE_SCRAPERS so no new unplayable scenes get ingested; their existing
  live playback_sources were marked dead in prod (so the scenes drop out of
  has_playback / Sites). The extractors stay in the registry for easy
  re-enabling if the hosts recover.
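- latestpornvideo: it was a performer-search scraper, so it only ingested scenes
  of performers we searched for and never picked up the site's "latest" feed —
  users saw a stale set. Converted to a browse scraper reading / (page 1) and
  /page/N/ (studio + date from the thumbnail filename or the title, category-*
  classes as tags; performers are left empty and filled in by canonical merge).
  Moved from ALL_DIRECT_SCRAPERS to ALL_BROWSE_SCRAPERS.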

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jtrzupek 2026-06-22 09:34:47 +02:00
parent 4afebacad8
commit f34a75f4c6
2 changed files with 111 additions and 105 deletions

View file

@ -101,7 +101,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan. # XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek # Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled. # match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
LatestPornVideoScraper, # LatestPornVideoScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
# user 1da0375e: search-driven nie brał feedu "latest" → stary zestaw w apce).
# LatestLeaksScraper — wyłączony 2026-05-12 (source quality report): 16,438 scen, 0.0% # LatestLeaksScraper — wyłączony 2026-05-12 (source quality report): 16,438 scen, 0.0%
# canonical match. Slug-concat tytuły, brak studio/duration/date signali. Solo orphany # canonical match. Slug-concat tytuły, brak studio/duration/date signali. Solo orphany
# usunięte (~15k scen). # usunięte (~15k scen).
@ -148,8 +149,8 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402 from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402 from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402 from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402 from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402 from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402 from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402 from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402 from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
@ -157,6 +158,12 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
FreshpornoScraper, FreshpornoScraper,
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
# nie brał feedu "latest"). Listing card: tytuł (z embedded "<Studio> YY MM DD"),
# thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma
# czysto (brak `actors-*`) → puste, dorabia canonical-merge. Playback: luluvid
# iframe → extractor latestpornvideocom (_embed_iframe) → telefon resolwuje.
LatestPornVideoScraper,
# SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski # SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski
# zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page/<n>/`. # zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page/<n>/`.
# Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/ # Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/
@ -207,18 +214,17 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
# Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie # Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
# full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.) # full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
XVideosBrowseScraper, XVideosBrowseScraper,
# HQFapScraper — dołączony 2026-06-10 (user request). PlayTube CMS, ~120k scen # HQFapScraper / FourK69Scraper — WYŁĄCZONE 2026-06-22 (user request, na razie).
# (re-upload pornhd.pet). JSON-LD VideoObject (title+uploadDate+duration+thumb+ # Oba na PlayTube CMS, ingestowały świeżo i wyglądały żywo, ALE playback w obu padł:
# contentUrl) + pille Pornstars/Categories na detail page. Listing nie paginuje # - hqfap: hosting migrował na `/upload/videos/video_down.mp4` = STAŁY ~3MB stub
# się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4 # "server down" dla KAŻDEJ sceny (extractor go odrzuca → None),
# (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`. # - 4k69: get_file nie zwraca już grywalnego URL (extractor resolves nothing → None).
HQFapScraper, # Scena bez grywalnego źródła = śmieciowy wpis, więc nie ingestujemy nowych. Istniejące
# FourK69Scraper — dołączony 2026-06-10 (user request). Probe 2026-06-01 odrzucił # live playback_sources oznaczone dead na prodzie (znikają z /sources + has_playback).
# po homepage "JS-rendered" — błędnie: scene pages mają pełny SSR + JSON-LD. Ta sama # Reversible: odkomentuj + odżyw sources gdy hosting wróci. Extractory zostają w
# platforma PlayTube co hqfap (wspólna baza _playtube.py), ~65k scen, content głównie # _REGISTRY (hqfapcom/4k69com) — gotowe gdyby content wrócił.
# studyjny (4K paysite re-upload). Studio z kategorii matchowanych do listy /studios. # HQFapScraper,
# Stream get_file (www.4kporno.xxx) jak fullmovies → mobile_direct, skip 2160p. # FourK69Scraper,
FourK69Scraper,
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/ # NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration # porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa # meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa

View file

@ -1,35 +1,41 @@
"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven). """latestpornvideo.com — latest-vids browse scraper.
2026-06-16 fix (zamrożony od 06-13): stary regex łapał śmieci (`/wp-json` itp.), Historia: dawniej performer-driven search scraper (`/actor/<slug>/`). Problem
nie sceny. Sceny to `/<post_id>/` (numeryczne). Czytamy listing performera (user-report 1da0375e): search-scraper ingestuje TYLKO sceny performerów, których
`/actor/<slug>/` i parsujemy karty `<article>`. akurat szukamy feed strony "latest" nigdy nie wpada, w apce widać stary zestaw,
a na stronie jest świeży. Przerobione na BROWSE (latest chronologicznie z
`/page/<n>/`, page 1 = `/`), 2026-06-22.
Metadane z karty (listing, bez detail-fetcha): Listing card (zero detail-fetchy detail page nie ma performerów ani duration):
- klasa `<article>`: `actors-<slug>` (multi) performerzy; `tag-<slug>` (multi) + <article class="... post-<id> ... category-<cat> tag-<x> tag-<y> ...">
`category-<slug>` tagi (filtrujemy fragmenty imienia performera) <a href="https://latestpornvideo.com/<id>/" title="<Tytuł>">
- `<a href title="...">` URL sceny (/<id>/) + tytuł data-main-thumb="<Studio>-YYYY-MM-DD-...-cover.jpg"
- `data-main-thumb` thumbnail; jego nazwa pliku koduje `<Studio>-YYYY-MM-DD-...` tytuł, miniatura, studio+release_date (z nazwy thumba albo z tytułu
wyłuskujemy studio + release_date (gdy pasuje wzorzec) "<Studio> YY MM DD ..."). Performerzy: listing ICH NIE MA czysto
(homepage karty bez `actors-*`, jak na stronach /actor/), a `tag-*` miesza
fragmenty imion z gatunkami NIE ufamy tagom jako performerom; performera
dorabia canonical-merge po tytule+duration. Tagi bierzemy ostrożnie.
Duration NIE ma w listingu (pusty span). Playback: extractor `latestpornvideocom` Playback: luluvid (filemoon family) iframe extractor `latestpornvideocom`
(_embed_iframe luluvid/hoster, phone-side). (_embed_iframe type='hoster'), telefon resolwuje phone-side. page_url = /<id>/.
""" """
from __future__ import annotations from __future__ import annotations
import html import html
import logging import logging
import re import re
from collections.abc import Iterator
from datetime import date from datetime import date
from app.connectors.base import ( from app.connectors.base import (
RawPerformer,
RawPlaybackSource, RawPlaybackSource,
RawScene, RawScene,
RawStudio, RawStudio,
RawTag, RawTag,
) )
from app.connectors.direct_scrapers._search_base import BaseSearchScraper from app.connectors.direct_scrapers._browse_base import (
BaseBrowseScraper,
compute_thumbnail_phash,
)
from app.extractors import browser_get from app.extractors import browser_get
from app.normalize.text import slugify from app.normalize.text import slugify
@ -39,42 +45,45 @@ _BASE = "https://latestpornvideo.com"
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE) _ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE) _LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE) _THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
_CLASS_ACTOR_RE = re.compile(r"\bactors-([a-z0-9-]+)")
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)") _CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)") _CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
# Nazwa thumba: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg` (np. Analized-2021-01-09-Amirah-...). # Nazwa thumba: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg`.
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE) _THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
# Tytuł: `<Studio> YY MM DD <rest>` (np. "MySexMobile 20 10 23 Abella Danger"). # Tytuł: `<Studio> YY MM DD <rest>` (np. "MySexMobile 20 10 23 Abella Danger").
# Studio (grupa 1) bywa puste, gdy data jest na początku ("21 01 26 Abella Danger").
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b") _TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
# Karty homepage zawsze siedzą w kategorii "latest-porn-videos" — to nie jest tag.
_CAT_SKIP = {"latest-porn-videos", "uncategorized", ""}
def _name_from_slug(slug: str) -> str: def _name_from_slug(slug: str) -> str:
return " ".join(w.capitalize() for w in slug.split("-") if w) return " ".join(w.capitalize() for w in slug.split("-") if w)
class LatestPornVideoScraper(BaseSearchScraper): class LatestPornVideoScraper(BaseBrowseScraper):
sitetag = "latestpornvideocom" sitetag = "latestpornvideocom"
def search( def _listing_url(self, page: int) -> str:
self, query: str, *, page: int = 1, limit: int | None = None return _BASE + "/" if page <= 1 else f"{_BASE}/page/{page}/"
) -> Iterator[RawScene]:
actor_slug = slugify(query) # crawl_page nadpisany → poniższe abstrakcje nieużywane, ale wymagane do instancji.
if not actor_slug: def _extract_scene_urls(self, listing_html: str) -> list[str]:
return return [m.group(1) for m in _LINK_RE.finditer(listing_html)]
url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
try: def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
r = browser_get(url, timeout=self._timeout) return None
except Exception as e:
log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e) def crawl_page(self, page: int) -> list[RawScene] | None:
return url = self._listing_url(page)
if r.status_code != 200: try:
return res = browser_get(url, timeout=self._timeout)
text = res.text if hasattr(res, "text") else res
except Exception as e:
log.warning("latestpornvideo browse listing fetch failed (page %d): %s", page, e)
return None
text = r.text
anchors = list(_ARTICLE_RE.finditer(text)) anchors = list(_ARTICLE_RE.finditer(text))
out: list[RawScene] = []
seen: set[str] = set() seen: set[str] = set()
yielded = 0
for idx, m in enumerate(anchors): for idx, m in enumerate(anchors):
cls = m.group(1) cls = m.group(1)
win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500 win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500
@ -84,7 +93,8 @@ class LatestPornVideoScraper(BaseSearchScraper):
if not link_m: if not link_m:
continue continue
scene_url = link_m.group(1).rstrip("/") + "/" scene_url = link_m.group(1).rstrip("/") + "/"
if not scene_url.startswith(_BASE) or scene_url in seen: # tylko właściwe posty scen (/<digits>/), bez nav/kategorii
if not re.fullmatch(rf"{re.escape(_BASE)}/\d+/", scene_url) or scene_url in seen:
continue continue
seen.add(scene_url) seen.add(scene_url)
title = html.unescape(link_m.group(2)).strip() title = html.unescape(link_m.group(2)).strip()
@ -94,42 +104,12 @@ class LatestPornVideoScraper(BaseSearchScraper):
thumb_m = _THUMB_RE.search(window) thumb_m = _THUMB_RE.search(window)
thumb = thumb_m.group(1) if thumb_m else None thumb = thumb_m.group(1) if thumb_m else None
# Performerzy z klasy.
performers: list[RawPerformer] = []
perf_tokens: set[str] = set()
seen_perf: set[str] = set()
for am in _CLASS_ACTOR_RE.finditer(cls):
sl = am.group(1)
if sl in seen_perf:
continue
seen_perf.add(sl)
perf_tokens.update(sl.split("-"))
performers.append(
RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl))
)
if not performers:
perf_tokens.update(actor_slug.split("-"))
performers.append(
RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip())
)
# Tagi z klasy: tag-* + category-*; pomijamy fragmenty imienia performera.
tags: list[RawTag] = []
seen_tag: set[str] = set()
for tm in list(_CLASS_TAG_RE.finditer(cls)) + list(_CLASS_CAT_RE.finditer(cls)):
sl = re.sub(r"-(porn|leaks?|videos?)$", "", tm.group(1))
if not sl or sl in seen_tag or sl in perf_tokens:
continue
seen_tag.add(sl)
tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl))
# Studio + release_date z nazwy thumba (`<Studio>-YYYY-MM-DD-`). # Studio + release_date z nazwy thumba (`<Studio>-YYYY-MM-DD-`).
studio: RawStudio | None = None studio: RawStudio | None = None
release_date: date | None = None release_date: date | None = None
if thumb and (tn := _THUMB_NAME_RE.search(thumb)): if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
studio_raw = tn.group(1).replace("-", " ").strip() studio_raw = tn.group(1).replace("-", " ").strip()
# Pomiń gdy "studio" to w istocie imię performera. if studio_raw:
if studio_raw and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers}:
studio = RawStudio( studio = RawStudio(
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}", external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
name=studio_raw, slug=slugify(studio_raw), name=studio_raw, slug=slugify(studio_raw),
@ -139,7 +119,7 @@ class LatestPornVideoScraper(BaseSearchScraper):
except ValueError: except ValueError:
release_date = None release_date = None
# Fallback z tytułu: `<Studio> YY MM DD ...` gdy thumb nie dał studio/daty. # Fallback z tytułu: `<Studio> YY MM DD ...`.
if studio is None or release_date is None: if studio is None or release_date is None:
if tm2 := _TITLE_DATE_RE.search(title): if tm2 := _TITLE_DATE_RE.search(title):
if release_date is None: if release_date is None:
@ -150,31 +130,51 @@ class LatestPornVideoScraper(BaseSearchScraper):
except ValueError: except ValueError:
release_date = None release_date = None
studio_raw = tm2.group(1).strip(" -") studio_raw = tm2.group(1).strip(" -")
if ( if studio is None and 2 <= len(studio_raw) <= 30:
studio is None and 2 <= len(studio_raw) <= 30
and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers}
):
studio = RawStudio( studio = RawStudio(
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}", external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
name=studio_raw, slug=slugify(studio_raw), name=studio_raw, slug=slugify(studio_raw),
) )
yield RawScene( # Tagi: tylko prawdziwe kategorie (category-*), bez "latest-porn-videos".
external_id=f"{self.sitetag}:{scene_url}", # `tag-*` POMIJAMY — to mieszanka fragmentów imion performerów i gatunków,
title=title, # bez `actors-*` (jak na /actor/) nie da się ich rozdzielić → byłby szum.
release_date=release_date, tags: list[RawTag] = []
url=scene_url, seen_tag: set[str] = set()
studio=studio, for cm in _CLASS_CAT_RE.finditer(cls):
performers=performers, sl = cm.group(1)
tags=tags, if sl in _CAT_SKIP or sl in seen_tag:
playback_sources=[ continue
RawPlaybackSource( seen_tag.add(sl)
origin=f"tube:{self.sitetag}", tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl))
page_url=scene_url,
thumbnail_url=thumb, fingerprints = []
) if thumb:
], ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
if ph:
from app.connectors.base import RawFingerprint
fingerprints.append(RawFingerprint(kind="phash", value=ph))
out.append(
RawScene(
external_id=f"{self.sitetag}:{scene_url}",
title=title,
release_date=release_date,
url=scene_url,
studio=studio,
performers=[],
tags=tags,
fingerprints=fingerprints,
playback_sources=[
RawPlaybackSource(
origin=f"tube:{self.sitetag}",
page_url=scene_url,
thumbnail_url=thumb,
)
],
)
) )
yielded += 1
if limit is not None and yielded >= limit: log.info("latestpornvideo browse page %d: %d scenes", page, len(out))
return return out