feat(ingest): disable hqfap/4k69 (broken playback), latestpornvideo → browse
- hqfap + 4k69: both ingested fresh content, but playback is dead (hqfap serves a fixed ~3MB "server down" stub for every scene; 4k69 resolves no playable URL). Removed from ALL_BROWSE_SCRAPERS so no new dead sources get ingested; existing live playback_sources were marked dead in prod (scenes drop out of has_playback / Sites). Extractors are kept in the registry for easy re-enable if the hosts recover.
- latestpornvideo: was a performer-search scraper, so it never picked up the site's "latest" feed — users saw a stale set. Converted to a browse scraper reading /page/N/ (studio+date from title/thumb, category tags; performers via canonical merge). Moved from the DIRECT list to the BROWSE list.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4afebacad8
commit
f34a75f4c6
2 changed files with 111 additions and 105 deletions
|
|
@ -101,7 +101,8 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
|
# XxxFreeWatchScraper — wyłączony 2026-05-18. 790 scen, 0% canonical match, 100% solo-orphan.
|
||||||
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
|
# Cloudflare 403 z VPS IP, mobile WebView teoretycznie działa ale 0/790 scen miało jakikolwiek
|
||||||
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
|
# match do TPDB/StashDB. Pure orphan factory. Solo scenes deleted, scraper disabled.
|
||||||
LatestPornVideoScraper,
|
# LatestPornVideoScraper — przeniesiony do ALL_BROWSE_SCRAPERS (browse-konwersja 2026-06-22,
|
||||||
|
# user 1da0375e: search-driven nie brał feedu "latest" → stary zestaw w apce).
|
||||||
# LatestLeaksScraper — wyłączony 2026-05-12 (source quality report): 16,438 scen, 0.0%
|
# LatestLeaksScraper — wyłączony 2026-05-12 (source quality report): 16,438 scen, 0.0%
|
||||||
# canonical match. Slug-concat tytuły, brak studio/duration/date signali. Solo orphany
|
# canonical match. Slug-concat tytuły, brak studio/duration/date signali. Solo orphany
|
||||||
# usunięte (~15k scen).
|
# usunięte (~15k scen).
|
||||||
|
|
@ -148,8 +149,8 @@ from app.connectors.direct_scrapers.shyfap import ShyfapScraper # noqa: E402, F
|
||||||
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
from app.connectors.direct_scrapers.yesporn import YesPornVipScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
from app.connectors.direct_scrapers.fullmovies import FullmoviesScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
from app.connectors.direct_scrapers.hdporngg import HDPornGGScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402
|
from app.connectors.direct_scrapers.fourk69 import FourK69Scraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
|
||||||
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402
|
from app.connectors.direct_scrapers.hqfap import HQFapScraper # noqa: E402,F401 — disabled 2026-06-22 (broken playback), kept for backref/re-enable
|
||||||
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
from app.connectors.direct_scrapers.neporn import NepornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
|
from app.connectors.direct_scrapers.superporn import SuperpornScraper # noqa: E402
|
||||||
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
from app.connectors.direct_scrapers.eporner_api import EpornerApiScraper # noqa: E402
|
||||||
|
|
@ -157,6 +158,12 @@ from app.connectors.direct_scrapers.xvideos_browse import XVideosBrowseScraper
|
||||||
|
|
||||||
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
FreshpornoScraper,
|
FreshpornoScraper,
|
||||||
|
# LatestPornVideoScraper — browse od 2026-06-22 (user 1da0375e: search-driven
|
||||||
|
# nie brał feedu "latest"). Listing card: tytuł (z embedded "<Studio> YY MM DD"),
|
||||||
|
# thumb (studio+date w nazwie), category-* jako tag. Performerów listing nie ma
|
||||||
|
# czysto (brak `actors-*`) → puste, dorabia canonical-merge. Playback: luluvid
|
||||||
|
# iframe → extractor latestpornvideocom (_embed_iframe) → telefon resolwuje.
|
||||||
|
LatestPornVideoScraper,
|
||||||
# SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski
|
# SiskaScraper — re-enabled 2026-06-20 jako browse (user fa4083a2). Search siski
|
||||||
# zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page/<n>/`.
|
# zepsuty site-side (`?s=` ignoruje query), więc latest-browse z `/page/<n>/`.
|
||||||
# Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/
|
# Komplet metadanych z kafelka listingu (tytuł/duration/thumb/performer/studio/
|
||||||
|
|
@ -207,18 +214,17 @@ ALL_BROWSE_SCRAPERS: list[type[BaseBrowseScraper]] = [
|
||||||
# Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
|
# Mega-katalog ~13M → deep_crawl._PAGE_CAP["xvideoscom"]=1800 (~50k najnowszych), nie
|
||||||
# full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
|
# full-crawl. (youporn pominięty — JSON-LD bez actor/keywords, scene-perf/tagi = nav A-Z.)
|
||||||
XVideosBrowseScraper,
|
XVideosBrowseScraper,
|
||||||
# HQFapScraper — dołączony 2026-06-10 (user request). PlayTube CMS, ~120k scen
|
# HQFapScraper / FourK69Scraper — WYŁĄCZONE 2026-06-22 (user request, na razie).
|
||||||
# (re-upload pornhd.pet). JSON-LD VideoObject (title+uploadDate+duration+thumb+
|
# Oba na PlayTube CMS, ingestowały świeżo i wyglądały żywo, ALE playback w obu padł:
|
||||||
# contentUrl) + pille Pornstars/Categories na detail page. Listing nie paginuje
|
# - hqfap: hosting migrował na `/upload/videos/video_down.mp4` = STAŁY ~3MB stub
|
||||||
# się GET-em → crawl_page po sitemap index (12 plików, lastmod desc). Direct mp4
|
# "server down" dla KAŻDEJ sceny (extractor go odrzuca → None),
|
||||||
# (cdnde.com / okcdn.ru), cross-IP portable → natywny extractor `hqfapcom`.
|
# - 4k69: get_file nie zwraca już grywalnego URL (extractor resolves nothing → None).
|
||||||
HQFapScraper,
|
# Scena bez grywalnego źródła = śmieciowy wpis, więc nie ingestujemy nowych. Istniejące
|
||||||
# FourK69Scraper — dołączony 2026-06-10 (user request). Probe 2026-06-01 odrzucił
|
# live playback_sources oznaczone dead na prodzie (znikają z /sources + has_playback).
|
||||||
# po homepage "JS-rendered" — błędnie: scene pages mają pełny SSR + JSON-LD. Ta sama
|
# Reversible: odkomentuj + odżyw sources gdy hosting wróci. Extractory zostają w
|
||||||
# platforma PlayTube co hqfap (wspólna baza _playtube.py), ~65k scen, content głównie
|
# _REGISTRY (hqfapcom/4k69com) — gotowe gdyby content wrócił.
|
||||||
# studyjny (4K paysite re-upload). Studio z kategorii matchowanych do listy /studios.
|
# HQFapScraper,
|
||||||
# Stream get_file (www.4kporno.xxx) jak fullmovies → mobile_direct, skip 2160p.
|
# FourK69Scraper,
|
||||||
FourK69Scraper,
|
|
||||||
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
|
# NepornScraper — dołączony 2026-06-10 (user request). KVS engine (jak freshporno/
|
||||||
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
|
# porn00), /latest-updates/N/. JSON-LD (title+desc+uploadDate+thumb) + video:duration
|
||||||
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa
|
# meta + /models/ performerzy + /categories/ tagi. Brak studio (tytuł bywa
|
||||||
|
|
|
||||||
|
|
@ -1,35 +1,41 @@
|
||||||
"""latestpornvideo.com — performer-page listing scrape (search-based, performer-driven).
|
"""latestpornvideo.com — latest-vids browse scraper.
|
||||||
|
|
||||||
2026-06-16 fix (zamrożony od 06-13): stary regex łapał śmieci (`/wp-json` itp.),
|
Historia: dawniej performer-driven search scraper (`/actor/<slug>/`). Problem
|
||||||
nie sceny. Sceny to `/<post_id>/` (numeryczne). Czytamy listing performera
|
(user-report 1da0375e): search-scraper ingestuje TYLKO sceny performerów, których
|
||||||
`/actor/<slug>/` i parsujemy karty `<article>`.
|
akurat szukamy → feed strony "latest" nigdy nie wpada, w apce widać stary zestaw,
|
||||||
|
a na stronie jest świeży. Przerobione na BROWSE (latest chronologicznie z
|
||||||
|
`/page/<n>/`, page 1 = `/`), 2026-06-22.
|
||||||
|
|
||||||
Metadane z karty (listing, bez detail-fetcha):
|
Listing card (zero detail-fetchy — detail page nie ma performerów ani duration):
|
||||||
- klasa `<article>`: `actors-<slug>` (multi) → performerzy; `tag-<slug>` (multi) +
|
<article class="... post-<id> ... category-<cat> tag-<x> tag-<y> ...">
|
||||||
`category-<slug>` → tagi (filtrujemy fragmenty imienia performera)
|
<a href="https://latestpornvideo.com/<id>/" title="<Tytuł>">
|
||||||
- `<a href title="...">` → URL sceny (/<id>/) + tytuł
|
data-main-thumb="<Studio>-YYYY-MM-DD-...-cover.jpg"
|
||||||
- `data-main-thumb` → thumbnail; jego nazwa pliku koduje `<Studio>-YYYY-MM-DD-...`
|
→ tytuł, miniatura, studio+release_date (z nazwy thumba albo z tytułu
|
||||||
→ wyłuskujemy studio + release_date (gdy pasuje wzorzec)
|
"<Studio> YY MM DD ..."). Performerzy: listing ICH NIE MA czysto
|
||||||
|
(homepage karty bez `actors-*`, jak na stronach /actor/), a `tag-*` miesza
|
||||||
|
fragmenty imion z gatunkami → NIE ufamy tagom jako performerom; performera
|
||||||
|
dorabia canonical-merge po tytule+duration. Tagi bierzemy ostrożnie.
|
||||||
|
|
||||||
Duration NIE ma w listingu (pusty span). Playback: extractor `latestpornvideocom`
|
Playback: luluvid (filemoon family) iframe → extractor `latestpornvideocom`
|
||||||
(_embed_iframe → luluvid/hoster, phone-side).
|
(_embed_iframe → type='hoster'), telefon resolwuje phone-side. page_url = /<id>/.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import html
|
import html
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from collections.abc import Iterator
|
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
|
||||||
from app.connectors.base import (
|
from app.connectors.base import (
|
||||||
RawPerformer,
|
|
||||||
RawPlaybackSource,
|
RawPlaybackSource,
|
||||||
RawScene,
|
RawScene,
|
||||||
RawStudio,
|
RawStudio,
|
||||||
RawTag,
|
RawTag,
|
||||||
)
|
)
|
||||||
from app.connectors.direct_scrapers._search_base import BaseSearchScraper
|
from app.connectors.direct_scrapers._browse_base import (
|
||||||
|
BaseBrowseScraper,
|
||||||
|
compute_thumbnail_phash,
|
||||||
|
)
|
||||||
from app.extractors import browser_get
|
from app.extractors import browser_get
|
||||||
from app.normalize.text import slugify
|
from app.normalize.text import slugify
|
||||||
|
|
||||||
|
|
@ -39,42 +45,45 @@ _BASE = "https://latestpornvideo.com"
|
||||||
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
|
_ARTICLE_RE = re.compile(r'<article[^>]*\bclass="([^"]+)"', re.IGNORECASE)
|
||||||
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
|
_LINK_RE = re.compile(r'<a\s+href="([^"]+)"\s+title="([^"]+)"', re.IGNORECASE)
|
||||||
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
|
_THUMB_RE = re.compile(r'data-main-thumb="([^"]+)"', re.IGNORECASE)
|
||||||
_CLASS_ACTOR_RE = re.compile(r"\bactors-([a-z0-9-]+)")
|
|
||||||
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
|
_CLASS_TAG_RE = re.compile(r"\btag-([a-z0-9-]+)")
|
||||||
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
|
_CLASS_CAT_RE = re.compile(r"\bcategory-([a-z0-9-]+)")
|
||||||
# Nazwa thumba: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg` (np. Analized-2021-01-09-Amirah-...).
|
# Nazwa thumba: `<Studio>-YYYY-MM-DD-<rest>-cover.jpg`.
|
||||||
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
|
_THUMB_NAME_RE = re.compile(r"/([A-Za-z0-9][A-Za-z0-9-]*?)-(\d{4})-(\d{2})-(\d{2})-", re.IGNORECASE)
|
||||||
# Tytuł: `<Studio> YY MM DD <rest>` (np. "MySexMobile 20 10 23 Abella Danger").
|
# Tytuł: `<Studio> YY MM DD <rest>` (np. "MySexMobile 20 10 23 Abella Danger").
|
||||||
# Studio (grupa 1) bywa puste, gdy data jest na początku ("21 01 26 Abella Danger").
|
|
||||||
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
|
_TITLE_DATE_RE = re.compile(r"^(.*?)\s*\b(\d{2})\s+(\d{2})\s+(\d{2})\b")
|
||||||
|
# Karty homepage zawsze siedzą w kategorii "latest-porn-videos" — to nie jest tag.
|
||||||
|
_CAT_SKIP = {"latest-porn-videos", "uncategorized", ""}
|
||||||
|
|
||||||
|
|
||||||
def _name_from_slug(slug: str) -> str:
|
def _name_from_slug(slug: str) -> str:
|
||||||
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
return " ".join(w.capitalize() for w in slug.split("-") if w)
|
||||||
|
|
||||||
|
|
||||||
class LatestPornVideoScraper(BaseSearchScraper):
|
class LatestPornVideoScraper(BaseBrowseScraper):
|
||||||
sitetag = "latestpornvideocom"
|
sitetag = "latestpornvideocom"
|
||||||
|
|
||||||
def search(
|
def _listing_url(self, page: int) -> str:
|
||||||
self, query: str, *, page: int = 1, limit: int | None = None
|
return _BASE + "/" if page <= 1 else f"{_BASE}/page/{page}/"
|
||||||
) -> Iterator[RawScene]:
|
|
||||||
actor_slug = slugify(query)
|
# crawl_page nadpisany → poniższe abstrakcje nieużywane, ale wymagane do instancji.
|
||||||
if not actor_slug:
|
def _extract_scene_urls(self, listing_html: str) -> list[str]:
|
||||||
return
|
return [m.group(1) for m in _LINK_RE.finditer(listing_html)]
|
||||||
url = f"{_BASE}/actor/{actor_slug}/" + (f"page/{page}/" if page > 1 else "")
|
|
||||||
try:
|
def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
|
||||||
r = browser_get(url, timeout=self._timeout)
|
return None
|
||||||
except Exception as e:
|
|
||||||
log.warning("latestpornvideo actor-page fetch failed (%s): %s", url, e)
|
def crawl_page(self, page: int) -> list[RawScene] | None:
|
||||||
return
|
url = self._listing_url(page)
|
||||||
if r.status_code != 200:
|
try:
|
||||||
return
|
res = browser_get(url, timeout=self._timeout)
|
||||||
|
text = res.text if hasattr(res, "text") else res
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("latestpornvideo browse listing fetch failed (page %d): %s", page, e)
|
||||||
|
return None
|
||||||
|
|
||||||
text = r.text
|
|
||||||
anchors = list(_ARTICLE_RE.finditer(text))
|
anchors = list(_ARTICLE_RE.finditer(text))
|
||||||
|
out: list[RawScene] = []
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
yielded = 0
|
|
||||||
for idx, m in enumerate(anchors):
|
for idx, m in enumerate(anchors):
|
||||||
cls = m.group(1)
|
cls = m.group(1)
|
||||||
win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500
|
win_end = anchors[idx + 1].start() if idx + 1 < len(anchors) else m.end() + 1500
|
||||||
|
|
@ -84,7 +93,8 @@ class LatestPornVideoScraper(BaseSearchScraper):
|
||||||
if not link_m:
|
if not link_m:
|
||||||
continue
|
continue
|
||||||
scene_url = link_m.group(1).rstrip("/") + "/"
|
scene_url = link_m.group(1).rstrip("/") + "/"
|
||||||
if not scene_url.startswith(_BASE) or scene_url in seen:
|
# tylko właściwe posty scen (/<digits>/), bez nav/kategorii
|
||||||
|
if not re.fullmatch(rf"{re.escape(_BASE)}/\d+/", scene_url) or scene_url in seen:
|
||||||
continue
|
continue
|
||||||
seen.add(scene_url)
|
seen.add(scene_url)
|
||||||
title = html.unescape(link_m.group(2)).strip()
|
title = html.unescape(link_m.group(2)).strip()
|
||||||
|
|
@ -94,42 +104,12 @@ class LatestPornVideoScraper(BaseSearchScraper):
|
||||||
thumb_m = _THUMB_RE.search(window)
|
thumb_m = _THUMB_RE.search(window)
|
||||||
thumb = thumb_m.group(1) if thumb_m else None
|
thumb = thumb_m.group(1) if thumb_m else None
|
||||||
|
|
||||||
# Performerzy z klasy.
|
|
||||||
performers: list[RawPerformer] = []
|
|
||||||
perf_tokens: set[str] = set()
|
|
||||||
seen_perf: set[str] = set()
|
|
||||||
for am in _CLASS_ACTOR_RE.finditer(cls):
|
|
||||||
sl = am.group(1)
|
|
||||||
if sl in seen_perf:
|
|
||||||
continue
|
|
||||||
seen_perf.add(sl)
|
|
||||||
perf_tokens.update(sl.split("-"))
|
|
||||||
performers.append(
|
|
||||||
RawPerformer(external_id=f"{self.sitetag}:performer:{sl}", name=_name_from_slug(sl))
|
|
||||||
)
|
|
||||||
if not performers:
|
|
||||||
perf_tokens.update(actor_slug.split("-"))
|
|
||||||
performers.append(
|
|
||||||
RawPerformer(external_id=f"{self.sitetag}:performer:{actor_slug}", name=query.strip())
|
|
||||||
)
|
|
||||||
|
|
||||||
# Tagi z klasy: tag-* + category-*; pomijamy fragmenty imienia performera.
|
|
||||||
tags: list[RawTag] = []
|
|
||||||
seen_tag: set[str] = set()
|
|
||||||
for tm in list(_CLASS_TAG_RE.finditer(cls)) + list(_CLASS_CAT_RE.finditer(cls)):
|
|
||||||
sl = re.sub(r"-(porn|leaks?|videos?)$", "", tm.group(1))
|
|
||||||
if not sl or sl in seen_tag or sl in perf_tokens:
|
|
||||||
continue
|
|
||||||
seen_tag.add(sl)
|
|
||||||
tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl))
|
|
||||||
|
|
||||||
# Studio + release_date z nazwy thumba (`<Studio>-YYYY-MM-DD-`).
|
# Studio + release_date z nazwy thumba (`<Studio>-YYYY-MM-DD-`).
|
||||||
studio: RawStudio | None = None
|
studio: RawStudio | None = None
|
||||||
release_date: date | None = None
|
release_date: date | None = None
|
||||||
if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
|
if thumb and (tn := _THUMB_NAME_RE.search(thumb)):
|
||||||
studio_raw = tn.group(1).replace("-", " ").strip()
|
studio_raw = tn.group(1).replace("-", " ").strip()
|
||||||
# Pomiń gdy "studio" to w istocie imię performera.
|
if studio_raw:
|
||||||
if studio_raw and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers}:
|
|
||||||
studio = RawStudio(
|
studio = RawStudio(
|
||||||
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
|
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
|
||||||
name=studio_raw, slug=slugify(studio_raw),
|
name=studio_raw, slug=slugify(studio_raw),
|
||||||
|
|
@ -139,7 +119,7 @@ class LatestPornVideoScraper(BaseSearchScraper):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
release_date = None
|
release_date = None
|
||||||
|
|
||||||
# Fallback z tytułu: `<Studio> YY MM DD ...` gdy thumb nie dał studio/daty.
|
# Fallback z tytułu: `<Studio> YY MM DD ...`.
|
||||||
if studio is None or release_date is None:
|
if studio is None or release_date is None:
|
||||||
if tm2 := _TITLE_DATE_RE.search(title):
|
if tm2 := _TITLE_DATE_RE.search(title):
|
||||||
if release_date is None:
|
if release_date is None:
|
||||||
|
|
@ -150,31 +130,51 @@ class LatestPornVideoScraper(BaseSearchScraper):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
release_date = None
|
release_date = None
|
||||||
studio_raw = tm2.group(1).strip(" -–")
|
studio_raw = tm2.group(1).strip(" -–")
|
||||||
if (
|
if studio is None and 2 <= len(studio_raw) <= 30:
|
||||||
studio is None and 2 <= len(studio_raw) <= 30
|
|
||||||
and slugify(studio_raw) not in {p.external_id.rsplit(":", 1)[1] for p in performers}
|
|
||||||
):
|
|
||||||
studio = RawStudio(
|
studio = RawStudio(
|
||||||
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
|
external_id=f"{self.sitetag}:studio:{slugify(studio_raw)}",
|
||||||
name=studio_raw, slug=slugify(studio_raw),
|
name=studio_raw, slug=slugify(studio_raw),
|
||||||
)
|
)
|
||||||
|
|
||||||
yield RawScene(
|
# Tagi: tylko prawdziwe kategorie (category-*), bez "latest-porn-videos".
|
||||||
external_id=f"{self.sitetag}:{scene_url}",
|
# `tag-*` POMIJAMY — to mieszanka fragmentów imion performerów i gatunków,
|
||||||
title=title,
|
# bez `actors-*` (jak na /actor/) nie da się ich rozdzielić → byłby szum.
|
||||||
release_date=release_date,
|
tags: list[RawTag] = []
|
||||||
url=scene_url,
|
seen_tag: set[str] = set()
|
||||||
studio=studio,
|
for cm in _CLASS_CAT_RE.finditer(cls):
|
||||||
performers=performers,
|
sl = cm.group(1)
|
||||||
tags=tags,
|
if sl in _CAT_SKIP or sl in seen_tag:
|
||||||
playback_sources=[
|
continue
|
||||||
RawPlaybackSource(
|
seen_tag.add(sl)
|
||||||
origin=f"tube:{self.sitetag}",
|
tags.append(RawTag(external_id=f"{self.sitetag}:tag:{sl}", name=_name_from_slug(sl), slug=sl))
|
||||||
page_url=scene_url,
|
|
||||||
thumbnail_url=thumb,
|
fingerprints = []
|
||||||
)
|
if thumb:
|
||||||
],
|
ph = compute_thumbnail_phash(thumb, referer=_BASE + "/")
|
||||||
|
if ph:
|
||||||
|
from app.connectors.base import RawFingerprint
|
||||||
|
|
||||||
|
fingerprints.append(RawFingerprint(kind="phash", value=ph))
|
||||||
|
|
||||||
|
out.append(
|
||||||
|
RawScene(
|
||||||
|
external_id=f"{self.sitetag}:{scene_url}",
|
||||||
|
title=title,
|
||||||
|
release_date=release_date,
|
||||||
|
url=scene_url,
|
||||||
|
studio=studio,
|
||||||
|
performers=[],
|
||||||
|
tags=tags,
|
||||||
|
fingerprints=fingerprints,
|
||||||
|
playback_sources=[
|
||||||
|
RawPlaybackSource(
|
||||||
|
origin=f"tube:{self.sitetag}",
|
||||||
|
page_url=scene_url,
|
||||||
|
thumbnail_url=thumb,
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
)
|
)
|
||||||
yielded += 1
|
|
||||||
if limit is not None and yielded >= limit:
|
log.info("latestpornvideo browse page %d: %d scenes", page, len(out))
|
||||||
return
|
return out
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue