goon/app/connectors/direct_scrapers/_browse_base.py
jtrzupek cd12348782 fix(movies): paradisehill delta date-granularity + browse cadence docs
- paradisehill.fetch_movies compared release_date coerced to midnight against the
  `since` timestamp, so the chronological crawl stopped at the first upload dated
  the same calendar day as `since` and silently dropped most new movies (0-2 seen
  per run; Movies tab stalled). Compare by DATE with a 1-day grace instead; idempotent
  external_records upsert dedups the re-fetched recent window.
- scripts/backfill_paradisehill_movies.py: one-off no-delta deep crawl to recover the
  backlog missed during the bug (idempotent, resumable).
- docs: correct stale 'raz dziennie/24h' browse-latest comments to 6h (4x/day), the
  actual configured cadence (config.py sched_browse_latest_hours=6).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 17:00:10 +02:00

195 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""BaseBrowseScraper — latest-vids browse mode (vs search-by-performer).
Wzorzec: tube'y typu shyfap/freshporno/porn00/fullmovies/pornxp mają bogatą
metadata (title, studio, performers, tags, duration, release_date, description)
na detail page'u — wystarczy do canonical fuzzy match w resolverze. Browse mode
iteruje "latest" page (sorted by upload date) i fetchuje detail per scene.
Różnica vs `BaseSearchScraper`:
- **search**: tube wyszukuje sceny po performer name (dla performer-driven
backfill). Wymaga znanego performera.
- **browse**: tube listuje newest scenes (latest-vids endpoint). Nie wymaga
żadnego query — chodzi o świeże sceny independent of performer state.
Browse jest komplementarny do search:
- search łapie sceny dla **znanych performerów** (TPDB/StashDB → tube)
- browse łapie **świeże sceny** których performer może być new dla nas
(nowicjuszka w branży nie jeszcze w TPDB → mamy ją z browse → później
canonical TPDB ingest mergeuje)
Subclass dostarcza HTML parsing (listing → scene URLs + detail → RawScene).
"""
from __future__ import annotations
import abc
import io
import logging
import re
from collections.abc import Iterator
import httpx
from app.connectors.base import RawFingerprint, RawPlaybackSource, RawScene
from app.connectors.direct_scrapers.base import BaseDirectTubeScraper
from app.extractors import browser_get
log = logging.getLogger(__name__)
class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
    """Base class for "latest videos" browse scrapers.

    Subclasses provide the listing/detail HTML parsing. The flow driven by
    ``latest_scenes`` is:

    1. for every page in 1..max_pages, fetch ``_listing_url(page)``;
    2. extract the scene URLs from the listing HTML;
    3. fetch the detail page of each scene URL;
    4. parse it into a ``RawScene`` and yield it.
    """

    # HTTP timeout per request (forwarded to browser_get as ``timeout``).
    _timeout: float = 30.0

    @abc.abstractmethod
    def _listing_url(self, page: int) -> str:
        """Return the URL of the 'latest-vids' listing page (page 1 = newest)."""

    @abc.abstractmethod
    def _extract_scene_urls(self, listing_html: str) -> list[str]:
        """Return absolute scene URLs found in the listing HTML, newest first."""

    @abc.abstractmethod
    def _parse_detail(self, scene_url: str, detail_html: str) -> RawScene | None:
        """Parse a scene detail page into a ``RawScene`` carrying its metadata.

        Return ``None`` when the scene is unavailable or cannot be parsed; the
        caller then skips this URL instead of aborting the whole browse run.
        """

    def latest_scenes(self, *, max_pages: int = 5) -> Iterator[RawScene]:
        """Yield scenes newest-first from listing pages 1..max_pages.

        Failure handling:

        - a listing fetch error is logged as a warning and ends the iteration;
        - an empty listing page ends the iteration;
        - a detail fetch error or a detail parse error skips that scene only;
        - a ``None`` result from ``_parse_detail`` is skipped silently.

        Project notes carried over from the original documentation (not
        verifiable from this module alone): the default ``max_pages=5`` gives
        roughly 100 scenes per tube per run at ~20 scenes per page, the job is
        scheduled every 6h, and de-duplication by external id happens later in
        the resolver, so re-yielding the same scenes across runs is safe.
        """
        for page in range(1, max_pages + 1):
            # Built outside the try block: an error raised by the subclass
            # hook must propagate rather than be treated as a fetch failure.
            listing_url = self._listing_url(page)
            try:
                response = browser_get(listing_url, timeout=self._timeout)
                # browser_get may hand back a response object or the raw HTML.
                listing_html = getattr(response, "text", response)
            except Exception as exc:
                log.warning("%s browse listing fetch failed (page %d): %s", self.sitetag, page, exc)
                return

            scene_urls = self._extract_scene_urls(listing_html)
            if not scene_urls:
                log.info("%s browse: empty listing page %d, stopping", self.sitetag, page)
                return
            log.info("%s browse page %d: %d scene URLs", self.sitetag, page, len(scene_urls))

            for scene_url in scene_urls:
                try:
                    response = browser_get(scene_url, timeout=self._timeout)
                    detail_html = getattr(response, "text", response)
                except Exception as exc:
                    log.info("%s detail fetch failed %s: %s", self.sitetag, scene_url, exc)
                    continue

                try:
                    scene = self._parse_detail(scene_url, detail_html)
                except Exception as exc:
                    log.warning("%s detail parse failed %s: %s", self.sitetag, scene_url, exc)
                else:
                    if scene is not None:
                        yield scene

    # Stub ``search()``: per the original notes, BaseDirectTubeScraper requires
    # an implementation. Browse-only tubes do not support performer-driven
    # search, so the real work goes through latest_scenes() and this returns an
    # empty iterator. Tubes that want both modes can override search().
    def search(
        self,
        query: str,
        *,
        page: int = 1,
        limit: int | None = None,
    ) -> Iterator[RawScene]:
        """Return an empty iterator; browse-only scrapers have no search mode."""
        return iter(())
# Cache of compiled <meta> lookup patterns, filled lazily by meta_content()
# and keyed by "prop:<value>" / "name:<value>".
_META_RE_CACHE: dict[str, re.Pattern[str]] = {}

# Browser-like User-Agent sent when downloading thumbnails for hashing.
_PHASH_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/140.0.0.0 Safari/537.36"
)
def compute_thumbnail_phash(thumbnail_url: str, *, referer: str | None = None, timeout: float = 15.0) -> str | None:
    """Download a thumbnail and return its perceptual hash as a hex string.

    Per the original notes, the default ``imagehash.phash`` settings
    (hash_size=8) give a 64-bit hash rendered as 16 hex characters, the same
    format stored in ``SceneFingerprint.value`` and matched by the resolver
    by Hamming distance.

    ``PIL`` and ``imagehash`` are imported lazily so this module still imports
    when they are missing; in that case a warning is logged and ``None`` is
    returned (graceful degradation: the caller simply has no fingerprint).

    Args:
        thumbnail_url: URL of the thumbnail image. Empty/``None`` returns
            ``None`` immediately without any network access or logging.
        referer: optional ``Referer`` header value to send with the request.
        timeout: timeout passed to ``httpx.Client``.

    Returns:
        The hash as a string, or ``None`` when the URL is empty, the imaging
        libraries are unavailable, the response is not a non-empty HTTP 200,
        or downloading/decoding/hashing fails (failure is logged at INFO).
    """
    # Nothing to download: skip the import check, the HTTP client and the
    # "compute failed" log line that an empty URL would otherwise trigger.
    if not thumbnail_url:
        return None

    try:
        from PIL import Image
        import imagehash
    except ImportError:
        log.warning("imagehash/Pillow nie zainstalowane — phash skipped")
        return None

    headers = {"User-Agent": _PHASH_UA}
    if referer:
        headers["Referer"] = referer

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as c:
            r = c.get(thumbnail_url, headers=headers)
        # The body is fully read by get(), so the client can be released
        # before the image is decoded and hashed.
        if r.status_code != 200 or not r.content:
            return None
        # Context manager closes the image deterministically after hashing.
        with Image.open(io.BytesIO(r.content)) as img:
            # Some webp/animated thumbnails are multi-frame; per the original
            # notes convert() works on the first frame, which imagehash then
            # reduces to greyscale internally.
            return str(imagehash.phash(img.convert("RGB")))
    except Exception as e:
        # Best-effort by design: any download/decode/hash failure means "no phash".
        log.info("phash compute failed for %s: %s", thumbnail_url, e)
        return None
def meta_content(html: str, *, property: str | None = None, name: str | None = None) -> str | None:
    """Extract the ``content`` value of a ``<meta>`` tag selected by attribute.

    Matches ``<meta property=X content=Y>`` or ``<meta name=X content=Y>``
    (OpenGraph and similar), with the selector attribute and ``content`` in
    either order inside the tag. ``property`` takes precedence over ``name``
    when both are given.

    The content value must be closed by the same quote character that opened
    it, so an apostrophe inside a double-quoted value (``content="She's ..."``)
    does not truncate the result. Separate alternatives are kept for ``"..."``
    and ``'...'`` for exactly that reason.

    Compiled patterns are cached in the module-level ``_META_RE_CACHE`` dict,
    because the same selectors are looked up repeatedly.

    Args:
        html: document text to search.
        property: value of the ``property`` attribute to look for.
        name: value of the ``name`` attribute to look for.

    Returns:
        The stripped content of the first matching tag, or ``None`` when no
        selector or no html is given, nothing matches, or the content is
        empty / whitespace-only.
    """
    wanted = property or name
    # Without a selector (or a document) there is nothing meaningful to match;
    # do not compile and cache a pattern for an empty attribute value.
    if not wanted or not html:
        return None

    key = f"prop:{property}" if property else f"name:{name}"
    pattern = _META_RE_CACHE.get(key)
    if pattern is None:
        attr = "property" if property else "name"
        escaped = re.escape(wanted)
        selector = rf'{attr}=["\']{escaped}["\']'
        double_quoted = r'content="([^"]*)"'
        single_quoted = r"content='([^']*)'"
        alternatives = (
            # selector attribute first, then content
            rf'<meta[^>]+{selector}[^>]*?{double_quoted}',
            rf'<meta[^>]+{selector}[^>]*?{single_quoted}',
            # content first, then selector attribute
            rf'<meta[^>]+?{double_quoted}[^>]*?{selector}',
            rf'<meta[^>]+?{single_quoted}[^>]*?{selector}',
        )
        pattern = re.compile("|".join(alternatives), re.IGNORECASE)
        _META_RE_CACHE[key] = pattern

    m = pattern.search(html)
    if not m:
        return None
    # Exactly one alternative matched, so exactly one capture group is set.
    raw = next((g for g in m.groups() if g is not None), None)
    if not raw:
        return None
    # Whitespace-only content is treated like empty content.
    return raw.strip() or None