- paradisehill.fetch_movies compared release_date coerced to midnight against the `since` timestamp, so the chronological crawl stopped at the first upload dated the same calendar day as `since` and silently dropped most new movies (0-2 seen per run; Movies tab stalled). Compare by DATE with a 1-day grace instead; idempotent external_records upsert dedups the re-fetched recent window. - scripts/backfill_paradisehill_movies.py: one-off no-delta deep crawl to recover the backlog missed during the bug (idempotent, resumable). - docs: correct stale 'raz dziennie/24h' browse-latest comments to 6h (4x/day), the actual configured cadence (config.py sched_browse_latest_hours=6). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
70 lines
2.6 KiB
Python
70 lines
2.6 KiB
Python
"""Browse-latest strategy — scrap newest scenes from rich-metadata tubes.
|
||
|
||
Pattern (vs performer-driven):
- performer-driven: backfills scenes for already-known performers
  (TPDB/StashDB → tube). Requires the performer to already exist in the DB.
- browse-latest: forward-fills with fresh scenes from the tubes (~100 newest /
  tube / day). Catches scenes whose performer may be new to us — the canonical
  ingest fills in the metadata later.

Each `BaseBrowseScraper.latest_scenes(max_pages=5)` yields RawScene objects with
rich metadata (studio + performers + duration + tags + description). The
composite fuzzy matcher in the resolver therefore has good signals for a
canonical match (vs orphan-only tubes such as pornditt, which provided only a
title + short description).

Scheduled by `jobs.py` every 6h / 4× per day (`sched_browse_latest_hours=6`).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import time
|
||
from dataclasses import dataclass
|
||
|
||
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
||
from app.models.source import SourceKind
|
||
from app.scheduler.performer_driven import _ingest_iter_into_run
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class BrowseCounters:
    """Aggregate totals for one `run_browse_latest` pass over all scrapers.

    Every field starts at zero and is only ever incremented by the runner.
    Field order is part of the interface (positional construction).
    """

    # Number of scrapers whose ingest call returned without raising.
    scrapers_run: int = 0
    # Sum of the per-scraper "seen" counts reported by the ingest helper.
    total_seen: int = 0
    # Sum of the per-scraper "new" counts reported by the ingest helper.
    total_new: int = 0
    # Sum of the per-scraper "updated" counts reported by the ingest helper.
    total_updated: int = 0
    # Per-scraper "errors" counts, plus one for every scraper that raised.
    total_errors: int = 0
|
||
|
||
|
||
def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters:
    """Run every registered browse scraper and ingest its latest scenes.

    Each class in ``ALL_BROWSE_SCRAPERS`` is instantiated and its
    ``latest_scenes(max_pages=...)`` iterator is handed to
    ``_ingest_iter_into_run``. The mapping that helper returns is folded into
    the aggregate counters via its "seen", "new", "updated" and "errors" keys
    (a missing key counts as 0).

    A failure in one scraper — including a failure while constructing it — is
    logged with its traceback, counted as one error, and does not prevent the
    remaining scrapers from running.

    Args:
        max_pages: Number of listing pages each scraper is asked to walk.

    Returns:
        The accumulated ``BrowseCounters`` for the whole pass.
    """
    counters = BrowseCounters()
    for scraper_cls in ALL_BROWSE_SCRAPERS:
        # Fallback label for log lines until the instance (and its sitetag)
        # is available; keeps the failure log meaningful if construction
        # itself raises.
        label = getattr(scraper_cls, "__name__", repr(scraper_cls))
        # Monotonic clock: elapsed time must not be skewed by wall-clock jumps.
        t0 = time.monotonic()
        try:
            # Instantiate inside the try so one broken scraper cannot abort
            # the whole run and starve the scrapers that follow it.
            scraper = scraper_cls()
            label = scraper.sitetag
            log.info("browse-latest: %s starting (max_pages=%d)", label, max_pages)
            c = _ingest_iter_into_run(
                source_kind=SourceKind.scraper,
                source_name="pornapp",
                run_label=f"browse-latest:{label}",
                # Bind the current scraper/max_pages as defaults so the
                # factory never observes a later loop iteration's values.
                iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
                    max_pages=mp
                ),
            )
            counters.scrapers_run += 1
            counters.total_seen += c.get("seen", 0)
            counters.total_new += c.get("new", 0)
            counters.total_updated += c.get("updated", 0)
            counters.total_errors += c.get("errors", 0)
            elapsed = time.monotonic() - t0
            log.info(
                "browse-latest: %s done in %.1fs counters=%s",
                label, elapsed, c,
            )
        except Exception as e:
            # Top-level boundary of a scheduled job: record and move on.
            counters.total_errors += 1
            log.exception("browse-latest: %s failed: %s", label, e)

    log.info("browse-latest done: %s", counters)
    return counters
|