"""Browse-latest strategy — scrap newest scenes from rich-metadata tubes. Pattern (vs performer-driven): - performer-driven: backfill scen dla znanych performerów (TPDB/StashDB → tube). Wymaga już znanego performera w DB. - browse-latest: forward-fill świeżymi scenami z tube'ów (~100 najnowszych / tube / dzień). Łapie sceny których performer może być new dla nas — później canonical ingest dorobi metadata. Każdy `BaseBrowseScraper.latest_scenes(max_pages=5)` yielduje RawScene z bogatą metadata (studio + performers + duration + tags + description). Composite fuzzy w resolverze ma więc dobre sygnały dla canonical match (vs orphan-only tubes typu pornditt, gdzie był sam title + krótki opis). Schedulowane przez `jobs.py` co 6h / 4×dobę (`sched_browse_latest_hours=6`). """ from __future__ import annotations import logging import time from dataclasses import dataclass from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS from app.models.source import SourceKind from app.scheduler.performer_driven import _ingest_iter_into_run log = logging.getLogger(__name__) @dataclass class BrowseCounters: scrapers_run: int = 0 total_seen: int = 0 total_new: int = 0 total_updated: int = 0 total_errors: int = 0 def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters: """Iteruj wszystkie zarejestrowane browse scrapers, scrap latest N pages każdy.""" counters = BrowseCounters() for scraper_cls in ALL_BROWSE_SCRAPERS: scraper = scraper_cls() t0 = time.time() log.info("browse-latest: %s starting (max_pages=%d)", scraper.sitetag, max_pages) try: c = _ingest_iter_into_run( source_kind=SourceKind.scraper, source_name="pornapp", run_label=f"browse-latest:{scraper.sitetag}", iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes( max_pages=mp ), ) counters.scrapers_run += 1 counters.total_seen += c.get("seen", 0) counters.total_new += c.get("new", 0) counters.total_updated += c.get("updated", 0) counters.total_errors += c.get("errors", 0) elapsed = time.time() - t0 log.info( "browse-latest: %s done in %.1fs counters=%s", scraper.sitetag, elapsed, c, ) except Exception as e: counters.total_errors += 1 log.exception("browse-latest: %s failed: %s", scraper.sitetag, e) log.info("browse-latest done: %s", counters) return counters