goon/app/scheduler/browse_latest.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

70 lines
2.6 KiB
Python

"""Browse-latest strategy — scrap newest scenes from rich-metadata tubes.
Pattern (vs. performer-driven):
- performer-driven: backfills scenes for already-known performers
  (TPDB/StashDB → tube). Requires the performer to already exist in the DB.
- browse-latest: forward-fills with fresh scenes from the tubes (~100 newest /
  tube / day). Catches scenes whose performer may be new to us — a later
  canonical ingest fills in the metadata.
Every `BaseBrowseScraper.latest_scenes(max_pages=5)` yields RawScene objects with
rich metadata (studio + performers + duration + tags + description). The composite
fuzzy matcher in the resolver therefore has good signals for a canonical match
(vs. orphan-only tubes such as pornditt, which offered only a title + a short
description).
Scheduled by `jobs.py` once a day (`sched_browse_latest_hours=24`).
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
from app.models.source import SourceKind
from app.scheduler.performer_driven import _ingest_iter_into_run
log = logging.getLogger(__name__)
@dataclass
class BrowseCounters:
    """Aggregate totals for one browse-latest run across all scrapers.

    Every field starts at zero and is incremented by ``run_browse_latest``.
    """

    # Scrapers whose ingest run finished without raising.
    scrapers_run: int = 0
    # Sum of the per-scraper "seen" counters.
    total_seen: int = 0
    # Sum of the per-scraper "new" counters.
    total_new: int = 0
    # Sum of the per-scraper "updated" counters.
    total_updated: int = 0
    # Sum of the per-scraper "errors" counters, plus one for each scraper
    # whose run raised an exception.
    total_errors: int = 0
def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters:
    """Run every registered browse scraper and ingest its latest scenes.

    Each class in ``ALL_BROWSE_SCRAPERS`` is instantiated and a factory for its
    ``latest_scenes(max_pages=...)`` iterator is handed to
    ``_ingest_iter_into_run``. Scrapers are processed one after another and
    independently of each other: any exception raised for one scraper —
    including while constructing it — is logged, counted as a single error,
    and the loop continues with the next scraper.

    Args:
        max_pages: Value forwarded as ``max_pages`` to each scraper's
            ``latest_scenes``.

    Returns:
        A ``BrowseCounters`` with the totals accumulated over all scrapers.
    """
    counters = BrowseCounters()
    for scraper_cls in ALL_BROWSE_SCRAPERS:
        # Fallback log label, used if the scraper cannot even be constructed
        # and therefore has no ``sitetag`` to report.
        label = getattr(scraper_cls, "__name__", repr(scraper_cls))
        # Monotonic clock: the elapsed time must not be skewed by wall-clock
        # adjustments (NTP, DST) during a long scrape.
        started = time.monotonic()
        try:
            # Instantiate inside the try so that one broken scraper does not
            # abort the whole run.
            scraper = scraper_cls()
            label = scraper.sitetag
            log.info("browse-latest: %s starting (max_pages=%d)", label, max_pages)
            # NOTE(review): every scraper is recorded under the same
            # source_name "pornapp"; the site is only distinguishable through
            # run_label — confirm this is intended.
            c = _ingest_iter_into_run(
                source_kind=SourceKind.scraper,
                source_name="pornapp",
                run_label=f"browse-latest:{label}",
                # Bind the current scraper/max_pages as defaults so the
                # factory does not depend on the loop variables later.
                iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
                    max_pages=mp
                ),
            )
            # The result is read as a mapping; missing keys count as zero.
            counters.scrapers_run += 1
            counters.total_seen += c.get("seen", 0)
            counters.total_new += c.get("new", 0)
            counters.total_updated += c.get("updated", 0)
            counters.total_errors += c.get("errors", 0)
            elapsed = time.monotonic() - started
            log.info(
                "browse-latest: %s done in %.1fs counters=%s",
                label, elapsed, c,
            )
        except Exception as e:
            # Per-scraper isolation boundary: record the failure and move on.
            counters.total_errors += 1
            log.exception("browse-latest: %s failed: %s", label, e)
    log.info("browse-latest done: %s", counters)
    return counters