Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
70 lines
2.6 KiB
Python
70 lines
2.6 KiB
Python
"""Browse-latest strategy — scrape the newest scenes from rich-metadata tubes.

Pattern (vs performer-driven):
- performer-driven: backfill of scenes for known performers (TPDB/StashDB → tube).
  Requires the performer to already be known in the DB.
- browse-latest: forward-fill with fresh scenes from the tubes (~100 newest /
  tube / day). Catches scenes whose performer may be new to us — the canonical
  ingest fills in the metadata later.

Each `BaseBrowseScraper.latest_scenes(max_pages=5)` yields RawScene objects with
rich metadata (studio + performers + duration + tags + description). The
composite fuzzy matching in the resolver therefore has good signals for a
canonical match (vs. orphan-only tubes such as pornditt, where there was only a
title + short description).

Scheduled by `jobs.py` once a day (`sched_browse_latest_hours=24`).
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass
|
|
|
|
from app.connectors.direct_scrapers import ALL_BROWSE_SCRAPERS
|
|
from app.models.source import SourceKind
|
|
from app.scheduler.performer_driven import _ingest_iter_into_run
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class BrowseCounters:
    """Running totals accumulated across one browse-latest run.

    All fields start at zero and are incremented in place by
    ``run_browse_latest`` as each scraper finishes.
    """

    # Scrapers whose ingest call returned without raising.
    scrapers_run: int = 0
    # Sum of the per-scraper "seen" counts.
    total_seen: int = 0
    # Sum of the per-scraper "new" counts.
    total_new: int = 0
    # Sum of the per-scraper "updated" counts.
    total_updated: int = 0
    # Sum of the per-scraper "errors" counts, plus one per scraper that raised.
    total_errors: int = 0
|
|
|
|
|
|
def run_browse_latest(*, max_pages: int = 5) -> BrowseCounters:
    """Run every registered browse scraper and ingest its latest pages.

    Each class in ``ALL_BROWSE_SCRAPERS`` is instantiated and its
    ``latest_scenes(max_pages=...)`` iterator is handed to
    ``_ingest_iter_into_run``. The mapping returned by that call is read for
    the keys ``"seen"``, ``"new"``, ``"updated"`` and ``"errors"`` (missing
    keys count as 0) and folded into the aggregate counters.

    A failure in one scraper — including a failure to construct it — is
    logged with a traceback, counted as one error, and does not stop the
    remaining scrapers from running.

    Args:
        max_pages: Number of listing pages requested from each scraper.

    Returns:
        The aggregated ``BrowseCounters`` for the whole run.
    """
    counters = BrowseCounters()
    for scraper_cls in ALL_BROWSE_SCRAPERS:
        # Fallback label so a scraper that cannot be constructed is still
        # identifiable in the failure log.
        label = getattr(scraper_cls, "__name__", repr(scraper_cls))
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments during a long scrape.
        started = time.monotonic()
        try:
            # Instantiation lives inside the try so that one broken scraper
            # cannot abort the whole run.
            scraper = scraper_cls()
            label = scraper.sitetag
            log.info("browse-latest: %s starting (max_pages=%d)", label, max_pages)
            # NOTE(review): source_name is the same fixed value for every
            # scraper; only run_label carries the site tag — confirm intended.
            c = _ingest_iter_into_run(
                source_kind=SourceKind.scraper,
                source_name="pornapp",
                run_label=f"browse-latest:{label}",
                # Defaults bind the current scraper/max_pages at definition
                # time, avoiding late-binding surprises with the loop variable.
                iterator_factory=lambda s=scraper, mp=max_pages: s.latest_scenes(
                    max_pages=mp
                ),
            )
            counters.scrapers_run += 1
            counters.total_seen += c.get("seen", 0)
            counters.total_new += c.get("new", 0)
            counters.total_updated += c.get("updated", 0)
            counters.total_errors += c.get("errors", 0)
            elapsed = time.monotonic() - started
            log.info(
                "browse-latest: %s done in %.1fs counters=%s",
                label, elapsed, c,
            )
        except Exception as e:
            # Top-level boundary for a single scraper: record and move on.
            counters.total_errors += 1
            log.exception("browse-latest: %s failed: %s", label, e)

    log.info("browse-latest done: %s", counters)
    return counters
|