We ingested only ~3% of each browse tube's catalog (porndoe >62k scenes; we had 1959) because tubes were hit only by performer-search + top-N browse. Pilot (porndoe pages 64-110): 1119 new scenes, 100% playable + 100% tagged, 0% canonical overlap (purely additive — content not in TPDB/StashDB). - app/scheduler/deep_crawl.py: round-robin over ALL_BROWSE_SCRAPERS, per-tube page cursor in app/_state/deepcrawl_state.json (no DB migration), deep-paginate from the cursor, idempotent (resolver skips known by raw_hash), mark 'exhausted' at catalog end then reset cursors for an incremental re-sweep. - _job_deep_crawl: hourly, 60 pages/run (~1860 scenes, ~22 min), wrapped in the 1h hard-timeout; registered in build_scheduler (jobs=10). - config: sched_deep_crawl_hours=1, deep_crawl_pages_per_run=60, deepcrawl_state_path. - scripts/pilot_porndoe_deepcrawl.py: one-off pilot used to validate the approach. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
"""Pilot (Phase 1) — deep-crawl porndoe beyond the newest pages to measure the VALUE
of a full tube crawl (vs. the current search + top-N). We have ~3% of the porndoe
catalog (1959/62k+).

Crawls pages START..END (default 64+, i.e. the tail we do not have yet) and runs
each scene through the regular `_process_scene` (resolver: canonical match / orphan
+ tags + duration). Measures counters. Does NOT modify production jobs — this is an
ad-hoc measurement.

Usage: python scripts/pilot_porndoe_deepcrawl.py --start 64 --end 110
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
import time
|
|
|
|
from app.connectors.direct_scrapers.porndoe import PornDoeScraper
|
|
from app.db import session_scope
|
|
from app.extractors import browser_get
|
|
from app.ingest import _process_scene, get_or_create_source
|
|
from app.models.source import SourceKind
|
|
|
|
# Root logger emits WARNING and above only; progress is reported via print() in main().
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("pilot_porndoe")
|
|
|
|
|
|
def main() -> int:
    """Deep-crawl a range of porndoe listing pages and report ingest counters.

    Parses ``--start`` / ``--end`` (inclusive page range, defaults 64 and 110)
    from the command line. For every page it fetches the listing, extracts the
    scene URLs, fetches and parses each scene's detail page, and hands the
    parsed scene to ``_process_scene``. Running counters are printed after each
    page and once more when the crawl finishes.

    Failure handling:
        * A listing page that fails to fetch is logged and skipped.
        * The first listing page that yields no scene URLs stops the crawl.
        * A scene whose fetch/parse or processing raises is counted under
          ``"errors"`` and logged; it never aborts the run.
        * A scene for which ``_parse_detail`` returns ``None`` is skipped
          without touching any counter.

    Returns:
        Always ``0``; the caller passes it to ``sys.exit``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--start", type=int, default=64)
    ap.add_argument("--end", type=int, default=110)
    args = ap.parse_args()

    s = PornDoeScraper()
    with session_scope() as session:
        # NOTE(review): the source row is named "pornapp" although the scraper is
        # porndoe — presumably one shared scraper source; confirm this is intended.
        src = get_or_create_source(session, kind=SourceKind.scraper, name="pornapp")
        # Copy the id to a plain value so it can be used after the session scope ends.
        source_id = src.id

    counters = {"seen": 0, "new": 0, "updated": 0, "skipped": 0, "errors": 0}
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    t0 = time.monotonic()
    for page in range(args.start, args.end + 1):
        try:
            res = browser_get(s._listing_url(page), timeout=30)
            # browser_get may hand back a response-like object or the body itself.
            html = res.text if hasattr(res, "text") else res
        except Exception as e:
            log.warning("listing page %d failed: %s", page, e)
            continue
        urls = s._extract_scene_urls(html)
        if not urls:
            # An empty listing is treated as the end of the catalog.
            print(f"empty listing page {page}, stop")
            break
        for u in urls:
            try:
                r = browser_get(u, timeout=30)
                dh = r.text if hasattr(r, "text") else r
                raw = s._parse_detail(u, dh)
            except Exception as e:
                counters["errors"] += 1
                # Log the cause so a non-zero "errors" count can be diagnosed.
                log.warning("scene %s fetch/parse failed: %s", u, e)
                continue
            if raw is None:
                continue
            counters["seen"] += 1
            try:
                _process_scene(source_id=source_id, raw_scene=raw, counters=counters)
            except Exception as e:
                counters["errors"] += 1
                log.warning("scene %s processing failed: %s", u, e)
        print(f"page {page}: {counters} ({time.monotonic() - t0:.0f}s)", flush=True)

    print(f"PILOT DONE pages {args.start}-{args.end}: {counters} elapsed={time.monotonic() - t0:.0f}s")
    return 0
|
|
|
|
|
|
# Script entry point: run the pilot and use main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())
|