fix(movies): paradisehill delta date-granularity + browse cadence docs
- paradisehill.fetch_movies compared release_date coerced to midnight against the `since` timestamp, so the chronological crawl stopped at the first upload dated the same calendar day as `since` and silently dropped most new movies (0-2 seen per run; Movies tab stalled). Compare by DATE with a 1-day grace instead; idempotent external_records upsert dedups the re-fetched recent window.
- scripts/backfill_paradisehill_movies.py: one-off no-delta deep crawl to recover the backlog missed during the bug (idempotent, resumable).
- docs: correct stale 'raz dziennie/24h' browse-latest comments to 6h (4x/day), the actual configured cadence (config.py sched_browse_latest_hours=6).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
da7fcda132
commit
cd12348782
7 changed files with 64 additions and 21 deletions
|
|
@ -128,7 +128,7 @@ ALL_DIRECT_SCRAPERS: list[type[BaseDirectTubeScraper]] = [
|
||||||
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
# Browse-mode scrapers — iterują `latest-vids` listing zamiast search-by-performer.
|
||||||
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
|
# Phash thumbnail fingerprint (waga 0.40 w composite scoring) auto-mergeuje do
|
||||||
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
|
# canonical (TPDB/StashDB) gdy tube hot-linkuje studio thumbnail. Schedulowane
|
||||||
# raz dziennie, pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
|
# co 6h (4×/dobę), pages 1-5. Patrz `_browse_base.BaseBrowseScraper` +
|
||||||
# `app/scheduler/browse_latest.py`.
|
# `app/scheduler/browse_latest.py`.
|
||||||
#
|
#
|
||||||
# **Pilot results (2026-05-12):**
|
# **Pilot results (2026-05-12):**
|
||||||
|
|
|
||||||
|
|
@ -69,7 +69,7 @@ class BaseBrowseScraper(BaseDirectTubeScraper, abc.ABC):
|
||||||
"""Iteruje sceny od najnowszych: page 1..max_pages × N scen/page.
|
"""Iteruje sceny od najnowszych: page 1..max_pages × N scen/page.
|
||||||
|
|
||||||
Domyślnie max_pages=5 → ~100 scen per tube per run (shyfap, freshporno
|
Domyślnie max_pages=5 → ~100 scen per tube per run (shyfap, freshporno
|
||||||
~20 scen/page). Schedulowane raz dziennie → catch-up po 24h przerwie.
|
~20 scen/page). Schedulowane co 6h (4×/dobę) → catch-up po przerwie.
|
||||||
|
|
||||||
Dedup po external_id zachodzi w resolverze (path 1 same_source) — gdy
|
Dedup po external_id zachodzi w resolverze (path 1 same_source) — gdy
|
||||||
scena już była, update last_seen + skip. Więc bezpieczne nawet gdy te
|
scena już była, update last_seen + skip. Więc bezpieczne nawet gdy te
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ from __future__ import annotations
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
from datetime import UTC, date, datetime
|
from datetime import UTC, date, datetime, timedelta
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
@ -179,15 +179,18 @@ class ParadisehillConnector(BaseMovieConnector):
|
||||||
if movie is None:
|
if movie is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# `since` filter — datePublished poniżej threshold = stop crawla,
|
# `since` filter — datePublished (= data uploadu na paradisehill) poniżej
|
||||||
# bo listing jest chronologiczny. since z `_last_successful_finished_at`
|
# progu = stop crawla (listing chronologiczny).
|
||||||
# jest TZ-aware (UTC); combine() daje naive — przywróć UTC tzinfo żeby
|
#
|
||||||
# porównanie nie crashowało.
|
# UWAGA: release_date to DATA (bez godziny). Wcześniej combine()→00:00
|
||||||
|
# porównywane z TIMESTAMPEM `since` ucinało crawl na PIERWSZYM filmie z dnia
|
||||||
|
# == since (midnight < since o dowolnej porze dnia) → uploady tego samego dnia
|
||||||
|
# systematycznie ginęły, bo movie-ingest jest dzienny (seen=0-2/run mimo
|
||||||
|
# świeżych filmów na froncie strony — bug-report 2026-06-01 "Movies stoją").
|
||||||
|
# Fix: porównuj po DACIE z 1-dniowym grace; ponowny fetch świeżych jest tani
|
||||||
|
# (external_records upsert pomija niezmieniony hash).
|
||||||
if since is not None and movie.release_date is not None:
|
if since is not None and movie.release_date is not None:
|
||||||
rd_dt = datetime.combine(
|
if movie.release_date < (since - timedelta(days=1)).date():
|
||||||
movie.release_date, datetime.min.time(), tzinfo=UTC
|
|
||||||
)
|
|
||||||
if rd_dt < since:
|
|
||||||
log.info(
|
log.info(
|
||||||
"paradisehill: hit since boundary at %s (%s), stop",
|
"paradisehill: hit since boundary at %s (%s), stop",
|
||||||
mid, movie.release_date,
|
mid, movie.release_date,
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ metadata (studio + performers + duration + tags + description). Composite fuzzy
|
||||||
w resolverze ma więc dobre sygnały dla canonical match (vs orphan-only tubes
|
w resolverze ma więc dobre sygnały dla canonical match (vs orphan-only tubes
|
||||||
typu pornditt, gdzie był sam title + krótki opis).
|
typu pornditt, gdzie był sam title + krótki opis).
|
||||||
|
|
||||||
Schedulowane przez `jobs.py` raz dziennie (`sched_browse_latest_hours=24`).
|
Schedulowane przez `jobs.py` co 6h (4×/dobę) (`sched_browse_latest_hours=6`).
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -308,10 +308,11 @@ DEFAULT_CONFIG: dict[str, Any] = {
|
||||||
"stashdb_hours": 6,
|
"stashdb_hours": 6,
|
||||||
"performer_driven_hours": 12,
|
"performer_driven_hours": 12,
|
||||||
"performer_driven_top_n": 20,
|
"performer_driven_top_n": 20,
|
||||||
# Browse-latest — newest scenes z rich-metadata tubes (shyfap, ...). Raz dziennie
|
# Browse-latest — newest scenes z rich-metadata tubes. Co 6h (4×/dobę) × ~100
|
||||||
# × ~100 scen/tube/run = drobny budżet, łapie świeże sceny których performera jeszcze
|
# scen/tube/run łapie świeże sceny których performera jeszcze nie znamy (newcomerki
|
||||||
# nie znamy (newcomerki → canonical ingest dorobi potem).
|
# → canonical ingest dorobi potem). NB: ten DEFAULT_CONFIG jest poglądowy — realnie
|
||||||
"browse_latest_hours": 24,
|
# run_forever() bierze interwały z Settings (config.py: sched_browse_latest_hours=6).
|
||||||
|
"browse_latest_hours": 6,
|
||||||
"browse_latest_max_pages": 5,
|
"browse_latest_max_pages": 5,
|
||||||
# Movies — paradisehill + dooplay mirrory. Raz dziennie wystarczy (sites rosną
|
# Movies — paradisehill + dooplay mirrory. Raz dziennie wystarczy (sites rosną
|
||||||
# wolniej niż tube'y). Najważniejsze: mirrory dorzucają native-friendly playback
|
# wolniej niż tube'y). Najważniejsze: mirrory dorzucają native-friendly playback
|
||||||
|
|
|
||||||
|
|
@ -195,11 +195,11 @@ def run_forever() -> int:
|
||||||
settings, "sched_performer_continuous_refresh_days", 30
|
settings, "sched_performer_continuous_refresh_days", 30
|
||||||
),
|
),
|
||||||
"movie_ingest_hours": getattr(settings, "sched_movie_ingest_hours", 24) or None,
|
"movie_ingest_hours": getattr(settings, "sched_movie_ingest_hours", 24) or None,
|
||||||
# Browse-latest scheduler — freshporno/porn00/pornxp browse newest scenes raz
|
# Browse-latest scheduler — browse newest scenes co 6h (4×/dobę) (~100 scen/tube/run).
|
||||||
# dziennie (~100 scen/tube/run). Bug: brak tego klucza w worker config przez
|
# Bug: brak tego klucza w worker config przez ~tydzień powodował że browse-mode nigdy
|
||||||
# ~tydzień powodował że browse-mode nigdy nie odpalał (15k freshporno z 2026-05-13
|
# nie odpalał (15k freshporno z 2026-05-13 to bulk import jednorazowy). Bug-report
|
||||||
# to bulk import jednorazowy). Bug-report 93d3c485 (2026-05-19) "brak freshporno".
|
# 93d3c485 (2026-05-19) "brak freshporno".
|
||||||
"browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 24) or None,
|
"browse_latest_hours": getattr(settings, "sched_browse_latest_hours", 6) or None,
|
||||||
"browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5),
|
"browse_latest_max_pages": getattr(settings, "sched_browse_latest_max_pages", 5),
|
||||||
# Bulk-dedup performers — safety net dla duplikatów które resolver
|
# Bulk-dedup performers — safety net dla duplikatów które resolver
|
||||||
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
|
# pominął (np. freshporno scen przed fixem release_date). Run 12h.
|
||||||
|
|
|
||||||
39
scripts/backfill_paradisehill_movies.py
Normal file
39
scripts/backfill_paradisehill_movies.py
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
"""One-off: głęboki crawl paradisehill (no-delta) do odzyskania backlogu filmów
|
||||||
|
przegapionych w okresie buga delta (release_date DATA vs timestamp `since`, fix
|
||||||
|
2026-06-01). Idempotentny — znane filmy pomija przez external_records hash, więc
|
||||||
|
można puścić wielokrotnie / przerwać i wznowić bez duplikatów.
|
||||||
|
|
||||||
|
Użycie:
|
||||||
|
python scripts/backfill_paradisehill_movies.py --limit 1500
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from app.connectors import get_movie_connectors
|
||||||
|
from app.ingest import ingest_movies_from_connector
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||||
|
log = logging.getLogger("backfill_ph_movies")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run the one-off, no-delta paradisehill movie crawl.

    Reads ``--limit`` from the command line, looks up the ``paradisehill``
    entry among the registered movie connectors, calls that entry to build
    the connector and hands it to ``ingest_movies_from_connector`` with
    ``use_delta=False``.

    Returns:
        ``1`` when no ``paradisehill`` connector is registered, otherwise
        ``0`` after the ingest call has returned and its counters were logged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--limit",
        type=int,
        default=1500,
        help="Ile najnowszych filmów przejrzeć (default 1500)",
    )
    cli = parser.parse_args()

    # Materialise the registry as a dict so the lookup and the error listing
    # below both work off the same snapshot.
    registry = dict(get_movie_connectors())
    connector_factory = registry.get("paradisehill")
    if connector_factory is None:
        log.error(
            "paradisehill connector not registered (available: %s)",
            list(registry),
        )
        return 1

    # use_delta=False requests the no-delta crawl; limit caps how many of the
    # newest movies are looked at.
    counters = ingest_movies_from_connector(
        connector_factory(),
        use_delta=False,
        limit=cli.limit,
    )
    log.info("DONE backfill paradisehill: %s", counters)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Loading…
Add table
Reference in a new issue