Goon — self-hosted aggregator for adult-content scene metadata. Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites. Cross-source deduplication via perceptual hash + Levenshtein distance. FastAPI backend + APScheduler worker + React Native (Expo) mobile client. FOSS, ad-free, donation-funded. See README for details.
141 lines
6 KiB
Python
141 lines
6 KiB
Python
"""Studio extraction z tube tytułów typu aggregator-WordPress.

Formats observed in existing scenes (ingested by the legacy porn-app):
- `[StudioCamelCase] Performer1, Performer2 (Title)` — porndish, xmoviesforyou
- `[StudioCamelCase] Performer (Title / date)` — porndish, xmoviesforyou with a date
- `Studio – Performer1 – Title` — watchporn, hdporn92 (en/em-dash separator)
- `Studio – Performer1 & Performer2 – Title – S24:E3` — watchporn series

Rationale: these aggregator-WordPress tubes have `studio_name` set to the name of
the source (`PornDish`, `Watch.Porn`) instead of the real studio (`OpenFamily`,
`TouchMyWife`). Resolver path 4 composite scoring blocks on
`studio_id + release_date` — with a wrong studio_id it finds 0 candidates → falls
back to performer-only blocking → without a strong signal (duration/fp/date) the
score caps at 0.85 < 0.92 threshold → orphan.

After retro-fixing studio_id to the real canonical studio, path 4 finds candidates
and performer + duration + fuzzy title matching can auto-merge.
"""
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
from datetime import date
|
||
|
||
# Bracket format: `[Studio] Performers (Title)` or `[Studio] Performers (Title / Date)`.
# The studio starts with an alphanumeric character followed by 1-40 characters
# drawn from letters, digits, space, hyphen and dot (e.g. `[Passion-HD]`,
# `[PervMom-Squirts]`). At least one whitespace character must follow the `]`;
# everything after it is captured as `rest`.
_BRACKET_RE = re.compile(r'^\[(?P<studio>[A-Za-z0-9][A-Za-z0-9 \-\.]{1,40})\]\s+(?P<rest>.+?)$')
|
||
|
||
# Dash format: `Studio – Performers – Title [– Episode]`. The first segment, up to
# the first separator, is the studio. The separator is a run of en dashes (–),
# em dashes (—) or plain hyphens with whitespace on both sides. The studio must
# start with a letter and be 3-36 characters long (letters, digits, space, hyphen,
# dot); the three-character minimum is meant to avoid `S24:E3`-style false
# positives.
_DASH_RE = re.compile(r'^(?P<studio>[A-Za-z][A-Za-z0-9 \-\.]{2,35}?)\s+[–—\-]+\s+(?P<rest>.+?)$')
|
||
|
||
# Titles that are ONLY a slug concatenation (lowercase letters, digits and spaces,
# no structure) — these are not parsed.
_LOWERCASE_RE = re.compile(
    r'^[a-z0-9 ]+$'
)
|
||
|
||
|
||
@dataclass
class ParsedTitle:
    """Outcome of parsing a single scene title.

    Attributes:
        studio: Raw studio name as extracted (e.g. "OpenFamily"), or None.
        title_remainder: Rest of the title after the studio was extracted.
        format: Which layout matched: 'bracket', 'dash' or 'none'.
        release_date: Date parsed from `(... / MM.DD.YYYY)` or `MM/DD/YYYY`,
            when one was found.
        clean_title: Scene title without the studio and without the date suffix.
    """

    studio: str | None
    title_remainder: str
    format: str
    release_date: date | None = None
    clean_title: str | None = None
|
||
|
||
|
||
# Date patterns found in `[Studio] X (Title / MM.DD.YYYY)` and similar titles.
# Porndish & xmoviesforyou use the American `MM.DD.YYYY` order; `MM/DD/YYYY` shows
# up occasionally. Hyphen-separated dates are never seen.
_DATE_RE = re.compile(
    r'(?P<m>0[1-9]|1[0-2])[\./](?P<d>0[1-9]|[12][0-9]|3[01])[\./](?P<y>20\d{2})'
)


def _parse_date_from_tail(remainder: str) -> tuple[date | None, str]:
    """Pull a release date out of `(Title / MM.DD.YYYY)` or `(Title / MM/DD/YYYY)`.

    The first date-like token found in *remainder* is used. The token and the
    separator in front of it are removed; brackets that enclose other text are
    kept balanced so the caller can still locate `(Title)`. A bracket pair that
    contained nothing but the date is dropped entirely.

    Args:
        remainder: Title text that follows the studio prefix.

    Returns:
        ``(parsed_date, remainder_without_date)``. When no date token is present,
        or the digits do not form a real calendar date (e.g. ``02.30.2023``),
        returns ``(None, remainder)`` with the text unchanged.
    """
    m = _DATE_RE.search(remainder)
    if not m:
        return None, remainder
    try:
        parsed = date(int(m.group("y")), int(m.group("m")), int(m.group("d")))
    except ValueError:
        # Pattern matched but the day does not exist in that month.
        return None, remainder

    # Text before the date, minus the ` / ` separator that introduced it.
    head = remainder[:m.start()].rstrip(" /")
    tail = remainder[m.end():]

    if not head or head[-1] in "([":
        # The date opened the string or a bracket, so a separator that followed
        # it is now dangling.
        tail = tail.lstrip(" /")
        closer = {"(": ")", "[": "]"}.get(head[-1:])
        if closer and tail.startswith(closer):
            # The bracket pair held only the date: remove both halves.
            head = head[:-1].rstrip(" /")
            tail = tail[1:]

    # Trim leftover separators only; parentheses are deliberately preserved so
    # `Performers (Title / date)` comes back as `Performers (Title)`.
    cleaned = (head + tail).strip(" /-,")
    return parsed, cleaned
|
||
|
||
|
||
def parse_title(title: str) -> ParsedTitle:
    """Extract the studio name and a cleaned scene title from a raw scene title.

    Two layouts are recognised, tried in this order:

    * bracket — ``[Studio] Performers (Title)``; the remainder is also passed to
      ``_parse_date_from_tail`` to pick up a release date;
    * dash — ``Studio – Performers – Title [– S24:E3]``.

    Args:
        title: Raw scene title. ``None`` or an empty string is tolerated.

    Returns:
        A ``ParsedTitle``. ``studio`` is ``None`` and ``format`` is ``'none'``
        when no layout was recognised — the caller should then skip the fix for
        that scene.
    """
    if not title:
        return ParsedTitle(studio=None, title_remainder=title or "", format="none")
    title = title.strip()

    # All-lowercase slug concatenations carry no structure worth parsing.
    if _LOWERCASE_RE.match(title):
        return ParsedTitle(studio=None, title_remainder=title, format="none")

    bracket = _BRACKET_RE.match(title)
    if bracket:
        rest = bracket.group("rest").strip()
        rel_date, cleaned_rest = _parse_date_from_tail(rest)
        # Inner title from `Performers (Title)`: take the content of the trailing
        # parentheses when present, otherwise the whole date-stripped remainder.
        inner = re.search(r'\(([^()]+)\)\s*$', cleaned_rest)
        clean_title = inner.group(1).strip() if inner else cleaned_rest
        return ParsedTitle(
            studio=bracket.group("studio").strip(),
            title_remainder=rest,
            format="bracket",
            release_date=rel_date,
            clean_title=clean_title or None,
        )

    dash = _DASH_RE.match(title)
    if dash:
        studio = dash.group("studio").strip()
        # The studio must not be a known non-studio prefix (e.g. "NEW", "HD",
        # "VR"). These are common in SEO titles and would create false-positive
        # studio ids.
        if studio.lower() in _NON_STUDIO_PREFIXES:
            return ParsedTitle(studio=None, title_remainder=title, format="none")
        rest = dash.group("rest").strip()
        # Dash format: `Studio – Performers – Title [– Episode]`. The scene title
        # is the last segment, with performers in the segments before it.
        parts = re.split(r'\s+[–—\-]+\s+', rest)
        # A trailing `S24:E3`-style marker is an episode tag, not the title, so
        # step back one segment when there is one to fall back on.
        if len(parts) > 1 and re.fullmatch(r'S\d+:?E\d+', parts[-1], re.IGNORECASE):
            parts = parts[:-1]
        return ParsedTitle(
            studio=studio,
            title_remainder=rest,
            format="dash",
            clean_title=parts[-1],
        )

    return ParsedTitle(studio=None, title_remainder=title, format="none")
|
||
|
||
|
||
# Words that show up as the first token of a title but are NOT studio names —
# typically an SEO booster prefix or a quality/category/label tag.
_NON_STUDIO_PREFIXES = frozenset((
    # SEO boosters and quality tags
    "new", "hd", "4k", "vr", "free", "watch", "porn", "video", "full",
    "best", "exclusive",
    # category labels
    "anal", "amateur", "homemade", "pov", "milf", "teen",
    "bbw", "bdsm", "interracial", "lesbian", "threesome", "gangbang",
))
|