goon/app/resolve/studio_title_parser.py
goon-foss ad0284585b Initial commit
Goon — self-hosted aggregator for adult-content scene metadata.

Indexes scenes from TPDB, StashDB, and 30+ public adult tube sites.
Cross-source deduplication via perceptual hash + Levenshtein distance.
FastAPI backend + APScheduler worker + React Native (Expo) mobile client.

FOSS, ad-free, donation-funded. See README for details.
2026-05-20 10:10:22 +02:00

141 lines
6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Studio extraction z tube tytułów typu aggregator-WordPress.
Format obserwowany w istniejących scenach (zaingestowanych przez porn-app legacy):
- `[StudioCamelCase] Performer1, Performer2 (Title)` — porndish, xmoviesforyou
- `[StudioCamelCase] Performer (Title / date)` — porndish, xmoviesforyou z datą
- `Studio – Performer1 – Title` — watchporn, hdporn92 (en/em-dash separator)
- `Studio – Performer1 & Performer2 – Title – S24:E3` — watchporn series
Powód: te tube'y aggregator-WordPress mają `studio_name` ustawione na nazwę source'a
(`PornDish`, `Watch.Porn`) zamiast prawdziwego studio (`OpenFamily`, `TouchMyWife`).
Resolver path 4 composite scoring blokuje po `studio_id + release_date` — z błędnym
studio_id znajduje 0 kandydatów → fallback na performer-only blocking → bez strong
signal (duration/fp/date) score capuje 0.85 < 0.92 threshold → orphan.
Po retro-fix studio_id na prawdziwe canonical studio, path 4 znajduje kandydatów
i performer + duration + title fuzzy może auto-merge.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
# Bracket format: `[Studio] Performers (Title)` or `[Studio] Performers (Title / Date)`.
# The studio must start with an alphanumeric character and be 2-41 characters
# long in total. Hyphens are tolerated inside the studio name
# (e.g. `[Passion-HD]`, `[PervMom-Squirts]`), as are dots.
# NOTE(review): the original comment said "no whitespace", but the character
# class below also accepts spaces inside the brackets — confirm which is intended.
_BRACKET_RE = re.compile(
    r'^\[(?P<studio>[A-Za-z0-9][A-Za-z0-9 \-\.]{1,40})\]\s+(?P<rest>.+?)$'
)
# Em/en/regular dash format: `Studio – Performers – Title [– Episode]`.
# The studio is the segment before the first dash separator (the studio group
# is matched lazily). It must start with a letter and be 3-36 characters long.
# A separator is one or more en-dash (–), em-dash (—) or regular hyphen (-)
# characters with whitespace on both sides. The minimum studio length of three
# characters is meant to avoid `S24:E3`-style false positives.
_DASH_RE = re.compile(
    r'^(?P<studio>[A-Za-z][A-Za-z0-9 \-\.]{2,35}?)\s+[–—\-]+\s+(?P<rest>.+?)$'
)
# Titles that are ONLY a slug concatenation (lowercase letters, digits and
# spaces, no structure) — these are not parsed.
_LOWERCASE_RE = re.compile(r'^[a-z0-9 ]+$')
@dataclass
class ParsedTitle:
    """Outcome of parsing one scene title.

    When no known title format is recognised, `studio` is None and `format`
    is 'none'.
    """

    studio: str | None  # raw studio name as extracted (e.g. "OpenFamily")
    title_remainder: str  # rest of title after studio extracted
    format: str  # 'bracket' | 'dash' | 'none'
    release_date: date | None = None  # parsed from `(... / MM.DD.YYYY)` or `MM/DD/YYYY`
    clean_title: str | None = None  # scene title without the studio and without the date suffix
# Date patterns w `[Studio] X (Title / MM.DD.YYYY)` lub similar.
# Porndish & xmoviesforyou używają `MM.DD.YYYY` (american). Czasem widać też
# `MM/DD/YYYY`. Hyphen w roku nigdy.
_DATE_RE = re.compile(
r'(?P<m>0[1-9]|1[0-2])[\./](?P<d>0[1-9]|[12][0-9]|3[01])[\./](?P<y>20\d{2})'
)
def _parse_date_from_tail(remainder: str) -> tuple[date | None, str]:
"""Wyciągnij datę z `(Title / MM.DD.YYYY)` lub `(Title / MM/DD/YYYY)`.
Returns (parsed_date | None, remainder_without_date_suffix).
"""
m = _DATE_RE.search(remainder)
if not m:
return None, remainder
try:
d = date(int(m.group("y")), int(m.group("m")), int(m.group("d")))
except (ValueError, TypeError):
return None, remainder
# Drop everything from `/` lub `(` lub ` MM.DD` boundary up to date
cut_pos = m.start()
# Walk back through separators ` / ` lub `(` before date
while cut_pos > 0 and remainder[cut_pos - 1] in " /([":
cut_pos -= 1
cleaned = (remainder[:cut_pos] + remainder[m.end():]).strip(" ()/-,")
return d, cleaned
def _unparsed(text: str) -> ParsedTitle:
    """Build the "no recognised format" result, echoing *text* back."""
    return ParsedTitle(studio=None, title_remainder=text, format="none")


def _from_bracket_match(found: re.Match[str]) -> ParsedTitle:
    """Build the result for a `[Studio] Performers (Title[ / Date])` title."""
    studio_name = found.group("studio").strip()
    remainder = found.group("rest").strip()
    released, undated = _parse_date_from_tail(remainder)
    # Prefer the content of a trailing parenthesised group as the scene
    # title. Without one, fall back to the whole date-stripped remainder.
    parenthesised = re.search(r'\(([^()]+)\)\s*$', undated)
    if parenthesised is None:
        scene_title = undated
    else:
        scene_title = parenthesised.group(1).strip()
    return ParsedTitle(
        studio=studio_name,
        title_remainder=remainder,
        format="bracket",
        release_date=released,
        clean_title=scene_title or None,
    )


def _from_dash_match(found: re.Match[str]) -> ParsedTitle:
    """Build the result for a `Studio – Performers – Title` style title."""
    remainder = found.group("rest").strip()
    # The scene title is the LAST dash-separated segment; anything before it
    # is treated as performers. `re.split` always yields at least one item.
    # NOTE(review): if the title ends with a dash-separated episode marker,
    # that marker is what lands in clean_title — confirm this is intended.
    segments = re.split(r'\s+[–—\-]+\s+', remainder)
    return ParsedTitle(
        studio=found.group("studio").strip(),
        title_remainder=remainder,
        format="dash",
        clean_title=segments[-1],
    )


def parse_title(title: str) -> ParsedTitle:
    """Extract the studio from a scene title.

    The title is stripped of surrounding whitespace, then tried against the
    bracket format first and the dash format second. Titles made up only of
    lowercase letters, digits and spaces are treated as slug concatenations
    and are not parsed. A dash-format studio that is a known non-studio
    prefix (SEO words such as "NEW") is rejected.

    Returns:
        A `ParsedTitle`. Its `studio` is None (and `format` is 'none') when
        no format was recognised — the caller should then skip the studio
        fix for that scene.
    """
    if not title:
        return _unparsed("")

    text = title.strip()

    # Lowercase-only slug concatenation: no structure worth parsing.
    if _LOWERCASE_RE.match(text):
        return _unparsed(text)

    bracket_hit = _BRACKET_RE.match(text)
    if bracket_hit is not None:
        return _from_bracket_match(bracket_hit)

    dash_hit = _DASH_RE.match(text)
    if dash_hit is None:
        return _unparsed(text)

    # Common SEO prefixes would otherwise produce false-positive studios.
    if dash_hit.group("studio").strip().lower() in _NON_STUDIO_PREFIXES:
        return _unparsed(text)

    return _from_dash_match(dash_hit)
# Words that show up as the first token of a title but are NOT studio names —
# typically an SEO booster prefix or a quality/category/label word.
# Entries are lowercase; the lookup compares against the lowercased studio capture.
_NON_STUDIO_PREFIXES = frozenset({
    "new", "hd", "4k", "vr", "free", "watch", "porn", "video", "full",
    "anal", "best", "exclusive", "amateur", "homemade", "pov", "milf", "teen",
    "bbw", "bdsm", "interracial", "lesbian", "threesome", "gangbang",
})