Improve mods

1. Avoid uppercasing the letter that follows a possible abbreviation
2. Avoid removing double punctuation in Spanish subtitles
pull/2292/head
Vitiko 6 months ago
parent 386ac22631
commit b36b3782d7

@ -13,6 +13,7 @@ from tld import get_tld
ENGLISH = Language("eng")
SPANISH = (Language("spa"), Language("spa", "MX"))
class CommonFixes(SubtitleTextModification):
@ -105,12 +106,16 @@ class CommonFixes(SubtitleTextModification):
# uppercase after dot
NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()) if len(match.group(1)) > 4 else r"%s%s" % (match.group(1), match.group(2)),
name="CM_uppercase_after_dot"),
# remove double interpunction
NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"),
name="CM_double_interpunct",
# Double interpunction (e.g. combined question and exclamation marks) is valid in Spanish
# https://www.rae.es/duda-linguistica/es-correcto-combinar-los-signos-de-interrogacion-y-exclamacion
supported=lambda p: p.language not in SPANISH),
# remove spaces before punctuation; don't break spaced ellipses
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),

Loading…
Cancel
Save