# coding=utf-8 from __future__ import absolute_import from __future__ import unicode_literals import re from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, TAG from subzero.modification.exc import EmptyEntryError from subzero.modification.processors.re_processor import NReProcessor from subzero.modification import registry class FullBracketEntryProcessor(NReProcessor): def process(self, content, debug=False, **kwargs): entry = kwargs.get("entry") if entry: rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs) if not rep_content.strip(): raise EmptyEntryError() return content class HearingImpaired(SubtitleTextModification): identifier = "remove_HI" description = "Remove Hearing Impaired tags" exclusive = True order = 20 long_description = "Removes tags, text and characters from subtitles that are meant for hearing impaired people" processors = [ # full bracket entry, single or multiline; starting with brackets and ending with brackets FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}), "", name="HI_brackets_full"), # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence, # possibly with a dash in front; ignore anything ending with a quote NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])' r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "", name="HI_before_colon_caps"), # any text before colon (at least 3 chars); at start or after a sentence, # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if # a space is inside the text; ignore anything ending with a quote NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])' r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9]|//)'), lambda match: match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0) else "" if not match.group(1).startswith(" ") else " ", name="HI_before_colon_noncaps"), # brackets (only remove if at least 3 chars in brackets) NReProcessor(re.compile(r'(?sux)-?%(t)s["\']*[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\']*[\s:]*%(t)s' % {"t": TAG}), "", name="HI_brackets"), #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}), # "", name="HI_bracket_open_start"), #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "", # name="HI_bracket_open_end"), # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any) # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"), # starting text before colon (at least 3 chars) #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "", # name="HI_before_colon"), # text in brackets at start, after optional dash, before colon or at end of line # fixme: may be too aggressive #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "", # name="HI_brackets_special"), # all caps line (at least 4 consecutive uppercase chars) NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps", supported=lambda p: not p.only_uppercase), # remove MAN: NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"), # dash in front # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"), # all caps at start before new sentence NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1", name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase), ] post_processors = empty_line_post_processors last_processors = [ # remove music symbols NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}), "", name="HI_music_symbols_only"), # remove music entries NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[*#¶♫♪]+\s*.+|.+\s*[*#¶♫♪]+\s*$|.+\s*[*#¶♫♪]+[)\]])'), "", name="HI_music", entry=True), ] registry.register(HearingImpaired)