# bazarr/libs/subzero/modification/mods/hearing_impaired.py

# coding=utf-8
from __future__ import absolute_import
from __future__ import unicode_literals
import re

from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, TAG
from subzero.modification.exc import EmptyEntryError
from subzero.modification.processors.re_processor import NReProcessor
from subzero.modification import registry


class FullBracketEntryProcessor(NReProcessor):
    """Regex processor that checks the whole subtitle entry instead of one line.

    The parent's ``process`` is run against the full entry text (taken from
    the ``entry`` keyword argument). Its result is only used as a test: the
    ``content`` passed in is never modified by this processor.
    """

    def process(self, content, debug=False, **kwargs):
        """Return ``content`` unchanged, or raise if the entry becomes empty.

        :param content: text currently being processed; returned as-is
        :param debug: forwarded to the parent processor
        :param kwargs: forwarded to the parent processor; ``entry`` (the full
            entry text) is read from here
        :raises EmptyEntryError: when the parent's result for the whole entry
            consists of whitespace only
        """
        whole_entry = kwargs.get("entry")
        if not whole_entry:
            # no entry text supplied (or it is empty): nothing to check
            return content

        remainder = super(FullBracketEntryProcessor, self).process(whole_entry, debug=debug, **kwargs)
        if remainder.strip():
            return content

        # nothing but whitespace is left once the pattern has been applied
        raise EmptyEntryError()


class HearingImpaired(SubtitleTextModification):
    """Subtitle modification that strips hearing-impaired (HI) annotations.

    The mod is a declarative list of regex processors:

    * ``processors`` remove bracketed descriptions, speaker labels in front
      of a colon and all-caps lines.
    * ``post_processors`` reuses the shared ``empty_line_post_processors``.
    * ``last_processors`` remove music symbols and music entries.

    ``%(t)s`` inside a pattern is substituted with ``TAG`` (imported from
    ``subzero.modification.mods``) before compilation.
    """

    identifier = "remove_HI"
    description = "Remove Hearing Impaired tags"
    exclusive = True
    order = 20

    long_description = "Removes tags, text and characters from subtitles that are meant for hearing impaired people"

    # NOTE(review): several character classes below use the ASCII range ``A-z``, which also contains the
    # characters [ \ ] ^ _ ` located between "Z" and "a"; left untouched to keep the established behavior.
    processors = [
        # full bracket entry, single or multiline; starting with brackets and ending with brackets
        FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
                                  "", name="HI_brackets_full"),

        # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
        # possibly with a dash in front; ignore anything ending with a quote
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
                     name="HI_before_colon_caps"),

        # any text before colon (at least 3 chars); at start or after a sentence,
        # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
        # a space is inside the text; ignore anything ending with a quote
        #
        # replacement: group(1) is the whole match, group(2) the label including its colon. The match is kept
        # unchanged when the label contains a space or the match contains a dash; otherwise it is dropped
        # (a single space is kept if the match started with one).
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9]|//)'),
                     lambda match:
                     match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
                     else "" if not match.group(1).startswith(" ") else " ",
                     name="HI_before_colon_noncaps"),

        # brackets (only remove if at least 3 chars in brackets)
        NReProcessor(re.compile(r'(?sux)-?%(t)s["\']*[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\']*[\s:]*%(t)s' %
                                {"t": TAG}), "", name="HI_brackets"),

        #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
        #             "", name="HI_bracket_open_start"),

        #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
        #             name="HI_bracket_open_end"),

        # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
        # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),

        # starting text before colon (at least 3 chars)
        #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
        #             name="HI_before_colon"),


        # text in brackets at start, after optional dash, before colon or at end of line
        # fixme: may be too aggressive
        #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
        #             name="HI_brackets_special"),

        # all caps line (at least 4 consecutive uppercase chars)
        NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
                     supported=lambda p: not p.only_uppercase),

        # remove "MAN:" and "WOMAN:" speaker labels (case-insensitive). The "WO" prefix has to be optional:
        # without the "?" only "WOMAN:" matched, so e.g. "- Man: Hello" kept its label because
        # HI_before_colon_noncaps leaves dash-prefixed matches untouched.
        NReProcessor(re.compile(r'(?suxi)(\b(?:WO)?MAN:\s*)'), "", name="HI_remove_man"),

        # dash in front
        # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),

        # all caps at start before new sentence
        NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                     name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
    ]

    post_processors = empty_line_post_processors
    last_processors = [
        # remove music symbols
        NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
                     "", name="HI_music_symbols_only"),

        # remove music entries
        NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[*#¶♫♪]+\s*.+|.+\s*[*#¶♫♪]+\s*$)'),
                     "", name="HI_music", entry=True),
    ]


# Register the mod class with the subzero modification registry when this module is imported.
registry.register(HearingImpaired)
update deps 6 years ago			`# coding=utf-8`
WIP 5 years ago			`from __future__ import absolute_import`
WIP 5 years ago			`from __future__ import unicode_literals`
update deps 6 years ago			`import re`

merge from Sub-Zero.bundle merge some fixes from panal/Sub-Zero.bundle 4 years ago			`from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, TAG`
			`from subzero.modification.exc import EmptyEntryError`
update deps 6 years ago			`from subzero.modification.processors.re_processor import NReProcessor`
			`from subzero.modification import registry`


			`class FullBracketEntryProcessor(NReProcessor):`
			`def process(self, content, debug=False, **kwargs):`
			`entry = kwargs.get("entry")`
			`if entry:`
			`rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs)`
			`if not rep_content.strip():`
			`raise EmptyEntryError()`
			`return content`


			`class HearingImpaired(SubtitleTextModification):`
			`identifier = "remove_HI"`
			`description = "Remove Hearing Impaired tags"`
			`exclusive = True`
			`order = 20`

			`long_description = "Removes tags, text and characters from subtitles that are meant for hearing impaired people"`

			`processors = [`
			`# full bracket entry, single or multiline; starting with brackets and ending with brackets`
WIP 5 years ago			`FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),`
update deps 6 years ago			`"", name="HI_brackets_full"),`

core: update subliminal_patch to 2.6.4.2917-dev; fix addic7ed, subscene, titlovi; fix SSAStyle parsing in SRT 6 years ago			`# uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,`
			`# possibly with a dash in front; ignore anything ending with a quote`
WIP 5 years ago			`NReProcessor(re.compile(r'(?u)(?:(?<=^)\|(?<=[.\-!?\"\']))([\s\->~](?=[A-ZÀ-Ž&+]\s[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'`
			`r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+\|$))(?![0-9])'), "",`
core: update subliminal_patch to 2.6.4.2917-dev; fix addic7ed, subscene, titlovi; fix SSAStyle parsing in SRT 6 years ago			`name="HI_before_colon_caps"),`

			`# any text before colon (at least 3 chars); at start or after a sentence,`
			`# possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if`
			`# a space is inside the text; ignore anything ending with a quote`
WIP 5 years ago			`NReProcessor(re.compile(r'(?u)(?:(?<=^)\|(?<=[.\-!?\"]))([\s\->~]((?=[A-zÀ-ž&+]\s[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'`
added tld library, fix add space after dot in domain names - added tld library, so "Common Fixes" mod can detect domain names and won't add spaces after each dot in them. - fix HI_before_colon_noncaps, so it won't remove http: from URLs. 4 years ago			`r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9]\|//)'),`
core: update subliminal_patch to 2.6.4.2917-dev; fix addic7ed, subscene, titlovi; fix SSAStyle parsing in SRT 6 years ago			`lambda match:`
			`match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)`
			`else "" if not match.group(1).startswith(" ") else " ",`
			`name="HI_before_colon_noncaps"),`

update deps 6 years ago			`# brackets (only remove if at least 3 chars in brackets)`
added subzero mods support 4 years ago			`NReProcessor(re.compile(r'(?sux)-?%(t)s["\'][([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\'][\s:]*%(t)s' %`
update deps 6 years ago			`{"t": TAG}), "", name="HI_brackets"),`

WIP 5 years ago			`#NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),`
core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`# "", name="HI_bracket_open_start"),`
update deps 6 years ago
WIP 5 years ago			`#NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",`
core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`# name="HI_bracket_open_end"),`
update deps 6 years ago
			`# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)`
			`# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),`

			`# starting text before colon (at least 3 chars)`
WIP 5 years ago			`#NReProcessor(re.compile(r'(?u)(\b\|^)([\s-](?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s)'), "",`
update deps 6 years ago			`# name="HI_before_colon"),`


			`# text in brackets at start, after optional dash, before colon or at end of line`
			`# fixme: may be too aggressive`
WIP 5 years ago			`#NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)\|:\s*))'), "",`
update deps 6 years ago			`# name="HI_brackets_special"),`

			`# all caps line (at least 4 consecutive uppercase chars)`
WIP 5 years ago			`NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",`
update deps 6 years ago			`supported=lambda p: not p.only_uppercase),`

			`# remove MAN:`
WIP 5 years ago			`NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),`
update deps 6 years ago
			`# dash in front`
			`# NReProcessor(re.compile(r'(?u)^\s-\s'), "", name="HI_starting_dash"),`

			`# all caps at start before new sentence`
WIP 5 years ago			`NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",`
update deps 6 years ago			`name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),`
core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`]`
update deps 6 years ago
core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`post_processors = empty_line_post_processors`
			`last_processors = [`
update deps 6 years ago			`# remove music symbols`
WIP 5 years ago			`NReProcessor(re.compile(r'(?u)(^%(t)s[#¶♫♪\s]%(t)s[#¶♫♪\s]+%(t)s[#¶♫♪\s]*%(t)s$)' % {"t": TAG}),`
update deps 6 years ago			`"", name="HI_music_symbols_only"),`

core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`# remove music entries`
added subzero mods support 4 years ago			`NReProcessor(re.compile(r'(?ums)(^[-\s>~][#¶♫♪]+\s.+\|.+\s[#¶♫♪]+\s$)'),`
merge from Sub-Zero.bundle merge some fixes from panal/Sub-Zero.bundle 4 years ago			`"", name="HI_music", entry=True),`
core: update to subliminal_patch:head; support file hashes even when scenename is used 6 years ago			`]`
update deps 6 years ago

			`registry.register(HearingImpaired)`