#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
language and subtitle_language properties
"""
# pylint: disable=no-member
import copy

from collections import defaultdict, namedtuple

import babelfish

from rebulk import Rebulk, Rule, RemoveMatch, RenameMatch
from rebulk.remodule import re

from ..common import seps
from ..common.pattern import is_disabled
from ..common.words import iter_words
from ..common.validators import seps_surround


def language(config, common_words):
    """
    Builder for rebulk object.

    :param config: rule configuration
    :type config: dict
    :param common_words: common words
    :type common_words: set
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    subtitle_both = config['subtitle_affixes']
    subtitle_prefixes = sorted(subtitle_both + config['subtitle_prefixes'], key=length_comparator)
    subtitle_suffixes = sorted(subtitle_both + config['subtitle_suffixes'], key=length_comparator)
    lang_both = config['language_affixes']
    lang_prefixes = sorted(lang_both + config['language_prefixes'], key=length_comparator)
    lang_suffixes = sorted(lang_both + config['language_suffixes'], key=length_comparator)
    weak_affixes = frozenset(config['weak_affixes'])

    rebulk = Rebulk(disabled=lambda context: (is_disabled(context, 'language') and
                                              is_disabled(context, 'subtitle_language')))

    rebulk.string(*subtitle_prefixes, name="subtitle_language.prefix", ignore_case=True, private=True,
                  validator=seps_surround, tags=['release-group-prefix'],
                  disabled=lambda context: is_disabled(context, 'subtitle_language'))
    rebulk.string(*subtitle_suffixes, name="subtitle_language.suffix", ignore_case=True, private=True,
                  validator=seps_surround,
                  disabled=lambda context: is_disabled(context, 'subtitle_language'))
    rebulk.string(*lang_suffixes, name="language.suffix", ignore_case=True, private=True,
                  validator=seps_surround, tags=['source-suffix'],
                  disabled=lambda context: is_disabled(context, 'language'))

    def find_languages(string, context=None):
        """Find languages in the string

        :return: list of tuple (property, Language, lang_word, word)
        """
        return LanguageFinder(context, subtitle_prefixes, subtitle_suffixes,
                              lang_prefixes, lang_suffixes, weak_affixes).find(string)

    rebulk.functional(find_languages,
                      properties={'language': [None]},
                      disabled=lambda context: not context.get('allowed_languages'))
    rebulk.rules(SubtitleExtensionRule,
                 SubtitlePrefixLanguageRule,
                 SubtitleSuffixLanguageRule,
                 RemoveLanguage,
                 RemoveInvalidLanguages(common_words))

    babelfish.language_converters['guessit'] = GuessitConverter(config['synonyms'])

    return rebulk


UNDETERMINED = babelfish.Language('und')
MULTIPLE = babelfish.Language('mul')
NON_SPECIFIC_LANGUAGES = frozenset([UNDETERMINED, MULTIPLE])


class GuessitConverter(babelfish.LanguageReverseConverter):  # pylint: disable=missing-docstring
    _with_country_regexp = re.compile(r'(.*)\((.*)\)')
    _with_country_regexp2 = re.compile(r'(.*)-(.*)')

    def __init__(self, synonyms):
        self.guessit_exceptions = {}
        for code, synlist in synonyms.items():
            if '_' in code:
                (alpha3, country) = code.split('_')
            else:
                (alpha3, country) = (code, None)
            for syn in synlist:
                self.guessit_exceptions[syn.lower()] = (alpha3, country, None)

    @property
    def codes(self):  # pylint: disable=missing-docstring
        return (babelfish.language_converters['alpha3b'].codes |
                babelfish.language_converters['alpha2'].codes |
                babelfish.language_converters['name'].codes |
                babelfish.language_converters['opensubtitles'].codes |
                babelfish.country_converters['name'].codes |
                frozenset(self.guessit_exceptions.keys()))

    def convert(self, alpha3, country=None, script=None):
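        # Build a babelfish Language from its components and return its string form.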
        return str(babelfish.Language(alpha3, country, script))

    def reverse(self, name):  # pylint:disable=arguments-differ
        name = name.lower()
        # exceptions come first, as they need to override a potential match
        # with any of the other guessers
        try:
            return self.guessit_exceptions[name]
        except KeyError:
            pass

        for conv in [babelfish.Language,
                     babelfish.Language.fromalpha3b,
                     babelfish.Language.fromalpha2,
                     babelfish.Language.fromname,
                     babelfish.Language.fromopensubtitles,
                     babelfish.Language.fromietf]:
            try:
                reverse = conv(name)
                return reverse.alpha3, reverse.country, reverse.script
            except (ValueError, babelfish.LanguageReverseError):
                pass

        raise babelfish.LanguageReverseError(name)
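

# Illustrative note (not executed here): once `language()` above has registered this
# converter under the 'guessit' key, reverse lookups become available through babelfish,
# with the configured synonyms taking precedence over the standard converters, e.g.:
#
#     babelfish.Language.fromguessit('fr')    # alpha2 code
#     babelfish.Language.fromguessit('fra')   # alpha3 code
#     babelfish.Language.fromguessit('und')   # undetermined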


def length_comparator(value):
    """
    Return value length.
    """
    return len(value)


_LanguageMatch = namedtuple('_LanguageMatch', ['property_name', 'word', 'lang'])


class LanguageWord(object):
    """
    Extension to the Word namedtuple in order to create compound words.

    E.g.: pt-BR, soft subtitles, custom subs
    """

    def __init__(self, start, end, value, input_string, next_word=None):
        self.start = start
        self.end = end
        self.value = value
        self.input_string = input_string
        self.next_word = next_word

    @property
    def extended_word(self):  # pylint:disable=inconsistent-return-statements
        """
        Return the extended word for this instance, if any.
        """
        if self.next_word:
            separator = self.input_string[self.end:self.next_word.start]
            next_separator = self.input_string[self.next_word.end:self.next_word.end + 1]

            if (separator == '-' and separator != next_separator) or separator in (' ', '.'):
                value = self.input_string[self.start:self.next_word.end].replace('.', ' ')

                return LanguageWord(self.start, self.next_word.end, value, self.input_string,
                                    self.next_word.next_word)

    def __repr__(self):
        return '<({start},{end}): {value}>'.format(start=self.start, end=self.end, value=self.value)


def to_rebulk_match(language_match):
    """
    Convert language match to rebulk Match: start, end, dict
    """
    word = language_match.word
    start = word.start
    end = word.end
    name = language_match.property_name
    if language_match.lang == UNDETERMINED:
        return start, end, {
            'name': name,
            'value': word.value.lower(),
            'formatter': babelfish.Language,
            'tags': ['weak-language']
        }

    return start, end, {
        'name': name,
        'value': language_match.lang
    }


class LanguageFinder(object):
    """
    Helper class to search and return language matches: 'language' and 'subtitle_language' properties
    """

    def __init__(self, context,
                 subtitle_prefixes, subtitle_suffixes,
                 lang_prefixes, lang_suffixes, weak_affixes):
        allowed_languages = context.get('allowed_languages') if context else None
        self.allowed_languages = {l.lower() for l in allowed_languages or []}
        self.weak_affixes = weak_affixes
        self.prefixes_map = {}
        self.suffixes_map = {}

        if not is_disabled(context, 'subtitle_language'):
            self.prefixes_map['subtitle_language'] = subtitle_prefixes
            self.suffixes_map['subtitle_language'] = subtitle_suffixes

        self.prefixes_map['language'] = lang_prefixes
        self.suffixes_map['language'] = lang_suffixes

    def find(self, string):
        """
        Return all matches for language and subtitle_language.

        Undetermined language matches are removed if a regular language is found.
        Multi language matches are removed if there are only undetermined language matches.
        """
        regular_lang_map = defaultdict(set)
        undetermined_map = defaultdict(set)
        multi_map = defaultdict(set)

        for match in self.iter_language_matches(string):
            key = match.property_name
            if match.lang == UNDETERMINED:
                undetermined_map[key].add(match)
            elif match.lang == 'mul':
                multi_map[key].add(match)
            else:
                regular_lang_map[key].add(match)

        for key, values in multi_map.items():
            if key in regular_lang_map or key not in undetermined_map:
                for value in values:
                    yield to_rebulk_match(value)

        for key, values in undetermined_map.items():
            if key not in regular_lang_map:
                for value in values:
                    yield to_rebulk_match(value)

        for values in regular_lang_map.values():
            for value in values:
                yield to_rebulk_match(value)

    def iter_language_matches(self, string):
        """
        Return language matches for the given string.
        """
        candidates = []
        previous = None
        for word in iter_words(string):
            language_word = LanguageWord(start=word.span[0], end=word.span[1], value=word.value, input_string=string)
            if previous:
                previous.next_word = language_word
                candidates.append(previous)
            previous = language_word
        if previous:
            candidates.append(previous)

        for candidate in candidates:
            for match in self.iter_matches_for_candidate(candidate):
                yield match

    def iter_matches_for_candidate(self, language_word):
        """
        Return language matches for the given candidate word.
        """
        tuples = [
            (language_word, language_word.next_word,
             self.prefixes_map,
             lambda string, prefix: string.startswith(prefix),
             lambda string, prefix: string[len(prefix):]),
            (language_word.next_word, language_word,
             self.suffixes_map,
             lambda string, suffix: string.endswith(suffix),
             lambda string, suffix: string[:len(string) - len(suffix)])
        ]

        for word, fallback_word, affixes, is_affix, strip_affix in tuples:
            if not word:
                continue

            match = self.find_match_for_word(word, fallback_word, affixes, is_affix, strip_affix)
            if match:
                yield match

        match = self.find_language_match_for_word(language_word)
        if match:
            yield match

    def find_match_for_word(self, word, fallback_word, affixes, is_affix, strip_affix):  # pylint:disable=inconsistent-return-statements
        """
        Return the language match for the given word and affixes.
        """
        for current_word in (word.extended_word, word):
            if not current_word:
                continue

            word_lang = current_word.value.lower()

            for key, parts in affixes.items():
                for part in parts:
                    if not is_affix(word_lang, part):
                        continue

                    match = None
                    value = strip_affix(word_lang, part)
                    if not value:
                        if fallback_word and (
                                abs(fallback_word.start - word.end) <= 1 or
                                abs(word.start - fallback_word.end) <= 1):
                            match = self.find_language_match_for_word(fallback_word, key=key)

                        if not match and part not in self.weak_affixes:
                            match = self.create_language_match(key, LanguageWord(current_word.start,
                                                                                 current_word.end,
                                                                                 'und',
                                                                                 current_word.input_string))
                    else:
                        match = self.create_language_match(key, LanguageWord(current_word.start,
                                                                             current_word.end,
                                                                             value,
                                                                             current_word.input_string))
                    if match:
                        return match

    def find_language_match_for_word(self, word, key='language'):  # pylint:disable=inconsistent-return-statements
        """
        Return the language match for the given word.
        """
        for current_word in (word.extended_word, word):
            if current_word:
                match = self.create_language_match(key, current_word)
                if match:
                    return match

    def create_language_match(self, key, word):  # pylint:disable=inconsistent-return-statements
        """
        Create a LanguageMatch for a given word
        """
        lang = self.parse_language(word.value.lower())

        if lang is not None:
            return _LanguageMatch(property_name=key, word=word, lang=lang)

    def parse_language(self, lang_word):  # pylint:disable=inconsistent-return-statements
        """
        Parse the lang_word into a valid Language.

        Multi and Undetermined languages are also valid languages.
        """
        try:
            lang = babelfish.Language.fromguessit(lang_word)
            if ((hasattr(lang, 'name') and lang.name.lower() in self.allowed_languages) or
                    (hasattr(lang, 'alpha2') and lang.alpha2.lower() in self.allowed_languages) or
                    lang.alpha3.lower() in self.allowed_languages):
                return lang
        except babelfish.Error:
            pass


class SubtitlePrefixLanguageRule(Rule):
    """
    Convert language guess as subtitle_language if previous match is a subtitle language prefix
    """
    consequence = RemoveMatch

    properties = {'subtitle_language': [None]}

    def enabled(self, context):
        return not is_disabled(context, 'subtitle_language')

    def when(self, matches, context):
        to_rename = []
        to_remove = matches.named('subtitle_language.prefix')

        for lang in matches.named('language'):
            prefix = matches.previous(lang, lambda match: match.name == 'subtitle_language.prefix', 0)
            if not prefix:
                group_marker = matches.markers.at_match(lang, lambda marker: marker.name == 'group', 0)
                if group_marker:
                    # Find prefix if placed just before the group
                    prefix = matches.previous(group_marker,
                                              lambda match: match.name == 'subtitle_language.prefix',
                                              0)
                    if not prefix:
                        # Find prefix if placed before in the group
                        prefix = matches.range(group_marker.start, lang.start,
                                               lambda match: match.name == 'subtitle_language.prefix',
                                               0)
            if prefix:
                to_rename.append((prefix, lang))
                to_remove.extend(matches.conflicting(lang))
                if prefix in to_remove:
                    to_remove.remove(prefix)

        if to_rename or to_remove:
            return to_rename, to_remove
        return False

    def then(self, matches, when_response, context):
        to_rename, to_remove = when_response
        super(SubtitlePrefixLanguageRule, self).then(matches, to_remove, context)
        for prefix, match in to_rename:
            # Remove suffix equivalent of prefix.
            suffix = copy.copy(prefix)
            suffix.name = 'subtitle_language.suffix'
            if suffix in matches:
                matches.remove(suffix)
            matches.remove(match)
            match.name = 'subtitle_language'
            matches.append(match)


class SubtitleSuffixLanguageRule(Rule):
    """
    Convert language guess as subtitle_language if next match is a subtitle language suffix
    """
    dependency = SubtitlePrefixLanguageRule
    consequence = RemoveMatch

    properties = {'subtitle_language': [None]}

    def enabled(self, context):
        return not is_disabled(context, 'subtitle_language')

    def when(self, matches, context):
        to_append = []
        to_remove = matches.named('subtitle_language.suffix')
        for lang in matches.named('language'):
            suffix = matches.next(lang, lambda match: match.name == 'subtitle_language.suffix', 0)
            if suffix:
                to_append.append(lang)
                if suffix in to_remove:
                    to_remove.remove(suffix)
        if to_append or to_remove:
            return to_append, to_remove
        return False

    def then(self, matches, when_response, context):
        to_rename, to_remove = when_response
        super(SubtitleSuffixLanguageRule, self).then(matches, to_remove, context)
        for match in to_rename:
            matches.remove(match)
            match.name = 'subtitle_language'
            matches.append(match)


class SubtitleExtensionRule(Rule):
    """
    Convert language guess as subtitle_language if next match is a subtitle extension.

    Since it's a strong match, it also removes any conflicting source with it.
    """
    consequence = [RemoveMatch, RenameMatch('subtitle_language')]

    properties = {'subtitle_language': [None]}

    def enabled(self, context):
        return not is_disabled(context, 'subtitle_language')

    def when(self, matches, context):  # pylint:disable=inconsistent-return-statements
        subtitle_extension = matches.named('container',
                                           lambda match: 'extension' in match.tags and 'subtitle' in match.tags,
                                           0)
        if subtitle_extension:
            subtitle_lang = matches.previous(subtitle_extension, lambda match: match.name == 'language', 0)
            if subtitle_lang:
                for weak in matches.named('subtitle_language', predicate=lambda m: 'weak-language' in m.tags):
                    weak.private = True

                return matches.conflicting(subtitle_lang, lambda m: m.name == 'source'), subtitle_lang


class RemoveLanguage(Rule):
    """Remove language matches that were not converted to subtitle_language when language is disabled."""

    consequence = RemoveMatch

    def enabled(self, context):
        return is_disabled(context, 'language')

    def when(self, matches, context):
        return matches.named('language')


class RemoveInvalidLanguages(Rule):
    """Remove language matches that match the blacklisted common words."""

    consequence = RemoveMatch
    priority = 32

    def __init__(self, common_words):
        """Constructor."""
        super(RemoveInvalidLanguages, self).__init__()
        self.common_words = common_words

    def when(self, matches, context):
        to_remove = []
        for match in matches.range(0, len(matches.input_string),
                                   predicate=lambda m: m.name in ('language', 'subtitle_language')):
            if match.raw.lower() not in self.common_words:
                continue

            group = matches.markers.at_match(match, index=0, predicate=lambda m: m.name == 'group')
            if group and (
                    not matches.range(
                        group.start, group.end,
                        predicate=lambda m: m.name not in ('language', 'subtitle_language')
                    ) and (
                        not matches.holes(group.start, group.end, predicate=lambda m: m.value.strip(seps)))):
                continue

            to_remove.append(match)

        return to_remove
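

# Minimal usage sketch (illustrative only, not part of the module): the config values
# below are placeholders, only the keys are the ones actually read by `language()` above.
#
#     config = {
#         'subtitle_affixes': ['subs'], 'subtitle_prefixes': ['st'], 'subtitle_suffixes': ['subbed'],
#         'language_affixes': ['dub'], 'language_prefixes': ['true'], 'language_suffixes': ['audio'],
#         'weak_affixes': ['v'], 'synonyms': {},
#     }
#     rebulk = language(config, common_words=frozenset())
#     matches = rebulk.matches('Movie.2019.French.srt', context={'allowed_languages': ['fr']})
#
# In guessit itself this Rebulk is merged into the main rule chain rather than being
# called directly; 'French' above should surface as a 'language' match because 'fr'
# is listed in 'allowed_languages'.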