# bazarr/libs/guessit/rules/properties/website.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Website property.
"""
try:
    from importlib.resources import files  # @UnresolvedImport
except ImportError:
    from importlib_resources import files  # @UnresolvedImport

from rebulk.remodule import re

from rebulk import Rebulk, Rule, RemoveMatch
from ..common import seps
from ..common.formatters import cleanup
from ..common.pattern import is_disabled
from ..common.validators import seps_surround
from ...reutils import build_or_pattern


def website(config):
    """
    Builder for rebulk object.

    Registers three ``website`` regex patterns (safe subdomain followed by
    any known TLD, any host followed by a safe TLD, any host followed by a
    safe prefix and any known TLD), one private ``website.prefix`` string
    pattern, and the ``PreferTitleOverWebsite`` / ``ValidateWebsitePrefix``
    rules.

    :param config: rule configuration; the ``safe_tlds``, ``safe_subdomains``,
        ``safe_prefixes`` and ``prefixes`` keys are read
    :type config: dict
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk(disabled=lambda context: is_disabled(context, 'website'))
    rebulk = rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
    rebulk.defaults(name="website")

    data_files = files('guessit.data')
    tld_file = data_files.joinpath('tlds-alpha-by-domain.txt').read_text(encoding='utf-8')
    # Blank entries (e.g. the empty string produced by a trailing newline) are
    # skipped: an empty TLD would presumably add an empty alternative to the
    # patterns below and let a host match without any extension at all.
    # Lines containing '--' are skipped as before (presumably the punycode
    # "XN--" entries), and [1:] still drops the first retained line
    # (presumably the file's header comment -- confirm against the data file).
    tlds = [
        tld.strip()
        for tld in tld_file.split('\n')
        if tld.strip() and '--' not in tld
    ][1:]  # All registered domain extension

    safe_tlds = config['safe_tlds']  # For sure a website extension
    safe_subdomains = config['safe_subdomains']  # For sure a website subdomain
    safe_prefix = config['safe_prefixes']  # Those words before a tlds are sure
    website_prefixes = config['prefixes']

    # Safe subdomain(s), then one or more labels, then any known TLD.
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)+(?:[a-z-0-9-]+\.)+(?:'+build_or_pattern(tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 children=True)
    # Optional safe subdomain(s), then a single label, then a safe TLD.
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)*[a-z0-9-]+\.(?:'+build_or_pattern(safe_tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)
    # Optional safe subdomain(s), a single label, safe prefix(es), any known TLD.
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)*[a-z0-9-]+\.(?:'+build_or_pattern(safe_prefix) +
                 r'\.)+(?:'+build_or_pattern(tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)

    rebulk.string(*website_prefixes,
                  validator=seps_surround, private=True, tags=['website.prefix'])

    class PreferTitleOverWebsite(Rule):
        """
        If found match is more likely a title, remove website.
        """
        consequence = RemoveMatch

        @staticmethod
        def valid_followers(match):
            """
            Validator for next website matches

            :param match: candidate match following a website match
            :return: truthy when the match is named season, episode or year
            """
            return match.named('season', 'episode', 'year')

        def when(self, matches, context):
            """
            Collect website matches that should be removed.

            A website match is removed when it does not start with a safe
            subdomain or safe prefix, is followed by a season/episode/year
            match, and is not located inside a ``group`` marker.

            :param matches: current matches
            :param context: current context (unused)
            :return: website matches to remove
            :rtype: list
            """
            # Built once per call; str.startswith accepts a tuple of candidates.
            safe_starts = tuple(safe_subdomains) + tuple(safe_prefix)
            to_remove = []
            for website_match in matches.named('website'):
                if website_match.value.lower().startswith(safe_starts):
                    continue
                suffix = matches.next(website_match, PreferTitleOverWebsite.valid_followers, 0)
                if not suffix:
                    continue
                group = matches.markers.at_match(website_match, lambda marker: marker.name == 'group', 0)
                if not group:
                    to_remove.append(website_match)
            return to_remove

    rebulk.rules(PreferTitleOverWebsite, ValidateWebsitePrefix)

    return rebulk


class ValidateWebsitePrefix(Rule):
    """
    Validate website prefixes.

    A match tagged ``website.prefix`` is kept only when a ``website`` match
    follows it and nothing with a value lies between the two once
    separators are cleaned up; every other prefix is removed.
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        Collect the website prefixes that must be removed.

        :param matches: current matches
        :param context: current context (unused)
        :return: prefix matches to remove
        :rtype: list
        """
        def is_website(match):
            return match.name == 'website'

        def has_value(match):
            return match.value

        rejected = []
        for prefix in matches.tagged('website.prefix'):
            following = matches.next(prefix, predicate=is_website, index=0)
            if following:
                gaps = matches.holes(prefix.end, following.start,
                                     formatter=cleanup, seps=seps, predicate=has_value)
                if not gaps:
                    # Prefix sits right before a website match: keep it.
                    continue
            rejected.append(prefix)
        return rejected
Include dependencies and remove requirements.txt 6 years ago			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
			`"""`
			`Website property.`
			`"""`
Upgraded guessit to latest version and include new dependencies. 3 years ago			`try:`
			`from importlib.resources import files # @UnresolvedImport`
			`except ImportError:`
			`from importlib_resources import files # @UnresolvedImport`

Include dependencies and remove requirements.txt 6 years ago			`from rebulk.remodule import re`

			`from rebulk import Rebulk, Rule, RemoveMatch`
			`from ..common import seps`
			`from ..common.formatters import cleanup`
Upgraded GuessIt to 3.0.1 5 years ago			`from ..common.pattern import is_disabled`
Include dependencies and remove requirements.txt 6 years ago			`from ..common.validators import seps_surround`
			`from ...reutils import build_or_pattern`


Upgraded GuessIt to 3.0.1 5 years ago			`def website(config):`
Include dependencies and remove requirements.txt 6 years ago			`"""`
			`Builder for rebulk object.`
Upgraded GuessIt to 3.0.1 5 years ago
			`:param config: rule configuration`
			`:type config: dict`
Include dependencies and remove requirements.txt 6 years ago			`:return: Created Rebulk object`
			`:rtype: Rebulk`
			`"""`
Upgraded GuessIt to 3.0.1 5 years ago			`rebulk = Rebulk(disabled=lambda context: is_disabled(context, 'website'))`
			`rebulk = rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)`
Include dependencies and remove requirements.txt 6 years ago			`rebulk.defaults(name="website")`

Updated multiple Python modules (now in libs and custom_libs directories) and React libraries 9 months ago			`data_files = files('guessit.data')`
			`tld_file = data_files.joinpath('tlds-alpha-by-domain.txt').read_text(encoding='utf-8')`
			`tlds = [`
			`tld.strip()`
			`for tld in tld_file.split('\n')`
			`if '--' not in tld`
			`][1:] # All registered domain extension`
Moving back to GuessIt 2.1.4. Too many bugs in 3.0.0. 6 years ago
Upgraded GuessIt to 3.0.1 5 years ago			`safe_tlds = config['safe_tlds'] # For sure a website extension`
			`safe_subdomains = config['safe_subdomains'] # For sure a website subdomain`
			`safe_prefix = config['safe_prefixes'] # Those words before a tlds are sure`
			`website_prefixes = config['prefixes']`
Include dependencies and remove requirements.txt 6 years ago
			`rebulk.regex(r'(?:[^a-z0-9]\|^)((?:'+build_or_pattern(safe_subdomains) +`
Upgraded guessit to latest version and include new dependencies. 3 years ago			`r'\.)+(?:[a-z-0-9-]+\.)+(?:'+build_or_pattern(tlds) +`
Include dependencies and remove requirements.txt 6 years ago			`r'))(?:[^a-z0-9]\|$)',`
			`children=True)`
			`rebulk.regex(r'(?:[^a-z0-9]\|^)((?:'+build_or_pattern(safe_subdomains) +`
Upgraded guessit to latest version and include new dependencies. 3 years ago			`r'\.)*[a-z0-9-]+\.(?:'+build_or_pattern(safe_tlds) +`
Include dependencies and remove requirements.txt 6 years ago			`r'))(?:[^a-z0-9]\|$)',`
			`safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)`
			`rebulk.regex(r'(?:[^a-z0-9]\|^)((?:'+build_or_pattern(safe_subdomains) +`
Upgraded guessit to latest version and include new dependencies. 3 years ago			`r'\.)*[a-z0-9-]+\.(?:'+build_or_pattern(safe_prefix) +`
Include dependencies and remove requirements.txt 6 years ago			`r'\.)+(?:'+build_or_pattern(tlds) +`
			`r'))(?:[^a-z0-9]\|$)',`
			`safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)`

			`rebulk.string(*website_prefixes,`
			`validator=seps_surround, private=True, tags=['website.prefix'])`

			`class PreferTitleOverWebsite(Rule):`
			`"""`
			`If found match is more likely a title, remove website.`
			`"""`
			`consequence = RemoveMatch`

			`@staticmethod`
			`def valid_followers(match):`
			`"""`
			`Validator for next website matches`
			`"""`
Upgraded GuessIt to 3.0.1 5 years ago			`return match.named('season', 'episode', 'year')`
Include dependencies and remove requirements.txt 6 years ago
			`def when(self, matches, context):`
			`to_remove = []`
			`for website_match in matches.named('website'):`
			`safe = False`
			`for safe_start in safe_subdomains + safe_prefix:`
			`if website_match.value.lower().startswith(safe_start):`
			`safe = True`
			`break`
			`if not safe:`
			`suffix = matches.next(website_match, PreferTitleOverWebsite.valid_followers, 0)`
			`if suffix:`
Upgraded GuessIt to 3.0.1 5 years ago			`group = matches.markers.at_match(website_match, lambda marker: marker.name == 'group', 0)`
			`if not group:`
			`to_remove.append(website_match)`
Include dependencies and remove requirements.txt 6 years ago			`return to_remove`

			`rebulk.rules(PreferTitleOverWebsite, ValidateWebsitePrefix)`

			`return rebulk`


			`class ValidateWebsitePrefix(Rule):`
			`"""`
			`Validate website prefixes`
			`"""`
			`priority = 64`
			`consequence = RemoveMatch`

			`def when(self, matches, context):`
			`to_remove = []`
			`for prefix in matches.tagged('website.prefix'):`
			`website_match = matches.next(prefix, predicate=lambda match: match.name == 'website', index=0)`
			`if (not website_match or`
			`matches.holes(prefix.end, website_match.start,`
			`formatter=cleanup, seps=seps, predicate=lambda match: match.value)):`
			`to_remove.append(prefix)`
			`return to_remove`