bazarr/libs/subliminal_patch/providers/hosszupuska.py

# coding: utf-8

from __future__ import absolute_import
import io
import six
import logging
import re
import os
import time

from babelfish import language_converters
from subzero.language import Language
from requests import Session

from subliminal_patch.providers import Provider
from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
from subliminal.providers import ParserBeautifulSoup
from subliminal_patch.exceptions import ProviderError
from subliminal.score import get_equivalent_release_groups
from subliminal_patch.subtitle import Subtitle, guess_matches
from subliminal.utils import sanitize, sanitize_release_group
from subliminal.video import Episode
from zipfile import ZipFile, is_zipfile
from rarfile import RarFile, is_rarfile
from subliminal_patch.utils import sanitize, fix_inconsistent_naming as _fix_inconsistent_naming
from guessit import guessit


def fix_inconsistent_naming(title):
    """Fix titles with inconsistent naming using dictionary and sanitize them.

    :param str title: original title.
    :return: new title.
    :rtype: str

    """
    return _fix_inconsistent_naming(title, {"Stargate Origins": "Stargate: Origins",
                                            "Marvel's Agents of S.H.I.E.L.D.": "Marvels+Agents+of+S.H.I.E.L.D",
                                            "Mayans M.C.": "Mayans MC"}, True)


logger = logging.getLogger(__name__)

language_converters.register('hosszupuska = subliminal_patch.converters.hosszupuska:HosszupuskaConverter')

_SUB_ENGLISH_NAME_RE = re.compile(r's(\d{1,2})e(\d{1,2})')
_SUB_YEAR_RE = re.compile(r"(?<=\()(\d{4})(?=\))")


class HosszupuskaSubtitle(Subtitle):
    """Hosszupuska Subtitle."""
    provider_name = 'hosszupuska'

    def __str__(self):
        subtit = (f"Subtitle id: {self.subtitle_id} Series: {self.series} "
                  f"Season: {self.season} Episode: {self.episode} "
                  f"Releases: {self.releases}")
        if self.year:
            subtit = f"{subtit} Year: {self.year}"
        if six.PY3:
            return subtit
        return subtit.encode('utf-8')

    def __init__(self, language, page_link, subtitle_id, series, season, episode, version,
                 releases, year, asked_for_release_group=None, asked_for_episode=None):
        super(HosszupuskaSubtitle, self).__init__(language, page_link=page_link)
        self.subtitle_id = subtitle_id
        self.series = series
        self.season = season
        self.episode = episode
        self.version = version
        self.releases = releases
        self.year = year
        if year:
            self.year = int(year)

        self.release_info = u", ".join(releases)
        self.page_link = page_link
        self.asked_for_release_group = asked_for_release_group
        self.asked_for_episode = asked_for_episode

    def __repr__(self):
        ep_addon = (" S%02dE%02d" % (self.season, self.episode)) if self.episode else ""
        return '<%s %r [%s]>' % (
            self.__class__.__name__, u"%s%s%s [%s]" % (self.series, " (%s)" % self.year if self.year else "", ep_addon,
                                                       self.release_info), self.language)

    @property
    def id(self):
        return str(self.subtitle_id)

    def get_matches(self, video):
        matches = set()
        # series
        if video.series and ( sanitize(self.series) == sanitize(fix_inconsistent_naming(video.series)) or sanitize(self.series) == sanitize(video.series)):
            matches.add('series')
        # season
        if video.season and self.season == video.season:
            matches.add('season')
        # episode
        if video.episode and self.episode == video.episode:
            matches.add('episode')
        # year
        if ('series' in matches and video.original_series and self.year is None or
           video.year and video.year == self.year):
            matches.add('year')

        logger.debug("Matches: %s", matches)

        # release_group
        if (video.release_group and self.version and
                any(r in sanitize_release_group(self.version)
                    for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
            matches.add('release_group')

        matches |= guess_matches(video, guessit(self.release_info), {"type": "episode"})

        return matches


class HosszupuskaProvider(Provider, ProviderSubtitleArchiveMixin):
    """Hosszupuska Provider."""
    languages = {Language('hun', 'HU')} | {Language(l) for l in [
        'hun', 'eng'
    ]}
    video_types = (Episode,)
    server_url = 'http://hosszupuskasub.com/'
    subtitle_class = HosszupuskaSubtitle
    hearing_impaired_verifiable = False
    multi_result_throttle = 2  # seconds

    def initialize(self):
        self.session = Session()
        self.session.headers = {'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")}

    def terminate(self):
        self.session.close()

    def get_language(self, text):
        if text == '1.gif':
            return Language.fromhosszupuska('hu')
        if text == '2.gif':
            return Language.fromhosszupuska('en')
        return None

    def query(self, series, season, episode, year=None, video=None):
        # Search for s01e03 instead of s1e3
        seasona = "%02d" % season
        episodea = "%02d" % episode
        seriesa = fix_inconsistent_naming(series)
        seriesa = series.replace(' ', '+')

        # get the episode page
        logger.info('Getting the page for episode %s', episode)
        url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
            "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
        logger.info('Url %s', url)

        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])

        subtitles = []

        for num, temp in enumerate(soup.find_all("table")):
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(temp) and "css/infooldal.png" in str(temp):
                logger.debug("Found valid table (%d index)", num)
                subtitles += self._loop_over_table(temp, season, episode, video)

        return subtitles

    def _loop_over_table(self, table, season, episode, video):
        i = 0
        for row in table.find_all("tr"):
            i = i + 1
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(row): #and i > 5:
                datas = row.find_all("td")

                # Currently subliminal not use these params, but maybe later will come in handy
                # hunagrian_name = re.split('s(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
                # Translator of subtitle
                # sub_translator = datas[3].getText()
                # Posting date of subtitle
                # sub_date = datas[4].getText()

                sub_year = sub_english_name = sub_version = None
                # Handle the case when '(' in subtitle


                if datas[1].getText().count('(') == 1:
                    sub_english_name = _SUB_ENGLISH_NAME_RE.split(datas[1].getText())[3]
                if datas[1].getText().count('(') == 2:
                    sub_year = _SUB_YEAR_RE.findall(datas[1].getText().strip())[0]
                    sub_english_name = _SUB_ENGLISH_NAME_RE.split(datas[1].getText().split('(')[0])[0]

                if not sub_english_name:
                    continue

                sub_season = int((re.findall(r's(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                 .lstrip('0'))
                sub_episode = int((re.findall(r'e(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                  .lstrip('0'))

                if sub_season == season and sub_episode == episode:
                    sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                    sub_downloadlink = datas[6].find_all('a')[1]['href']
                    sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                    if datas[1].getText().count('(') == 1:
                        sub_version = datas[1].getText().split('(')[1].split(')')[0]
                    if datas[1].getText().count('(') == 2:
                        sub_version = datas[1].getText().split('(')[2].split(')')[0]

                    # One subtitle can be used for several releases
                    sub_releases = [s.strip() for s in sub_version.split(',')]
                    subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(),
                                                   sub_season, sub_episode, sub_version, sub_releases, sub_year,
                                                   asked_for_release_group=video.release_group,
                                                   asked_for_episode=episode)

                    logger.debug('Found subtitle: %r', subtitle)
                    yield subtitle

    def list_subtitles(self, video, languages):
        titles = [video.series] + video.alternative_series

        for title in titles:
            subs = self.query(title, video.season, video.episode, video.year, video=video)
            if subs:
                return subs

            time.sleep(self.multi_result_throttle)
            return []

    def download_subtitle(self, subtitle):
        r = self.session.get(subtitle.page_link, timeout=10)
        r.raise_for_status()

        # open the archive
        archive_stream = io.BytesIO(r.content)
        if is_rarfile(archive_stream):
            logger.debug('Archive identified as rar')
            archive = RarFile(archive_stream)
        elif is_zipfile(archive_stream):
            logger.debug('Archive identified as zip')
            archive = ZipFile(archive_stream)
        else:
            raise ProviderError('Unidentified archive type')

        subtitle.content = self.get_subtitle_from_archive(subtitle, archive)