bazarr/libs/subliminal_patch/providers/legendasdivx.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import

import io
import logging
import os
import re
import zipfile

import rarfile
from requests import Session
from guessit import guessit
from subliminal.exceptions import ConfigurationError, AuthenticationError, ServiceUnavailable, DownloadLimitExceeded
from subliminal_patch.exceptions import ParseResponseError
from subliminal_patch.providers import Provider
from subliminal.providers import ParserBeautifulSoup
from subliminal_patch.subtitle import Subtitle
from subliminal.video import Episode, Movie
from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending, guess_matches
from subzero.language import Language
from subliminal_patch.score import get_scores
from subliminal.utils import sanitize, sanitize_release_group

logger = logging.getLogger(__name__)

class LegendasdivxSubtitle(Subtitle):
    """Subtitle entry scraped from a legendasdivx.pt search result."""
    provider_name = 'legendasdivx'

    def __init__(self, language, video, data):
        """Build a subtitle from one scraped search result.

        :param language: language of the subtitle.
        :param video: video the search was run for.
        :param dict data: scraped fields; the keys ``link``, ``hits``,
            ``exact_match``, ``description``, ``video_filename`` and
            ``uploader`` are all required.
        """
        super(LegendasdivxSubtitle, self).__init__(language)
        self.language = language
        self.page_link = data['link']
        self.hits = data['hits']
        self.exact_match = data['exact_match']
        self.description = data['description']
        self.video = video
        self.video_filename = data['video_filename']
        self.uploader = data['uploader']

    @property
    def id(self):
        """Identifier of the subtitle: its download link."""
        return self.page_link

    @property
    def release_info(self):
        """Release information: the raw description text of the result."""
        return self.description

    def get_matches(self, video):
        """Compute which properties of *video* are found in the description.

        All checks are plain substring tests against the sanitized
        description text.

        :param video: the video to compare against.
        :return: set of matched property names.
        :rtype: set
        """
        matches = set()

        description = sanitize(self.description)

        # The complete file name appearing in the description counts as a
        # title/season/episode hit. An empty name is skipped on purpose:
        # '' is a substring of every string and would always "match".
        filename = sanitize(self.video_filename) if self.video_filename else ''
        if filename and filename in description:
            matches.update(['title', 'season', 'episode'])

        # title and year
        if video.title and sanitize(video.title) in description:
            matches.add('title')
        if video.year and '{:04d}'.format(video.year) in description:
            matches.add('year')

        if isinstance(video, Episode):
            # Compare against None rather than truthiness so that season 0 /
            # episode 0 (specials) can still be matched.
            has_season = video.season is not None
            has_episode = video.episode is not None

            if has_season and 's{:02d}'.format(video.season) in description:
                matches.add('season')
            if has_episode and 'e{:02d}'.format(video.episode) in description:
                matches.add('episode')
            if has_season and has_episode and video.series:
                full_tag = '{} s{:02d}e{:02d}'.format(
                    sanitize(video.series), video.season, video.episode)
                if full_tag in description:
                    matches.update(['series', 'season', 'episode'])

        # release_group
        if video.release_group and \
                sanitize_release_group(video.release_group) in sanitize_release_group(description):
            matches.add('release_group')

        # resolution
        if video.resolution and video.resolution.lower() in description:
            matches.add('resolution')

        # format: "web-dl" is also accepted under its common spellings
        if video.format:
            formats = [video.format.lower()]
            if formats[0] == "web-dl":
                formats.extend(["webdl", "webrip", "web"])
            if any(frmt in description for frmt in formats):
                matches.add('format')

        # video_codec: h264/h265 are also accepted as x264/x265
        if video.video_codec:
            video_codecs = [video.video_codec.lower()]
            if video_codecs[0] == "h264":
                video_codecs.append("x264")
            elif video_codecs[0] == "h265":
                video_codecs.append("x265")
            if any(codec in description for codec in video_codecs):
                matches.add('video_codec')

        # guessit is deliberately NOT run on the description: running it on a
        # huge description may break guessit.
        return matches

class LegendasdivxProvider(Provider):
    """Provider that searches and downloads subtitles from legendasdivx.pt.

    :meth:`initialize` opens a ``requests`` session and logs in;
    :meth:`terminate` logs out and closes that session.
    """
    languages = {Language('por', 'BR')} | {Language('por')}
    SEARCH_THROTTLE = 8
    site = 'https://www.legendasdivx.pt'
    headers = {
        'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2"),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'https://www.legendasdivx.pt',
        'Referer': 'https://www.legendasdivx.pt',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }
    loginpage = site + '/forum/ucp.php?mode=login'
    logoutpage = site + '/sair.php'
    searchurl = site + '/modules.php?name=Downloads&file=jz&d_op=search&op=_jz00&query={query}'
    download_link = site + '/modules.php{link}'

    def __init__(self, username, password):
        """Store the credentials used by :meth:`login`.

        :param username: site user name.
        :param password: site password.
        :raises ConfigurationError: if only one of the two is provided.
        """
        # make sure login credentials are configured.
        if any((username, password)) and not all((username, password)):
            raise ConfigurationError('Username and password must be specified')
        self.username = username
        self.password = password
        self.logged_in = False
        # Created by initialize(); kept as None until then so terminate()
        # is safe to call on a provider that was never initialized.
        self.session = None

    def initialize(self):
        """Open the HTTP session, install the default headers and log in."""
        self.session = Session()
        self.session.headers.update(self.headers)
        self.login()

    def terminate(self):
        """Log out and close the HTTP session.

        The session is closed even when the logout request fails.
        """
        if self.session is None:
            return
        try:
            self.logout()
        finally:
            self.session.close()

    @staticmethod
    def _raise_if_blocked(res):
        """Raise ParseResponseError if the page says our IP is blocked."""
        if 'bloqueado' in res.text.lower():  # blocked IP address
            logger.error("LegendasDivx.pt :: Your IP is blocked on this server.")
            raise ParseResponseError("Legendasdivx.pt :: %r" % res.text)

    def login(self):
        """Authenticate against the site's forum login form.

        :raises ParseResponseError: if the site reports a blocked IP address.
        :raises AuthenticationError: if no ``PHPSESSID`` cookie is present
            after posting the credentials.
        """
        logger.info('Logging in')

        res = self.session.get(self.loginpage)
        bsoup = ParserBeautifulSoup(res.content, ['lxml'])

        # Echo back every named input of the login form; this is necessary
        # to set 'sid' for the POST request. Inputs without a name carry no
        # form data and are skipped.
        data = {}
        for field in bsoup.findAll('input'):
            field_name = field.get('name')
            if field_name:
                data[field_name] = field.get('value')

        data['username'] = self.username
        data['password'] = self.password

        res = self.session.post(self.loginpage, data)

        # Report a blocked IP explicitly instead of a generic HTTP error.
        if not res.ok:
            self._raise_if_blocked(res)
        res.raise_for_status()

        session_id = self.session.cookies.get_dict().get('PHPSESSID')
        if session_id is None:
            self._raise_if_blocked(res)
            logger.error("Couldn't retrieve session ID, check your credentials")
            raise AuthenticationError("Please check your credentials.")

        logger.debug('Logged in successfully: PHPSESSID: %s', session_id)
        self.logged_in = True

    def logout(self):
        """Log out of the site if a login is active."""
        if self.logged_in:
            logger.info('Legendasdivx:: Logging out')
            r = self.session.get(self.logoutpage, timeout=10)
            r.raise_for_status()
            logger.debug('Legendasdivx :: Logged out')
            self.logged_in = False

    @staticmethod
    def _parse_hits(th):
        """Return the hit counter next to a 'Hits:' header cell, or 0."""
        cell = th.parent.find("td")
        try:
            return int(cell.text)
        except (AttributeError, ValueError):
            # missing cell or non-numeric text
            return 0

    @staticmethod
    def _parse_language(th):
        """Return the Language shown by the flag image next to an 'Idioma:'
        header cell, or None when the flag is missing or not recognised."""
        cell = th.parent.find("td")
        flag = cell.find("img") if cell is not None else None
        flag_src = (flag.get('src') or '') if flag is not None else ''
        flag_src = flag_src.lower()
        if 'brazil' in flag_src:
            return Language.fromopensubtitles('pob')
        if 'portugal' in flag_src:
            return Language.fromopensubtitles('por')
        return None

    def _process_page(self, video, bsoup, video_filename):
        """Turn every result box of a parsed search page into a subtitle.

        Boxes without a recognised language or without a download link are
        skipped.

        :param video: video the search was run for.
        :param bsoup: parsed search result page.
        :param video_filename: file name stored on each created subtitle.
        :return: list of :class:`LegendasdivxSubtitle`.
        """
        subtitles = []

        for _subbox in bsoup.findAll("div", {"class": "sub_box"}):
            hits = 0
            # reset per box so a language never leaks from the previous one
            lang = None
            for th in _subbox.findAll("th", {"class": "color2"}):
                if th.text == 'Hits:':
                    hits = self._parse_hits(th)
                elif th.text == 'Idioma:':
                    lang = self._parse_language(th)

            if lang is None:
                logger.debug("Couldn't determine subtitle language. Trying next...")
                continue

            # get description for matches
            desc_cell = _subbox.find("td", {"class": "td_desc brd_up"})
            description = desc_cell.get_text() if desc_cell is not None else ''

            # get subtitle link; sometimes BSoup can't find the 'a' tag
            download = _subbox.find("a", {"class": "sub_download"})
            dl = download.get('href') if download is not None else None
            if not dl:
                logger.debug("Couldn't find download link. Trying next...")
                continue
            link = self.download_link.format(link=dl)
            logger.debug("Found subtitle link on: %s", link)

            # get subtitle uploader
            sub_header = _subbox.find("div", {"class": "sub_header"})
            uploader_tag = sub_header.find("a") if sub_header is not None else None
            uploader = uploader_tag.text if uploader_tag is not None else 'anonymous'

            exact_match = video.name.lower() in description.lower()
            data = {'link': link,
                    'exact_match': exact_match,
                    'hits': hits,
                    'uploader': uploader,
                    'video_filename': video_filename,
                    'description': description
                    }
            subtitles.append(LegendasdivxSubtitle(lang, video, data))
        return subtitles

    @staticmethod
    def _count_results(page_header):
        """Return the result total announced in the pager bar, or 0 when the
        bar is missing or its text cannot be parsed."""
        if page_header is None:
            return 0
        found = re.search(r'\((.*?) encontradas\)', page_header.text)
        if found is None:
            return 0
        try:
            return int(found.group(1))
        except ValueError:
            return 0

    def query(self, video, languages):
        """Search the site for subtitles of *video*.

        The IMDB id is used as search text when available; otherwise the
        series name with ``SxxEyy`` (episodes) or the title (movies).

        :param video: video to search subtitles for.
        :param languages: requested languages; used to pick the site's
            language category filter.
        :return: list of :class:`LegendasdivxSubtitle` (possibly empty).
        """
        video_filename = os.path.basename(video.name)
        video_filename, _ = os.path.splitext(video_filename)
        video_filename = sanitize_release_group(video_filename)

        if video.imdb_id is not None:
            querytext = video.imdb_id
        elif isinstance(video, Episode):
            querytext = "{} S{:02d}E{:02d}".format(video.series, video.season, video.episode)
        elif isinstance(video, Movie):
            querytext = video.title
        else:
            logger.debug('Unsupported video type, nothing to search for: %r', video)
            return []

        # language query filter
        lang_filter = ''
        if isinstance(languages, (tuple, list, set)):
            language_ids = ','.join(sorted(l.opensubtitles for l in languages))
            if 'por' in language_ids:  # prioritize portuguese subtitles
                lang_filter = '&form_cat=28'  # pt
            elif 'pob' in language_ids:
                lang_filter = '&form_cat=29'  # br

        querytext = querytext + lang_filter
        search_url = self.searchurl.format(query=querytext)

        # Set the Referer on this session only; the class-level ``headers``
        # dict is shared by every instance and must not be mutated.
        self.session.headers['Referer'] = self.site + '/index.php'
        res = self.session.get(search_url)

        if (res and "A legenda não foi encontrada" in res.text):
            logger.warning('%s not found', querytext)
            return []

        bsoup = ParserBeautifulSoup(res.content, ['html.parser'])
        subtitles = self._process_page(video, bsoup, video_filename)

        # search for more than 10 results (legendasdivx uses pagination)
        # don't throttle - maximum results = 6 * 10
        MAX_PAGES = 6
        RESULTS_PER_PAGE = 10

        # get number of pages based on results found
        results_found = self._count_results(bsoup.find("div", {"class": "pager_bar"}))
        logger.debug("Found %s subtitles", results_found)
        # ceiling division: exactly 10 results fit on a single page
        num_pages = (results_found + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
        num_pages = min(MAX_PAGES, num_pages)

        for num_page in range(2, num_pages + 1):
            _search_next = search_url + "&page={0}".format(str(num_page))
            logger.debug("Moving to next page: %s", _search_next)
            res = self.session.get(_search_next)
            next_page = ParserBeautifulSoup(res.content, ['html.parser'])
            subtitles.extend(self._process_page(video, next_page, video_filename))

        return subtitles

    def list_subtitles(self, video, languages):
        """Return the subtitles found by :meth:`query`."""
        return self.query(video, languages)

    def download_subtitle(self, subtitle):
        """Download the archive behind *subtitle* and store its best entry.

        :param subtitle: subtitle whose ``page_link`` is fetched; its
            ``content`` is set from the extracted file.
        :return: the same subtitle object.
        :raises ServiceUnavailable: on HTTP 500/503.
        :raises ParseResponseError: on HTTP 403 or a blocked IP notice.
        :raises DownloadLimitExceeded: when the site answers with a download
            limit notice instead of an archive.
        :raises ValueError: when the payload is not a rar/zip archive or no
            entry of the archive matches the video.
        """
        res = self.session.get(subtitle.page_link)

        # status_code is an int; map the known failures to provider errors
        # before falling back to the generic HTTP error.
        if res.status_code in (500, 503):
            raise ServiceUnavailable("Legendasdivx.pt :: 503 - Service Unavailable")
        if res.status_code == 403:
            raise ParseResponseError("Legendasdivx.pt :: 403 - Forbidden")
        res.raise_for_status()

        try:
            archive = self._get_archive(res.content)
        except ValueError:
            # Not an archive. Only now look at the body as text: searching
            # archive bytes for these words could hit a file name stored in
            # the archive and report a bogus limit/blocked error.
            if 'limite' in res.text.lower():  # daily downloads limit reached
                raise DownloadLimitExceeded("Legendasdivx.pt :: Download limit reached")
            self._raise_if_blocked(res)
            raise

        # extract the subtitle
        subtitle_content = self._get_subtitle_from_archive(archive, subtitle)
        subtitle.content = fix_line_ending(subtitle_content)
        subtitle.normalize()

        return subtitle

    def _get_archive(self, content):
        """Open *content* as a rar or zip archive.

        :param bytes content: raw downloaded payload.
        :return: ``rarfile.RarFile`` or ``zipfile.ZipFile`` over the payload.
        :raises ValueError: if the payload is neither rar nor zip.
        """
        # open the archive
        archive_stream = io.BytesIO(content)
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            return rarfile.RarFile(archive_stream)
        if zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            return zipfile.ZipFile(archive_stream)

        raise ValueError('Unsupported compressed format')

    def _get_subtitle_from_archive(self, archive, subtitle):
        """Return the content of the archive entry that best fits the video.

        Every subtitle file in the archive is run through guessit and scored
        against ``subtitle.video``; the highest scoring entry is returned.

        :param archive: opened archive exposing ``namelist()`` and ``read()``.
        :param subtitle: subtitle being downloaded (provides the video).
        :return: raw content of the selected entry.
        :raises ValueError: if no entry scores above 0.
        """
        # some files have a non subtitle with .txt extension
        _subtitle_extensions = tuple(ext for ext in SUBTITLE_EXTENSIONS if ext != '.txt')
        _max_score = 0
        _max_name = None
        _scores = get_scores(subtitle.video)
        video = subtitle.video

        for name in archive.namelist():
            # discard hidden files
            if os.path.split(name)[-1].startswith('.'):
                continue

            # discard non-subtitle files
            if not name.lower().endswith(_subtitle_extensions):
                continue

            _guess = guessit(name)
            if isinstance(video, Episode):
                # guessit may not find a season/episode at all; such a file
                # is treated as a mismatch instead of raising KeyError.
                guessed_season = _guess.get('season')
                guessed_episode = _guess.get('episode')
                logger.debug("guessing %s", name)
                logger.debug("subtitle S%sE%s video S%sE%s",
                             guessed_season, guessed_episode, video.season, video.episode)

                if video.episode != guessed_episode or video.season != guessed_season:
                    logger.debug('subtitle does not match video, skipping')
                    continue

            matches = set(guess_matches(video, _guess))
            logger.debug('srt matches: %s', matches)
            _score = sum(_scores.get(match, 0) for match in matches)
            if _score > _max_score:
                _max_name = name
                _max_score = _score
                logger.debug("new max: %s %s", name, _score)

        if _max_score > 0:
            logger.debug("returning from archive: %s scored %s", _max_name, _max_score)
            return archive.read(_max_name)

        raise ValueError("No subtitle found on compressed file. Max score was 0")