Fix zimuku provider

5 years ago · b12cb42146
parent fd339b5fff
commit b12cb42146
1 changed files with 259 additions and 107 deletions
--- a/libs/subliminal_patch/providers/zimuku.py
+++ b/libs/subliminal_patch/providers/zimuku.py
@ -4,6 +4,13 @@ import io
 import logging
 import os
 import zipfile
 import re
 import copy
 try:
    from urlparse import urljoin
 except ImportError:
    from urllib.parse import urljoin
 import rarfile
 from subzero.language import Language
@ -13,7 +20,12 @@ from six import text_type
 from subliminal import __short_version__
 from subliminal.providers import ParserBeautifulSoup, Provider
-from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches
+from subliminal.subtitle import (
    SUBTITLE_EXTENSIONS,
    Subtitle,
    fix_line_ending,
    guess_matches,
 )
 from subliminal.video import Episode, Movie
 logger = logging.getLogger(__name__)
@ -21,43 +33,50 @@ logger = logging.getLogger(__name__)
 class ZimukuSubtitle(Subtitle):
    """Zimuku Subtitle."""
    provider_name = 'zimuku'
-    def __init__(self, language, page_link, version, download_link):
+    provider_name = "zimuku"
    def __init__(self, language, page_link, version, session):
        super(ZimukuSubtitle, self).__init__(language, page_link=page_link)
        self.version = version
-        self.download_link = download_link
+        self.hearing_impaired = False
-        self.hearing_impaired = None
+        self.encoding = "utf-8"
-        self.encoding = 'utf-8'
+        self.session = session
    @property
    def id(self):
-        return self.download_link
+        return self.version
    def get_matches(self, video):
        matches = set()
        # episode
        if isinstance(video, Episode):
            # always make year a match
            info = guessit(self.version, {"type": "episode"})
            info["year"] = video.year
            # other properties
-            matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True)
+            matches |= guess_matches(video, info, partial=True)
        # movie
        elif isinstance(video, Movie):
            # other properties
-            matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True)
+            matches |= guess_matches(
                video, guessit(self.version, {"type": "movie"}), partial=True
            )
        return matches
 class ZimukuProvider(Provider):
    """Zimuku Provider."""
    languages = {Language(l) for l in ['zho', 'eng']}
-    server_url = 'http://www.zimuku.la'
+    languages = {Language(l) for l in ["zho", "eng"]}
-    search_url = '/search?q={}'
+
-    download_url = 'http://www.zimuku.la/'
+    server_url = "http://www.zimuku.la"
    search_url = "/search?q={}"
    download_url = "http://www.zimuku.la/"
-    UserAgent  = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
+    UserAgent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
    subtitle_class = ZimukuSubtitle
@ -66,19 +85,60 @@ class ZimukuProvider(Provider):
    def initialize(self):
        self.session = Session()
-        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)
+        self.session.headers["User-Agent"] = "Subliminal/{}".format(__short_version__)
    def terminate(self):
        self.session.close()
    def _parse_episode_page(self, link):
        """Collect the subtitles listed on a title's detail page.

        Every table row of the ``subs box clearfix`` block becomes one
        ``self.subtitle_class`` instance. Each row gets a language score
        built from the images found in its ``tac lang`` cell; a score of
        exactly 1 (only the "uk" image) is treated as English, anything else
        as Chinese.

        :param link: URL of the page that lists the subtitles.
        :return: list of subtitles ordered by descending language score
            (page order is kept among equal scores); an empty list when the
            page carries no subtitle table.
        """
        r = self.session.get(link)
        bs_obj = ParserBeautifulSoup(
            r.content.decode("utf-8", "ignore"), ["html.parser"]
        )
        subs_box = bs_obj.find("div", class_="subs box clearfix")
        subs_body = subs_box.find("tbody") if subs_box is not None else None
        if subs_body is None:
            # find() returns None when the element is absent; bail out instead
            # of failing with AttributeError on the next lookup.
            logger.debug("No subtitle table found on %s", link)
            return []

        # Substring of the image URL -> weight. Checked in this order and only
        # the first matching marker counts for a given image.
        marker_weights = (
            ("uk", 1),
            ("hongkong", 2),
            ("china", 4),
            ("jollyroger", 8),
        )

        # Scores are kept next to their subtitle rather than in a dict keyed
        # by name: names are not unique (_extract_name yields "" for every
        # name without ASCII letters), so a name-keyed lookup would make
        # several rows share the score of whichever row came last.
        scored_subs = []
        for row in subs_body.find_all("tr"):
            a = row.find("a")
            if a is None:
                # a row without a link cannot be turned into a subtitle
                continue
            name = _extract_name(a.text)
            # remove ext because it can be an archive type
            name = os.path.splitext(name)[0]

            lan_score = 0
            lang_cell = row.find("td", class_="tac lang")
            images = lang_cell.find_all("img") if lang_cell is not None else []
            for img in images:
                src = img.attrs["src"]
                for marker, weight in marker_weights:
                    if marker in src:
                        lan_score += weight
                        break

            language = Language("eng") if lan_score == 1 else Language("zho")

            sub_page_link = urljoin(self.server_url, a.attrs["href"])
            # Each subtitle carries its own copy of the session whose Referer
            # header points at the page the subtitle was listed on.
            backup_session = copy.deepcopy(self.session)
            backup_session.headers["Referer"] = link
            scored_subs.append(
                (
                    lan_score,
                    self.subtitle_class(
                        language, sub_page_link, name, backup_session
                    ),
                )
            )

        # prefer double languages; list.sort is stable, also with reverse=True
        scored_subs.sort(key=lambda pair: pair[0], reverse=True)
        return [sub for _, sub in scored_subs]
    def query(self, keyword, season=None, episode=None, year=None):
        params = keyword
-        if season and episode:
+        if season:
-            params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
+            params += ".S{season:02d}".format(season=season)
        elif year:
-            params += ' {:4d}'.format(year)
+            params += " {:4d}".format(year)
-        logger.debug('Searching subtitles %r', params)
+        logger.debug("Searching subtitles %r", params)
        subtitles = []
        search_link = self.server_url + text_type(self.search_url).format(params)
@ -86,45 +146,33 @@ class ZimukuProvider(Provider):
        r.raise_for_status()
        if not r.content:
-            logger.debug('No data returned from provider')
+            logger.debug("No data returned from provider")
            return []
-        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
+        soup = ParserBeautifulSoup(
-
+            r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]
-        for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'):
+        )
-            moviename = entity.text
+
-            entity_url = self.server_url + entity['href']
+        # non-shooter result page
-            logger.debug(entity_url)
+        if soup.find("div", {"class": "item"}):
-            r = self.session.get(entity_url, timeout=30)
+            logger.debug("enter a non-shooter page")
-            r.raise_for_status()
+            for item in soup.find_all("div", {"class": "item"}):
-            logger.debug('looking into ' + entity_url)
+                title_a = item.find("p", class_="tt clearfix").find("a")
-
+                if season:
-            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("div", class_="subs box clearfix")
+                    title = title_a.text
-            # loop over subtitles cells
+                    season_cn1 = re.search("第(.*)季", title)
-
+                    if not season_cn1:
-            subs = soup.tbody.find_all("tr")
+                        season_cn1 = "一"
            for sub in subs:
                page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8'))
                version = sub.a.text.encode('utf-8') or None
                if version is None:
                    version = ""
                try:
                    td = sub.find("td", class_="tac lang")
                    r2 = td.find_all("img")
                    langs = [x.get('title').encode('utf-8') for x in r2]
                except:
                    langs = '未知' 
                name = '%s (%s)' % (version, ",".join(langs))
                if ('English' in langs) and not(('简体中文' in langs) or ('繁體中文' in langs)):
                    language = Language('eng')
                    else:
-                    language = Language('zho')
+                        season_cn1 = season_cn1.group(1).strip()
-                # read the item
+                    season_cn2 = num_to_cn(str(season))
-                subtitle = self.subtitle_class(language, page_link, version, page_link.replace("detail","dld"))
+                    if season_cn1 != season_cn2:
                        continue
                episode_link = self.server_url + title_a.attrs["href"]
                new_subs = self._parse_episode_page(episode_link)
                subtitles += new_subs
-                logger.debug('Found subtitle %r', subtitle)
+        # NOTE: shooter result pages are ignored due to the existence of assrt provider
                subtitles.append(subtitle)
        return subtitles
@ -140,70 +188,174 @@ class ZimukuProvider(Provider):
        # query for subtitles with the show_id
        for title in titles:
            if isinstance(video, Episode):
-                subtitles += [s for s in self.query(title, season=video.season, episode=video.episode,
+                subtitles += [
-                                                    year=video.year)
+                    s
-                              if s.language in languages]
+                    for s in self.query(
                        title,
                        season=video.season,
                        episode=video.episode,
                        year=video.year,
                    )
                    if s.language in languages
                ]
            elif isinstance(video, Movie):
-                subtitles += [s for s in self.query(title, year=video.year)
+                subtitles += [
-                              if s.language in languages]
+                    s
                    for s in self.query(title, year=video.year)
                    if s.language in languages
                ]
        return subtitles
    def download_subtitle(self, subtitle):
-        if isinstance(subtitle, ZimukuSubtitle):
+        def _get_archive_dowload_link(session, sub_page_link):
            r = session.get(sub_page_link)
            bs_obj = ParserBeautifulSoup(
                r.content.decode("utf-8", "ignore"), ["html.parser"]
            )
            down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            down_page_link = urljoin(sub_page_link, down_page_link)
            r = session.get(down_page_link)
            bs_obj = ParserBeautifulSoup(
                r.content.decode("utf-8", "ignore"), ["html.parser"]
            )
            download_link = bs_obj.find("a", {"rel": "nofollow"})
            download_link = download_link.attrs["href"]
            download_link = urljoin(sub_page_link, download_link)
            return download_link
        # download the subtitle
-            logger.info('Downloading subtitle %r', subtitle)
+        logger.info("Downloading subtitle %r", subtitle)
-            r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link},
+        self.session = subtitle.session
-                                 timeout=30)
+        download_link = _get_archive_dowload_link(self.session, subtitle.page_link)
        r = self.session.get(download_link, timeout=30)
        r.raise_for_status()
        filename = r.headers["Content-Disposition"]
        if not r.content:
-                logger.debug('Unable to download subtitle. No data returned from provider')
+            logger.debug("Unable to download subtitle. No data returned from provider")
            return
            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
            links = soup.find("div", {"class":"clearfix"}).find_all('a')
            # TODO: add settings for choice
            for down_link in links:
                url = down_link.get('href').encode('utf-8')
                url = self.server_url + url
                r = self.session.get(url, headers={'Referer': subtitle.download_link},
                                 timeout=30)
                r.raise_for_status()
                if len(r.content) > 1024:
                    break
        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
-                logger.debug('Identified rar archive')
+            logger.debug("Identified rar archive")
            if ".rar" not in filename:
                logger.debug(
                    ".rar should be in the downloaded file name: {}".format(filename)
                )
                return
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
-                logger.debug('Identified zip archive')
+            logger.debug("Identified zip archive")
            if ".zip" not in filename:
                logger.debug(
                    ".zip should be in the downloaded file name: {}".format(filename)
                )
                return
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            is_sub = ""
            for sub_ext in SUBTITLE_EXTENSIONS:
                if sub_ext in filename:
                    is_sub = sub_ext
                    break
            if not is_sub:
                logger.debug(
                    "unknown subtitle ext int downloaded file name: {}".format(filename)
                )
                return
            logger.debug("Identified {} file".format(is_sub))
            subtitle_content = r.content
        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
-                logger.debug('Could not extract subtitle from %r', archive)
+            logger.debug("Could not extract subtitle from %r", archive)
 def _get_subtitle_from_archive(archive):
-    for name in archive.namelist():
+    extract_subname, max_score = "", -1
    for subname in archive.namelist():
        # discard hidden files
-        if os.path.split(name)[-1].startswith('.'):
+        if os.path.split(subname)[-1].startswith("."):
            continue
        # discard non-subtitle files
-        if not name.lower().endswith(SUBTITLE_EXTENSIONS):
+        if not subname.lower().endswith(SUBTITLE_EXTENSIONS):
            continue
-        return archive.read(name)
+        # prefer ass/ssa subtitles with double languages or simplified chinese
-
+        score = ("ass" in subname or "ssa" in subname) * 1
-    return None
+        if "简体" in subname or "chs" in subname or ".gb." in subname:
            score += 2
        if "繁体" in subname or "cht" in subname or ".big5." in subname:
            pass
        if "chs.eng" in subname or "chs&eng" in subname:
            score += 2
        if "中英" in subname or "简英" in subname or "双语" in subname or "简体&英文" in subname:
            score += 4
        logger.debug("subtitle {}, score: {}".format(subname, score))
        if score > max_score:
            max_score = score
            extract_subname = subname
    return archive.read(extract_subname) if max_score != -1 else None
 def _extract_name(name):
    """ filter out Chinese characters from subtitle names """
    name, suffix = os.path.splitext(name)
    c_pattern = "[\u4e00-\u9fff]"
    e_pattern = "[a-zA-Z]"
    c_indices = [m.start(0) for m in re.finditer(c_pattern, name)]
    e_indices = [m.start(0) for m in re.finditer(e_pattern, name)]
    target, discard = e_indices, c_indices
    if len(target) == 0:
        return ""
    first_target, last_target = target[0], target[-1]
    first_discard = discard[0] if discard else -1
    last_discard = discard[-1] if discard else -1
    if last_discard < first_target:
        new_name = name[first_target:]
    elif last_target < first_discard:
        new_name = name[:first_discard]
    else:
        # try to find maximum continous part
        result, start, end = [0, 1], -1, 0
        while end < len(name):
            while end not in e_indices and end < len(name):
                end += 1
            if end == len(name):
                break
            start = end
            while end not in c_indices and end < len(name):
                end += 1
            if end - start > result[1] - result[0]:
                result = [start, end]
                print(result)
            start = end
            end += 1
        new_name = name[result[0] : result[1]]
    new_name = new_name.strip() + suffix
    return new_name
 def num_to_cn(number):
    """Convert a number between 1 and 99 to Chinese numerals.

    :param number: the number as a string of decimal digits, e.g. ``"12"``;
        leading zeros (``"05"``) are accepted.
    :return: the Chinese numeral string, e.g. ``"十二"`` for ``"12"``.
    :raises AssertionError: if ``number`` is not all digits or is outside
        the range 1-99.
    """
    assert number.isdigit() and 1 <= int(number) <= 99
    # Normalise to canonical ASCII digits without leading zeros; otherwise an
    # input such as "05" passes the check above but has no "0" tens entry in
    # the lookup table below.
    number = str(int(number))
    trans_map = dict(zip("123456789", "一二三四五六七八九"))
    if len(number) == 1:
        return trans_map[number]
    tens, units = number
    # 10-19 are written without a leading "一" (十, 十一, ...).
    part1 = "十" if tens == "1" else trans_map[tens] + "十"
    part2 = trans_map[units] if units != "0" else ""
    return part1 + part2