Fix zimuku provider

5 years ago · b12cb42146
parent fd339b5fff
commit b12cb42146
1 changed files with 259 additions and 107 deletions
--- a/libs/subliminal_patch/providers/zimuku.py
+++ b/libs/subliminal_patch/providers/zimuku.py
@@ -4,6 +4,13 @@ import io
 import logging
 import os
 import zipfile
+import re
+import copy
+
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin

 import rarfile
 from subzero.language import Language
@@ -13,7 +20,12 @@ from six import text_type

 from subliminal import __short_version__
 from subliminal.providers import ParserBeautifulSoup, Provider
-from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches
+from subliminal.subtitle import (
+    SUBTITLE_EXTENSIONS,
+    Subtitle,
+    fix_line_ending,
+    guess_matches,
+)
 from subliminal.video import Episode, Movie

 logger = logging.getLogger(__name__)
@@ -21,43 +33,50 @@ logger = logging.getLogger(__name__)

 class ZimukuSubtitle(Subtitle):
    """Zimuku Subtitle."""
-    provider_name = 'zimuku'

-    def __init__(self, language, page_link, version, download_link):
+    provider_name = "zimuku"
+
+    def __init__(self, language, page_link, version, session):
        super(ZimukuSubtitle, self).__init__(language, page_link=page_link)
        self.version = version
-        self.download_link = download_link
-        self.hearing_impaired = None
-        self.encoding = 'utf-8'
+        self.hearing_impaired = False
+        self.encoding = "utf-8"
+        self.session = session

    @property
    def id(self):
-        return self.download_link
+        return self.version

    def get_matches(self, video):
        matches = set()

        # episode
        if isinstance(video, Episode):
+            # always make year a match
+            info = guessit(self.version, {"type": "episode"})
+            info["year"] = video.year
            # other properties
-            matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True)
+            matches |= guess_matches(video, info, partial=True)
        # movie
        elif isinstance(video, Movie):
            # other properties
-            matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True)
+            matches |= guess_matches(
+                video, guessit(self.version, {"type": "movie"}), partial=True
+            )

        return matches


 class ZimukuProvider(Provider):
    """Zimuku Provider."""
-    languages = {Language(l) for l in ['zho', 'eng']}

-    server_url = 'http://www.zimuku.la'
-    search_url = '/search?q={}'
-    download_url = 'http://www.zimuku.la/'
+    languages = {Language(l) for l in ["zho", "eng"]}
+
+    server_url = "http://www.zimuku.la"
+    search_url = "/search?q={}"
+    download_url = "http://www.zimuku.la/"

-    UserAgent  = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
+    UserAgent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"

    subtitle_class = ZimukuSubtitle

@@ -66,19 +85,60 @@ class ZimukuProvider(Provider):

    def initialize(self):
        self.session = Session()
-        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)
+        self.session.headers["User-Agent"] = "Subliminal/{}".format(__short_version__)

    def terminate(self):
        self.session.close()

+    def _parse_episode_page(self, link):
+        """Return the subtitles listed on the title page ``link``, best first.
+
+        Rows are ranked by a language score summed from their flag icons; a
+        page without a subtitle table yields an empty list.
+        """
+        r = self.session.get(link, timeout=30)
+        r.raise_for_status()
+        html = r.content.decode("utf-8", "ignore")
+        bs_obj = ParserBeautifulSoup(html, ["html.parser"])
+        subs_box = bs_obj.find("div", class_="subs box clearfix")
+        subs_body = subs_box.find("tbody") if subs_box is not None else None
+        if subs_body is None:
+            logger.debug("No subtitle table found on %s", link)
+            return []

+        flag_scores = (("uk", 1), ("hongkong", 2), ("china", 4), ("jollyroger", 8))
+        scored_subs = []
+        for sub in subs_body.find_all("tr"):
+            a, lang_td = sub.find("a"), sub.find("td", class_="tac lang")
+            if a is None or lang_td is None:
+                continue  # not a subtitle row
+            name = os.path.splitext(_extract_name(a.text))[0]  # ext may be .zip/.rar
+            lan_score = 0
+            for img in lang_td.find_all("img"):
+                src = img.attrs.get("src", "")
+                lan_score += next((n for flag, n in flag_scores if flag in src), 0)

+            # a lone UK flag means English only; anything else counts as Chinese
+            language = Language("eng" if lan_score == 1 else "zho")
+            # own session per subtitle, so the Referer is still set at download time
+            backup_session = copy.deepcopy(self.session)
+            backup_session.headers["Referer"] = link
+            page_link = urljoin(self.server_url, a.attrs["href"])
+            subtitle = self.subtitle_class(language, page_link, name, backup_session)
+            scored_subs.append((lan_score, subtitle))

+        # prefer double languages; the sort is stable, so ties keep page order
+        scored_subs.sort(key=lambda pair: pair[0], reverse=True)
+        return [subtitle for _, subtitle in scored_subs]
+
    def query(self, keyword, season=None, episode=None, year=None):
        params = keyword
-        if season and episode:
-            params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
+        if season:
+            params += ".S{season:02d}".format(season=season)
        elif year:
-            params += ' {:4d}'.format(year)
+            params += " {:4d}".format(year)

-        logger.debug('Searching subtitles %r', params)
+        logger.debug("Searching subtitles %r", params)
        subtitles = []
        search_link = self.server_url + text_type(self.search_url).format(params)

@@ -86,45 +146,33 @@ class ZimukuProvider(Provider):
        r.raise_for_status()

        if not r.content:
-            logger.debug('No data returned from provider')
+            logger.debug("No data returned from provider")
            return []

-        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
-
-        for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'):
-            moviename = entity.text
-            entity_url = self.server_url + entity['href']
-            logger.debug(entity_url)
-            r = self.session.get(entity_url, timeout=30)
-            r.raise_for_status()
-            logger.debug('looking into ' + entity_url)
-
-            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("div", class_="subs box clearfix")
-            # loop over subtitles cells
-
-            subs = soup.tbody.find_all("tr")
-            for sub in subs:
-                page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8'))
-                version = sub.a.text.encode('utf-8') or None
-                if version is None:
-                    version = ""
-                try:
-                    td = sub.find("td", class_="tac lang")
-                    r2 = td.find_all("img")
-                    langs = [x.get('title').encode('utf-8') for x in r2]
-                except:
-                    langs = '未知' 
-                name = '%s (%s)' % (version, ",".join(langs))
-
-                if ('English' in langs) and not(('简体中文' in langs) or ('繁體中文' in langs)):
-                    language = Language('eng')
+        soup = ParserBeautifulSoup(
+            r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]
+        )
+
+        # non-shooter result page
+        if soup.find("div", {"class": "item"}):
+            logger.debug("enter a non-shooter page")
+            for item in soup.find_all("div", {"class": "item"}):
+                title_a = item.find("p", class_="tt clearfix").find("a")
+                if season:
+                    title = title_a.text
+                    season_cn1 = re.search("第(.*)季", title)
+                    if not season_cn1:
+                        season_cn1 = "一"
                    else:
-                    language = Language('zho')
-                # read the item
-                subtitle = self.subtitle_class(language, page_link, version, page_link.replace("detail","dld"))
+                        season_cn1 = season_cn1.group(1).strip()
+                    season_cn2 = num_to_cn(str(season))
+                    if season_cn1 != season_cn2:
+                        continue
+                episode_link = self.server_url + title_a.attrs["href"]
+                new_subs = self._parse_episode_page(episode_link)
+                subtitles += new_subs

-                logger.debug('Found subtitle %r', subtitle)
-                subtitles.append(subtitle)
+        # NOTE: shooter result pages are ignored due to the existence of assrt provider

        return subtitles

@@ -140,70 +188,174 @@ class ZimukuProvider(Provider):
        # query for subtitles with the show_id
        for title in titles:
            if isinstance(video, Episode):
-                subtitles += [s for s in self.query(title, season=video.season, episode=video.episode,
-                                                    year=video.year)
-                              if s.language in languages]
+                subtitles += [
+                    s
+                    for s in self.query(
+                        title,
+                        season=video.season,
+                        episode=video.episode,
+                        year=video.year,
+                    )
+                    if s.language in languages
+                ]
            elif isinstance(video, Movie):
-                subtitles += [s for s in self.query(title, year=video.year)
-                              if s.language in languages]
+                subtitles += [
+                    s
+                    for s in self.query(title, year=video.year)
+                    if s.language in languages
+                ]

        return subtitles

    def download_subtitle(self, subtitle):
-        if isinstance(subtitle, ZimukuSubtitle):
+        def _get_archive_dowload_link(session, sub_page_link):
+            """Follow a subtitle detail page to the URL of its archive file.
+
+            Two hops: the detail page's ``a#down1`` anchor leads to a download
+            page, whose first ``rel="nofollow"`` anchor is the file itself.
+            """
+            link = sub_page_link
+            for attrs in ({"id": "down1"}, {"rel": "nofollow"}):
+                r = session.get(link, timeout=30)
+                r.raise_for_status()  # fail on HTTP errors, not while parsing
+                html = r.content.decode("utf-8", "ignore")
+                anchor = ParserBeautifulSoup(html, ["html.parser"]).find("a", attrs)
+                # hrefs may be relative; both are resolved against the detail page
+                link = urljoin(sub_page_link, anchor.attrs["href"])
+            return link
+
        # download the subtitle
-            logger.info('Downloading subtitle %r', subtitle)
-            r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link},
-                                 timeout=30)
+        logger.info("Downloading subtitle %r", subtitle)
+        self.session = subtitle.session
+        download_link = _get_archive_dowload_link(self.session, subtitle.page_link)
+        r = self.session.get(download_link, timeout=30)
        r.raise_for_status()
+        filename = r.headers["Content-Disposition"]

        if not r.content:
-                logger.debug('Unable to download subtitle. No data returned from provider')
+            logger.debug("Unable to download subtitle. No data returned from provider")
            return

-            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
-            links = soup.find("div", {"class":"clearfix"}).find_all('a')
-            # TODO: add settings for choice
-
-            for down_link in links:
-                url = down_link.get('href').encode('utf-8')
-                url = self.server_url + url
-                r = self.session.get(url, headers={'Referer': subtitle.download_link},
-                                 timeout=30)
-                r.raise_for_status()
-
-                if len(r.content) > 1024:
-                    break
-
        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
-                logger.debug('Identified rar archive')
+            logger.debug("Identified rar archive")
+            if ".rar" not in filename:
+                logger.debug(
+                    ".rar should be in the downloaded file name: {}".format(filename)
+                )
+                return
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
-                logger.debug('Identified zip archive')
+            logger.debug("Identified zip archive")
+            if ".zip" not in filename:
+                logger.debug(
+                    ".zip should be in the downloaded file name: {}".format(filename)
+                )
+                return
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
+            is_sub = ""
+            for sub_ext in SUBTITLE_EXTENSIONS:
+                if sub_ext in filename:
+                    is_sub = sub_ext
+                    break
+            if not is_sub:
+                logger.debug(
+                    "unknown subtitle ext int downloaded file name: {}".format(filename)
+                )
+                return
+            logger.debug("Identified {} file".format(is_sub))
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
-                logger.debug('Could not extract subtitle from %r', archive)
+            logger.debug("Could not extract subtitle from %r", archive)


 def _get_subtitle_from_archive(archive):
-    for name in archive.namelist():
+    extract_subname, max_score = "", -1
+
+    for subname in archive.namelist():
        # discard hidden files
-        if os.path.split(name)[-1].startswith('.'):
+        if os.path.split(subname)[-1].startswith("."):
            continue

        # discard non-subtitle files
-        if not name.lower().endswith(SUBTITLE_EXTENSIONS):
+        if not subname.lower().endswith(SUBTITLE_EXTENSIONS):
            continue

-        return archive.read(name)
-
-    return None
+        # prefer ass/ssa subtitles with double languages or simplified chinese
+        score = ("ass" in subname or "ssa" in subname) * 1
+        if "简体" in subname or "chs" in subname or ".gb." in subname:
+            score += 2
+        if "繁体" in subname or "cht" in subname or ".big5." in subname:
+            pass
+        if "chs.eng" in subname or "chs&eng" in subname:
+            score += 2
+        if "中英" in subname or "简英" in subname or "双语" in subname or "简体&英文" in subname:
+            score += 4
+        logger.debug("subtitle {}, score: {}".format(subname, score))
+        if score > max_score:
+            max_score = score
+            extract_subname = subname
+
+    return archive.read(extract_subname) if max_score != -1 else None
+
+
+def _extract_name(name):
+    """Strip Chinese text from a subtitle name, keeping its Latin-script part.
+
+    The extension is split off first and re-appended to the result. Depending
+    on where the CJK characters (U+4E00-U+9FFF) sit relative to the ASCII
+    letters, the kept part is:
+
+    * everything from the first letter on, when no CJK character follows it;
+    * everything before the first CJK character, when no letter follows it;
+    * otherwise the longest run that starts at a letter and contains no CJK
+      character (the first such run wins a tie).
+
+    :param name: subtitle name as shown on the site, extension included.
+    :return: the cleaned name plus the original extension, or ``""`` when the
+        name contains no ASCII letter at all.
+    """
+    name, suffix = os.path.splitext(name)
+    c_indices = [m.start(0) for m in re.finditer("[\u4e00-\u9fff]", name)]
+    e_indices = [m.start(0) for m in re.finditer("[a-zA-Z]", name)]
+    if not e_indices:
+        return ""
+
+    first_target, last_target = e_indices[0], e_indices[-1]
+    # -1 stands for "no CJK character", which always selects the first branch
+    first_discard = c_indices[0] if c_indices else -1
+    last_discard = c_indices[-1] if c_indices else -1
+    if last_discard < first_target:
+        new_name = name[first_target:]
+    elif last_target < first_discard:
+        new_name = name[:first_discard]
+    else:
+        # Letters and CJK characters interleave: keep the longest CJK-free run
+        # that starts at a letter. A run ends right before the next CJK
+        # character, so consecutive matches are exactly the candidate runs, and
+        # max() returns the first of equally long ones. At least one run
+        # exists here because the name contains a letter.
+        runs = re.finditer("[a-zA-Z][^\u4e00-\u9fff]*", name)
+        new_name = max(runs, key=lambda m: m.end() - m.start()).group(0)
+    return new_name.strip() + suffix
+
+
+def num_to_cn(number):
+    """Convert a decimal string between 1 and 99 to Chinese numerals.
+
+    Accepts zero-padded input ("05"); raises ValueError outside 1-99.
+    """
+    if not number.isdigit() or not 1 <= int(number) <= 99:
+        raise ValueError("expected a number between 1 and 99: {!r}".format(number))
+    tens, ones = divmod(int(number), 10)
+    digits = "一二三四五六七八九"
+    # 10-19 drop the leading "一": 12 is "十二", not "一十二"
+    part1 = "" if tens == 0 else ("十" if tens == 1 else digits[tens - 1] + "十")
+    return part1 + (digits[ones - 1] if ones else "")