diff --git a/libs/subliminal_patch/providers/zimuku.py b/libs/subliminal_patch/providers/zimuku.py index 0dad58cde..5e3b917e3 100644 --- a/libs/subliminal_patch/providers/zimuku.py +++ b/libs/subliminal_patch/providers/zimuku.py @@ -4,6 +4,13 @@ import io import logging import os import zipfile +import re +import copy + +try: + from urlparse import urljoin +except ImportError: + from urllib.parse import urljoin import rarfile from subzero.language import Language @@ -13,7 +20,12 @@ from six import text_type from subliminal import __short_version__ from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.subtitle import ( + SUBTITLE_EXTENSIONS, + Subtitle, + fix_line_ending, + guess_matches, +) from subliminal.video import Episode, Movie logger = logging.getLogger(__name__) @@ -21,43 +33,50 @@ logger = logging.getLogger(__name__) class ZimukuSubtitle(Subtitle): """Zimuku Subtitle.""" - provider_name = 'zimuku' - def __init__(self, language, page_link, version, download_link): + provider_name = "zimuku" + + def __init__(self, language, page_link, version, session): super(ZimukuSubtitle, self).__init__(language, page_link=page_link) self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'utf-8' + self.hearing_impaired = False + self.encoding = "utf-8" + self.session = session @property def id(self): - return self.download_link + return self.version def get_matches(self, video): matches = set() # episode if isinstance(video, Episode): + # always make year a match + info = guessit(self.version, {"type": "episode"}) + info["year"] = video.year # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) + matches |= guess_matches(video, info, partial=True) # movie elif isinstance(video, Movie): # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) + matches |= guess_matches( + video, guessit(self.version, {"type": "movie"}), partial=True + ) return matches class ZimukuProvider(Provider): """Zimuku Provider.""" - languages = {Language(l) for l in ['zho', 'eng']} - server_url = 'http://www.zimuku.la' - search_url = '/search?q={}' - download_url = 'http://www.zimuku.la/' - - UserAgent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)' + languages = {Language(l) for l in ["zho", "eng"]} + + server_url = "http://www.zimuku.la" + search_url = "/search?q={}" + download_url = "http://www.zimuku.la/" + + UserAgent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)" subtitle_class = ZimukuSubtitle @@ -66,19 +85,52 @@ class ZimukuProvider(Provider): def initialize(self): self.session = Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + self.session.headers["User-Agent"] = "Subliminal/{}".format(__short_version__) def terminate(self): self.session.close() + def _parse_episode_page(self, link): + r = self.session.get(link) + bs_obj = ParserBeautifulSoup( + r.content.decode("utf-8", "ignore"), ["html.parser"] + ) + subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody") + subs = [] + for sub in subs_body.find_all("tr"): + a = sub.find("a") + name = _extract_name(a.text) + name = os.path.splitext(name)[ + 0 + ] # remove ext because it can be an archive type + + language = Language("eng") + for img in sub.find("td", class_="tac lang").find_all("img"): + if ( + "hongkong" in img.attrs["src"] + or "china" in img.attrs["src"] + or "jollyroger" in img.attrs["src"] + ): + language = Language("zho") + break + sub_page_link = urljoin(self.server_url, a.attrs["href"]) + backup_session = copy.deepcopy(self.session) + backup_session.headers["Referer"] = link + + subs.append( + self.subtitle_class(language, sub_page_link, name, backup_session) + ) + + return subs + def query(self, keyword, season=None, episode=None, year=None): params = keyword - if season and episode: - params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode) + if season: + params += ".S{season:02d}".format(season=season) elif year: - params += ' {:4d}'.format(year) + params += " {:4d}".format(year) - logger.debug('Searching subtitles %r', params) + logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type(self.search_url).format(params) @@ -86,45 +138,33 @@ class ZimukuProvider(Provider): r.raise_for_status() if not r.content: - logger.debug('No data returned from provider') + logger.debug("No data returned from provider") return [] - soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) - - for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'): - moviename = entity.text - entity_url = self.server_url + entity['href'] - logger.debug(entity_url) - r = self.session.get(entity_url, timeout=30) - r.raise_for_status() - logger.debug('looking into ' + entity_url) - - soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("div", class_="subs box clearfix") - # loop over subtitles cells - - subs = soup.tbody.find_all("tr") - for sub in subs: - page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8')) - version = sub.a.text.encode('utf-8') or None - if version is None: - version = "" - try: - td = sub.find("td", class_="tac lang") - r2 = td.find_all("img") - langs = [x.get('title').encode('utf-8') for x in r2] - except: - langs = '未知' - name = '%s (%s)' % (version, ",".join(langs)) - - if ('English' in langs) and not(('简体中文' in langs) or ('繁體中文' in langs)): - language = Language('eng') - else: - language = Language('zho') - # read the item - subtitle = self.subtitle_class(language, page_link, version, page_link.replace("detail","dld")) - - logger.debug('Found subtitle %r', subtitle) - subtitles.append(subtitle) + soup = ParserBeautifulSoup( + r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"] + ) + + # non-shooter result page + if soup.find("div", {"class": "item"}): + logger.debug("enter a non-shooter page") + for item in soup.find_all("div", {"class": "item"}): + title_a = item.find("p", class_="tt clearfix").find("a") + if season: + title = title_a.text + season_cn1 = re.search("第(.*)季", title) + if not season_cn1: + season_cn1 = "一" + else: + season_cn1 = season_cn1.group(1).strip() + season_cn2 = num_to_cn(str(season)) + if season_cn1 != season_cn2: + continue + episode_link = self.server_url + title_a.attrs["href"] + new_subs = self._parse_episode_page(episode_link) + subtitles += new_subs + + # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles @@ -140,70 +180,174 @@ class ZimukuProvider(Provider): # query for subtitles with the show_id for title in titles: if isinstance(video, Episode): - subtitles += [s for s in self.query(title, season=video.season, episode=video.episode, - year=video.year) - if s.language in languages] + subtitles += [ + s + for s in self.query( + title, + season=video.season, + episode=video.episode, + year=video.year, + ) + if s.language in languages + ] elif isinstance(video, Movie): - subtitles += [s for s in self.query(title, year=video.year) - if s.language in languages] + subtitles += [ + s + for s in self.query(title, year=video.year) + if s.language in languages + ] return subtitles def download_subtitle(self, subtitle): - if isinstance(subtitle, ZimukuSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, - timeout=30) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) - links = soup.find("div", {"class":"clearfix"}).find_all('a') - # TODO: add settings for choice - - for down_link in links: - url = down_link.get('href').encode('utf-8') - url = self.server_url + url - r = self.session.get(url, headers={'Referer': subtitle.download_link}, - timeout=30) - r.raise_for_status() + def _get_archive_dowload_link(session, sub_page_link): + r = session.get(sub_page_link) + bs_obj = ParserBeautifulSoup( + r.content.decode("utf-8", "ignore"), ["html.parser"] + ) + down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"] + down_page_link = urljoin(sub_page_link, down_page_link) + r = session.get(down_page_link) + bs_obj = ParserBeautifulSoup( + r.content.decode("utf-8", "ignore"), ["html.parser"] + ) + download_link = bs_obj.find("a", {"rel": "nofollow"}) + download_link = download_link.attrs["href"] + download_link = urljoin(sub_page_link, download_link) + return download_link + + # download the subtitle + logger.info("Downloading subtitle %r", subtitle) + self.session = subtitle.session + download_link = _get_archive_dowload_link(self.session, subtitle.page_link) + r = self.session.get(download_link, timeout=30) + r.raise_for_status() + filename = r.headers["Content-Disposition"] - if len(r.content) > 1024: + if not r.content: + logger.debug("Unable to download subtitle. No data returned from provider") + return + + archive_stream = io.BytesIO(r.content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug("Identified rar archive") + if ".rar" not in filename: + logger.debug( + ".rar should be in the downloaded file name: {}".format(filename) + ) + return + archive = rarfile.RarFile(archive_stream) + subtitle_content = _get_subtitle_from_archive(archive) + elif zipfile.is_zipfile(archive_stream): + logger.debug("Identified zip archive") + if ".zip" not in filename: + logger.debug( + ".zip should be in the downloaded file name: {}".format(filename) + ) + return + archive = zipfile.ZipFile(archive_stream) + subtitle_content = _get_subtitle_from_archive(archive) + else: + is_sub = "" + for sub_ext in SUBTITLE_EXTENSIONS: + if sub_ext in filename: + is_sub = sub_ext break + if not is_sub: + logger.debug( + "unknown subtitle ext int downloaded file name: {}".format(filename) + ) + return + logger.debug("Identified {} file".format(is_sub)) + subtitle_content = r.content - archive_stream = io.BytesIO(r.content) - archive = None - if rarfile.is_rarfile(archive_stream): - logger.debug('Identified rar archive') - archive = rarfile.RarFile(archive_stream) - subtitle_content = _get_subtitle_from_archive(archive) - elif zipfile.is_zipfile(archive_stream): - logger.debug('Identified zip archive') - archive = zipfile.ZipFile(archive_stream) - subtitle_content = _get_subtitle_from_archive(archive) - else: - subtitle_content = r.content - - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - else: - logger.debug('Could not extract subtitle from %r', archive) + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug("Could not extract subtitle from %r", archive) def _get_subtitle_from_archive(archive): - for name in archive.namelist(): + extract_subname, max_score = "", -1 + + for subname in archive.namelist(): # discard hidden files - if os.path.split(name)[-1].startswith('.'): + if os.path.split(subname)[-1].startswith("."): continue # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): + if not subname.lower().endswith(SUBTITLE_EXTENSIONS): continue - return archive.read(name) - - return None + # prefer ass/ssa subtitles with double languages or simplified chinese + score = ("ass" in subname or "ssa" in subname) * 1 + if "简体" in subname or "chs" in subname or ".gb." in subname: + score += 2 + if "繁体" in subname or "cht" in subname or ".big5." in subname: + pass + if "chs.eng" in subname or "chs&eng" in subname: + score += 2 + if "中英" in subname or "简英" in subname or "双语" in subname or "简体&英文" in subname: + score += 4 + logger.debug("subtitle {}, score: {}".format(subname, score)) + if score > max_score: + max_score = score + extract_subname = subname + + return archive.read(extract_subname) if max_score != -1 else None + + +def _extract_name(name): + """ filter out Chinese characters from subtitle names """ + name, suffix = os.path.splitext(name) + c_pattern = "[\u4e00-\u9fff]" + e_pattern = "[a-zA-Z]" + c_indices = [m.start(0) for m in re.finditer(c_pattern, name)] + e_indices = [m.start(0) for m in re.finditer(e_pattern, name)] + + target, discard = e_indices, c_indices + + if len(target) == 0: + return "" + + first_target, last_target = target[0], target[-1] + first_discard = discard[0] if discard else -1 + last_discard = discard[-1] if discard else -1 + if last_discard < first_target: + new_name = name[first_target:] + elif last_target < first_discard: + new_name = name[:first_discard] + else: + # try to find maximum continous part + result, start, end = [0, 1], -1, 0 + while end < len(name): + while end not in e_indices and end < len(name): + end += 1 + if end == len(name): + break + start = end + while end not in c_indices and end < len(name): + end += 1 + if end - start > result[1] - result[0]: + result = [start, end] + print(result) + start = end + end += 1 + new_name = name[result[0] : result[1]] + new_name = new_name.strip() + suffix + return new_name + + +def num_to_cn(number): + """ convert numbers(1-99) to Chinese """ + assert number.isdigit() and 1 <= int(number) <= 99 + + trans_map = {n: c for n, c in zip(("123456789"), ("一二三四五六七八九"))} + + if len(number) == 1: + return trans_map[number] + else: + part1 = "十" if number[0] == "1" else trans_map[number[0]] + "十" + part2 = trans_map[number[1]] if number[1] != "0" else "" + return part1 + part2