From 618bddebf97fd962a89a6c9d1f11c19733fbb4d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Bart=C3=ADk?= <63553146+sambartik@users.noreply.github.com> Date: Mon, 1 Nov 2021 04:35:29 +0100 Subject: [PATCH] added more improvement to Titulki provider --- libs/subliminal_patch/providers/titulky.py | 150 ++++++++++++--------- 1 file changed, 84 insertions(+), 66 deletions(-) diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py index 8afd6725d..65953f6ce 100644 --- a/libs/subliminal_patch/providers/titulky.py +++ b/libs/subliminal_patch/providers/titulky.py @@ -4,7 +4,6 @@ from __future__ import absolute_import import io import logging import math -import os import re import zipfile from random import randint @@ -15,13 +14,19 @@ import rarfile from guessit import guessit from requests import Session from requests.adapters import HTTPAdapter -from subliminal import __short_version__ +from requests.exceptions import HTTPError + from subliminal.exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, Error, ProviderError -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending +from subliminal.providers import ParserBeautifulSoup +from subliminal.subtitle import fix_line_ending from subliminal.video import Episode, Movie + +from subliminal_patch.exceptions import ParseResponseError +from subliminal_patch.providers import Provider +from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin from subliminal_patch.score import framerate_equal -from subliminal_patch.subtitle import guess_matches, sanitize +from subliminal_patch.subtitle import Subtitle, guess_matches, sanitize + from subzero.language import Language from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST @@ -36,26 +41,29 @@ class TitulkySubtitle(Subtitle): hash_verifiable = False hearing_impaired_verifiable = False - def __init__(self, sub_id, language, names, season, episode, year, release_info, fps, uploader, approved, page_link, download_link, skip_wrong_fps=False): + def __init__(self, sub_id, imdb_id, language, names, season, episode, year, releases, fps, uploader, approved, page_link, download_link, skip_wrong_fps=False, asked_for_episode=None): super().__init__(language, page_link=page_link) self.names = names self.year = year self.sub_id = sub_id + self.imdb_id = imdb_id self.fps = fps self.season = season self.episode = episode - self.release_info = release_info + self.releases = releases + self.release_info = ', '.join(releases) self.language = language self.approved = approved self.page_link = page_link self.uploader = uploader self.download_link = download_link self.skip_wrong_fps = skip_wrong_fps + self.asked_for_episode = asked_for_episode self.matches = None - + # Try to parse S00E00 string from the main subtitle name - season_episode_string = re.findall('S(\d+)E(\d+)', self.names[0], re.IGNORECASE) + season_episode_string = re.findall(r'S(\d+)E(\d+)', self.names[0], re.IGNORECASE) # If we did not search for subtitles with season and episode numbers in search query, # try to parse it from the main subtitle name that most likely contains it @@ -79,7 +87,11 @@ class TitulkySubtitle(Subtitle): if _type == 'episode': ## EPISODE - + + # match imdb_id of a series + if video.series_imdb_id and video.series_imdb_id == self.imdb_id: + matches.add('series_imdb_id') + # match season/episode if self.season and self.season == video.season: matches.add('season') @@ -94,11 +106,15 @@ class TitulkySubtitle(Subtitle): # match episode title episode_titles = [video.title] if _contains_element(_from=episode_titles, _in=self.names): - matches.add('title') + matches.add('episode_title') elif _type == 'movie': ## MOVIE + # match imdb_id of a movie + if video.imdb_id and video.imdb_id == self.imdb_id: + matches.add('imdb_id') + # match movie title video_titles = [video.title] + video.alternative_titles if _contains_element(_from=video_titles, _in=self.names): @@ -110,8 +126,9 @@ class TitulkySubtitle(Subtitle): if video.year and video.year == self.year: matches.add('year') - # match other properties based on release info - matches |= guess_matches(video, guessit(self.release_info, {"type": _type})) + # match other properties based on release infos + for release in self.releases: + matches |= guess_matches(video, guessit(release, {"type": _type})) # If turned on in settings, then do not match if video FPS is not equal to subtitle FPS if self.skip_wrong_fps and video.fps and self.fps and not framerate_equal(video.fps, self.fps): @@ -123,7 +140,7 @@ class TitulkySubtitle(Subtitle): return matches -class TitulkyProvider(Provider): +class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin): """Titulky.com provider""" languages = {Language(l) for l in ['ces', 'slk']} @@ -220,7 +237,7 @@ class TitulkyProvider(Provider): res = self.session.get(url, timeout=self.timeout) if res.status_code != 200: - raise ProviderError(f"Fetch failed with status code {res.status_code}") + raise HTTPError(f"Fetch failed with status code {res.status_code}") if not res.text: raise ProviderError("No response returned from the provider") @@ -235,7 +252,7 @@ class TitulkyProvider(Provider): for key, value in params.items(): result += f'{key}={value}&' - # Remove last & + # Remove the last & result = result[:-1] # Remove spaces @@ -243,23 +260,34 @@ class TitulkyProvider(Provider): return result - # Parse details of an individual subtitle: release, language, uploader, fps and year + # Parse details of an individual subtitle: imdb_id, release, language, uploader, fps and year def parse_details(self, url): html_src = self.fetch_page(url) details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser']) details_container = details_page_soup.find('div', class_='detail') if not details_container: - # The subtitles were removed and got redirected to a different page. Better treat this silently. + # The subtitles could be removed and got redirected to a different page. Better treat this silently. logger.debug("Titulky.com: Could not find details div container. Skipping.") return False + ### IMDB ID + imdb_id = None + imdb_tag = details_container.find('a', attrs={'target': 'imdb'}) + + if imdb_tag: + imdb_url = imdb_tag.get('href') + imdb_id = re.findall(r'tt(\d+)', imdb_url)[0] + + if not imdb_id: + logger.debug("Titulky.com: No IMDB ID supplied on details page.") + ### RELEASE release = None release_tag = details_container.find('div', class_='releas') if not release_tag: - raise Error("Could not find release tag. Did the HTML source change?") + raise ParseResponseError("Could not find release tag. Did the HTML source change?") release = release_tag.get_text(strip=True) @@ -284,12 +312,12 @@ class TitulkyProvider(Provider): uploader_tag = details_container.find('div', class_='ulozil') if not uploader_tag: - raise Error("Could not find uploader tag. Did the HTML source change?") + raise ParseResponseError("Could not find uploader tag. Did the HTML source change?") uploader_anchor_tag = uploader_tag.find('a') if not uploader_anchor_tag: - raise Error("Could not find uploader anchor tag. Did the HTML source change?") + raise ParseResponseError("Could not find uploader anchor tag. Did the HTML source change?") uploader = uploader_anchor_tag.string.strip() if uploader_anchor_tag else None @@ -301,11 +329,11 @@ class TitulkyProvider(Provider): fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']') if not fps_icon_tag_selection and not hasattr(fps_icon_tag_selection[0], 'parent'): - raise Error("Could not find parent of the fps icon tag. Did the HTML source change?") + raise ParseResponseError("Could not find parent of the fps icon tag. Did the HTML source change?") fps_icon_tag = fps_icon_tag_selection[0] parent_text = fps_icon_tag.parent.get_text(strip=True) - match = re.findall('(\d+,\d+) fps', parent_text) + match = re.findall(r'(\d+,\d+) fps', parent_text) # If the match is found, change the decimal separator to a dot and convert to float fps = float(match[0].replace(',', '.')) if len(match) > 0 else None @@ -318,7 +346,7 @@ class TitulkyProvider(Provider): h1_tag = details_container.find('h1', id='titulky') if not h1_tag: - raise Error("Could not find h1 tag. Did the HTML source change?") + raise ParseResponseError("Could not find h1 tag. Did the HTML source change?") # The h1 tag contains the name of the subtitle and a year h1_texts = [text for text in h1_tag.stripped_strings] @@ -334,11 +362,12 @@ class TitulkyProvider(Provider): # Return the subtitle details return { - 'release': release, + 'releases': [release], 'language': language, 'uploader': uploader, 'fps': fps, - 'year': year + 'year': year, + 'imdb_id': imdb_id } def process_row(self, row, video_names, thread_id=None, threads_data=None): @@ -347,7 +376,7 @@ class TitulkyProvider(Provider): anchor_tag = row.find_all('a')[1] # The details link is relative, so we need to remove the dot at the beginning details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}" - id_match = re.findall('id=(\d+)', details_link) + id_match = re.findall(r'id=(\d+)', details_link) sub_id = id_match[0] if len(id_match) > 0 else None download_link = f"{self.download_url}{sub_id}" @@ -492,13 +521,13 @@ class TitulkyProvider(Provider): table = search_page_soup.find('table', class_='table') if not table: logger.debug("Titulky.com: Could not find table") - raise Error("Could not find table. Did the HTML source change?") + raise ParseResponseError("Could not find table. Did the HTML source change?") # Get table body containing rows of subtitles table_body = table.find('tbody') if not table_body: logger.debug("Titulky.com: Could not find table body") - raise Error("Could not find table body. Did the HTML source change?") + raise ParseResponseError("Could not find table body. Did the HTML source change?") ## Loop over all subtitles on the first page and put them in a list subtitles = [] @@ -514,9 +543,12 @@ class TitulkyProvider(Provider): # and we can instationate it and add it to the list if sub_info: logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, row: {i}") + + # If we found the subtitle by IMDB ID, no need to get it from details page + sub_imdb_id = imdb_id or sub_info['imdb_id'] - subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], season, episode, sub_info['year'], sub_info['release'], sub_info['fps'], - sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps) + subtitle_instance = self.subtitle_class(sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season, episode, sub_info['year'], sub_info['releases'], sub_info['fps'], + sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps, asked_for_episode=(type == 'episode')) subtitles.append(subtitle_instance) else: # No subtitle info was returned, i. e. something unexpected @@ -558,7 +590,7 @@ class TitulkyProvider(Provider): # If the thread returned didn't return anything, but expected a dict object if not thread_data: - raise Error(f"No data returned from thread ID: {i}") + raise ProviderError(f"No data returned from thread ID: {i}") # If an exception was raised in a thread, raise it again here if 'exception' in thread_data and thread_data['exception']: @@ -571,8 +603,11 @@ class TitulkyProvider(Provider): logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, thread ID: {i}") sub_info = thread_data['sub_info'] - subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], season, episode, sub_info['year'], sub_info['release'], sub_info['fps'], - sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps) + # If we found the subtitle by IMDB ID, no need to get it from details page + sub_imdb_id = imdb_id or sub_info['imdb_id'] + + subtitle_instance = self.subtitle_class(sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season, episode, sub_info['year'], sub_info['releases'], sub_info['fps'], + sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps, asked_for_episode=(type == 'episode')) subtitles.append(subtitle_instance) else: # The thread returned data, but it didn't contain a subtitle info, i. e. something unexpected @@ -639,51 +674,34 @@ class TitulkyProvider(Provider): return subtitles -# The rest is mostly old code from original implementation. Might want to redo it. def download_subtitle(self, subtitle): res = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, - timeout=self.timeout) - res.raise_for_status() + timeout=self.timeout) + + try: + res.raise_for_status() + except: + raise HTTPError(f"An error occured during the download request to {subtitle.download_link}") archive_stream = io.BytesIO(res.content) archive = None if rarfile.is_rarfile(archive_stream): logger.debug("Titulky.com: Identified rar archive") archive = rarfile.RarFile(archive_stream) - subtitle_content = _get_subtitle_from_archive(archive) + subtitle_content = self.get_subtitle_from_archive(subtitle, archive) elif zipfile.is_zipfile(archive_stream): logger.debug("Titulky.com: Identified zip archive") archive = zipfile.ZipFile(archive_stream) - subtitle_content = _get_subtitle_from_archive(archive) - else: - subtitle_content = res.content - - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - return subtitle_content + subtitle_content = self.get_subtitle_from_archive(subtitle, archive) else: - logger.debug(f"Titulky.com: Could not extract subtitle from {archive}") - -def _get_subtitle_from_archive(archive): - if '_info.txt' in archive.namelist(): - info_content_binary = archive.read('_info.txt') - info_content = info_content_binary.decode(chardet.detect(info_content_binary)['encoding']) - if "nestaženo - překročen limit" in info_content: - raise DownloadLimitExceeded("The download limit has been exceeded") - - for name in archive.namelist(): - # discard hidden files - if os.path.split(name)[-1].startswith('.'): - continue + subtitle_content = fix_line_ending(res.content) - # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): - continue + if not subtitle_content: + logger.debug("Titulky.com: No subtitle content found. The downloading limit has been most likely exceeded.") + raise DownloadLimitExceeded("Subtitles download limit has been exceeded") + + subtitle.content = subtitle_content - return archive.read(name) - - return None - # Check if any element from source array is **contained** in any element from target array # Returns on the first match def _contains_element(_from=None, _in=None): @@ -695,4 +713,4 @@ def _contains_element(_from=None, _in=None): if sanitize(source) in sanitize(target): return True - return False \ No newline at end of file + return False