From 5c8434352774184811ccc1af1ebeb31555121af8 Mon Sep 17 00:00:00 2001 From: Halali Date: Thu, 4 Apr 2019 08:25:59 +0200 Subject: [PATCH 01/19] Change upgrade subtitles notifications. --- bazarr/get_subtitle.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index 1dd71df4d..d2a088397 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -766,11 +766,10 @@ def upgrade_subtitles(): providers_list = get_providers() providers_auth = get_providers_auth() - for episode in episodes_to_upgrade: + for i, episode in enumerate(episodes_to_upgrade, 1): if episode[1] in ast.literal_eval(str(episode[9])): - notifications.write( - msg='Searching to upgrade ' + str(language_from_alpha2(episode[1])) + ' subtitles for this episode: ' + - path_replace(episode[0]), queue='get_subtitle') + notifications.write(msg='Upgrading series subtitles : ' + str(i) + '/' + str(len(episodes_to_upgrade)), + queue='get_subtitle') result = download_subtitle(path_replace(episode[0]), str(alpha3_from_alpha2(episode[1])), episode[3], providers_list, providers_auth, str(episode[4]), episode[5], 'series', forced_minimum_score=int(episode[2]), is_upgrade=True) @@ -784,11 +783,10 @@ def upgrade_subtitles(): history_log(3, episode[6], episode[7], message, path, language_code, provider, score) send_notifications(episode[6], episode[7], message) - for movie in movies_to_upgrade: + for i, movie in enumerate(movies_to_upgrade, 1): if movie[1] in ast.literal_eval(str(movie[8])): - notifications.write( - msg='Searching to upgrade ' + str(language_from_alpha2(movie[1])) + ' subtitles for this movie: ' + - path_replace_movie(movie[0]), queue='get_subtitle') + notifications.write(msg='Upgrading movie subtitles : ' + str(i) + '/' + str(len(movies_to_upgrade)), + queue='get_subtitle') result = download_subtitle(path_replace_movie(movie[0]), str(alpha3_from_alpha2(movie[1])), movie[3], providers_list, providers_auth, str(movie[4]), movie[5], 'movie', forced_minimum_score=int(movie[2]), is_upgrade=True) From 59e751a4c340b62928134779a703daaa6f5e0016 Mon Sep 17 00:00:00 2001 From: Halali Date: Thu, 4 Apr 2019 15:21:30 +0200 Subject: [PATCH 02/19] Add logger file handler before setLevels --- bazarr/logger.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bazarr/logger.py b/bazarr/logger.py index 3507767bd..e95d2259f 100644 --- a/bazarr/logger.py +++ b/bazarr/logger.py @@ -67,6 +67,8 @@ def configure_logging(debug=False): fh.setFormatter(f) fh.addFilter(BlacklistFilter()) fh.addFilter(PublicIPFilter()) + fh.setLevel(log_level) + logger.addHandler(fh) if debug: logging.getLogger("apscheduler").setLevel(logging.DEBUG) @@ -90,8 +92,7 @@ def configure_logging(debug=False): logging.getLogger("rebulk").setLevel(logging.WARNING) logging.getLogger("stevedore.extension").setLevel(logging.CRITICAL) logging.getLogger("geventwebsocket.handler").setLevel(logging.WARNING) - fh.setLevel(log_level) - logger.addHandler(fh) + class MyFilter(logging.Filter): From e2500c9f84bdf2d772a5ab0643884a96c0be69d1 Mon Sep 17 00:00:00 2001 From: Halali Date: Thu, 4 Apr 2019 15:29:08 +0200 Subject: [PATCH 03/19] Add long duration notification --- bazarr/get_subtitle.py | 11 +++++++---- views/menu.tpl | 23 +++++++++++++---------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index d2a088397..1e617e020 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -765,11 +765,14 @@ def 
upgrade_subtitles(): providers_list = get_providers() providers_auth = get_providers_auth() + + count_episode_to_upgrade = len(episodes_to_upgrade) + count_movie_to_upgrade = len(movies_to_upgrade) for i, episode in enumerate(episodes_to_upgrade, 1): if episode[1] in ast.literal_eval(str(episode[9])): - notifications.write(msg='Upgrading series subtitles : ' + str(i) + '/' + str(len(episodes_to_upgrade)), - queue='get_subtitle') + notifications.write(msg='Upgrading series subtitles : ' + str(i) + '/' + str(count_episode_to_upgrade), + queue='get_subtitle', duration='long') result = download_subtitle(path_replace(episode[0]), str(alpha3_from_alpha2(episode[1])), episode[3], providers_list, providers_auth, str(episode[4]), episode[5], 'series', forced_minimum_score=int(episode[2]), is_upgrade=True) @@ -785,8 +788,8 @@ def upgrade_subtitles(): for i, movie in enumerate(movies_to_upgrade, 1): if movie[1] in ast.literal_eval(str(movie[8])): - notifications.write(msg='Upgrading movie subtitles : ' + str(i) + '/' + str(len(movies_to_upgrade)), - queue='get_subtitle') + notifications.write(msg='Upgrading movie subtitles : ' + str(i) + '/' + str(count_movie_to_upgrade), + queue='get_subtitle', duration='long') result = download_subtitle(path_replace_movie(movie[0]), str(alpha3_from_alpha2(movie[1])), movie[3], providers_list, providers_auth, str(movie[4]), movie[5], 'movie', forced_minimum_score=int(movie[2]), is_upgrade=True) diff --git a/views/menu.tpl b/views/menu.tpl index 4dfdc9f1b..62efb2943 100644 --- a/views/menu.tpl +++ b/views/menu.tpl @@ -244,17 +244,20 @@ url: url_notifications, success: function (data) { if (data !== "") { - data = JSON.parse(data); - var msg = data[0]; - var type = data[1]; - var duration = data[2]; - var button = data[3]; - var queue = data[4]; + data = JSON.parse(data); + var msg = data[0]; + var type = data[1]; + var duration = data[2]; + var button = data[3]; + var queue = data[4]; - if (duration === 'temporary') { - timeout = 3000; - killer = queue; - } else { + if (duration === 'temporary') { + timeout = 3000; + killer = queue; + } else if (duration === 'long') { + timeout = 15000; + killer = queue; + } else { timeout = false; killer = false; } From 6c4c124ae434a812f5291858297edb20cb3b7adf Mon Sep 17 00:00:00 2001 From: panni Date: Thu, 4 Apr 2019 17:01:37 +0200 Subject: [PATCH 04/19] core: update to subliminal_patch:head; fix subscene; add alternative titles support to subscene and opensubtitles --- libs/subliminal_patch/core.py | 10 +++ libs/subliminal_patch/http.py | 14 +++- .../providers/opensubtitles.py | 29 ++++++-- libs/subliminal_patch/providers/subscene.py | 71 +++++++++++++------ libs/subliminal_patch/subtitle.py | 8 ++- 5 files changed, 97 insertions(+), 35 deletions(-) diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 5dda9fb3c..df38b4e09 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -518,10 +518,20 @@ def scan_video(path, dont_use_actual_file=False, hints=None, providers=None, ski hints["expected_title"] = [hints["title"]] guessed_result = guessit(guess_from, options=hints) + logger.debug('GuessIt found: %s', json.dumps(guessed_result, cls=GuessitEncoder, indent=4, ensure_ascii=False)) video = Video.fromguess(path, guessed_result) video.hints = hints + # get possibly alternative title from the filename itself + alt_guess = guessit(filename, options=hints) + if "title" in alt_guess and alt_guess["title"] != guessed_result["title"]: + if video_type == "episode": + 
video.alternative_series.append(alt_guess["title"]) + else: + video.alternative_titles.append(alt_guess["title"]) + logger.debug("Adding alternative title: %s", alt_guess["title"]) + if dont_use_actual_file: return video diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index d6fddb358..d859cd31d 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -12,6 +12,7 @@ from requests import Session, exceptions from urllib3.util import connection from retry.api import retry_call from exceptions import APIThrottled +from cfscrape import CloudflareScraper from subzero.lib.io import get_viable_encoding @@ -30,12 +31,19 @@ custom_resolver = dns.resolver.Resolver(configure=False) custom_resolver.nameservers = ['8.8.8.8', '1.1.1.1'] -class CertifiSession(Session): +class CertifiSession(CloudflareScraper): timeout = 10 def __init__(self): super(CertifiSession, self).__init__() self.verify = pem_file + self.headers.update({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Cache-Control': 'no-cache', + 'Pragma': 'no-cache', + 'DNT': '1' + }) def request(self, *args, **kwargs): if kwargs.get('timeout') is None: @@ -47,7 +55,7 @@ class RetryingSession(CertifiSession): proxied_functions = ("get", "post") def __init__(self): - super(CertifiSession, self).__init__() + super(RetryingSession, self).__init__() self.verify = pem_file proxy = os.environ.get('SZ_HTTP_PROXY') @@ -62,7 +70,7 @@ class RetryingSession(CertifiSession): # fixme: may be a little loud logger.debug("Using proxy %s for: %s", self.proxies["http"], args[0]) - return retry_call(getattr(super(CertifiSession, self), method), fargs=args, fkwargs=kwargs, tries=3, delay=5, + return retry_call(getattr(super(RetryingSession, self), method), fargs=args, fkwargs=kwargs, tries=3, delay=5, exceptions=(exceptions.ConnectionError, exceptions.ProxyError, exceptions.SSLError, diff --git a/libs/subliminal_patch/providers/opensubtitles.py b/libs/subliminal_patch/providers/opensubtitles.py index 032b89058..4ce3aacea 100644 --- a/libs/subliminal_patch/providers/opensubtitles.py +++ b/libs/subliminal_patch/providers/opensubtitles.py @@ -11,8 +11,8 @@ from babelfish import language_converters from dogpile.cache.api import NO_VALUE from subliminal.exceptions import ConfigurationError, ServiceUnavailable from subliminal.providers.opensubtitles import OpenSubtitlesProvider as _OpenSubtitlesProvider,\ - OpenSubtitlesSubtitle as _OpenSubtitlesSubtitle, Episode, ServerProxy, Unauthorized, NoSession, \ - DownloadLimitReached, InvalidImdbid, UnknownUserAgent, DisabledUserAgent, OpenSubtitlesError + OpenSubtitlesSubtitle as _OpenSubtitlesSubtitle, Episode, Movie, ServerProxy, Unauthorized, NoSession, \ + DownloadLimitReached, InvalidImdbid, UnknownUserAgent, DisabledUserAgent, OpenSubtitlesError, sanitize from mixins import ProviderRetryMixin from subliminal.subtitle import fix_line_ending from subliminal_patch.http import SubZeroRequestsTransport @@ -45,6 +45,19 @@ class OpenSubtitlesSubtitle(_OpenSubtitlesSubtitle): def get_matches(self, video, hearing_impaired=False): matches = super(OpenSubtitlesSubtitle, self).get_matches(video) + # episode + if isinstance(video, Episode) and self.movie_kind == 'episode': + # series + if video.series and (sanitize(self.series_name) in ( + sanitize(name) for name in [video.series] + video.alternative_series)): + matches.add('series') + # movie + elif isinstance(video, Movie) and self.movie_kind == 'movie': + # 
title + if video.title and (sanitize(self.movie_name) in ( + sanitize(name) for name in [video.title] + video.alternative_titles)): + matches.add('title') + sub_fps = None try: sub_fps = float(self.fps) @@ -205,19 +218,19 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): season = episode = None if isinstance(video, Episode): - query = video.series + query = [video.series] + video.alternative_series season = video.season episode = episode = min(video.episode) if isinstance(video.episode, list) else video.episode if video.is_special: season = None episode = None - query = u"%s %s" % (video.series, video.title) + query = [u"%s %s" % (series, video.title) for series in [video.series] + video.alternative_series] logger.info("%s: Searching for special: %r", self.__class__, query) # elif ('opensubtitles' not in video.hashes or not video.size) and not video.imdb_id: # query = video.name.split(os.sep)[-1] else: - query = video.title + query = [video.title] + video.alternative_titles return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id, query=query, season=season, episode=episode, tag=video.original_name, @@ -238,9 +251,11 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): else: criteria.append({'imdbid': imdb_id[2:]}) if query and season and episode: - criteria.append({'query': query.replace('\'', ''), 'season': season, 'episode': episode}) + for q in query: + criteria.append({'query': q.replace('\'', ''), 'season': season, 'episode': episode}) elif query: - criteria.append({'query': query.replace('\'', '')}) + for q in query: + criteria.append({'query': q.replace('\'', '')}) if not criteria: raise ValueError('Not enough information') diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 38a97c579..7a17e1365 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -5,6 +5,7 @@ import logging import os import time import inflect +import cfscrape from random import randint from zipfile import ZipFile @@ -12,7 +13,9 @@ from zipfile import ZipFile from babelfish import language_converters from guessit import guessit from requests import Session +from dogpile.cache.api import NO_VALUE from subliminal import Episode, ProviderError +from subliminal.cache import region from subliminal.utils import sanitize_release_group from subliminal_patch.providers import Provider from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin @@ -125,6 +128,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): self.session = Session() from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers['Referer'] = "https://subscene.com" def terminate(self): logger.info("Closing session") @@ -197,44 +201,65 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): vfn = get_video_filename(video) subtitles = [] logger.debug(u"Searching for: %s", vfn) + + cf_data = region.get("cf_data") + if cf_data is not NO_VALUE: + cf_cookies, user_agent = cf_data + logger.debug("Trying to use old cf cookies") + self.session.cookies.update(cf_cookies) + self.session.headers['User-Agent'] = user_agent + film = search(vfn, session=self.session) + + try: + cf_data = self.session.get_live_tokens("subscene.com") + except: + pass + else: + logger.debug("Storing cf cookies") + region.set("cf_data", 
cf_data) + if film and film.subtitles: logger.debug('Release results found: %s', len(film.subtitles)) subtitles = self.parse_results(video, film) else: logger.debug('No release results found') + time.sleep(self.search_throttle) + # re-search for episodes without explicit release name if isinstance(video, Episode): #term = u"%s S%02iE%02i" % (video.series, video.season, video.episode) - term = u"%s - %s Season" % (video.series, p.number_to_words("%sth" % video.season).capitalize()) - time.sleep(self.search_throttle) - logger.debug('Searching for alternative results: %s', term) - film = search(term, session=self.session, release=False) - if film and film.subtitles: - logger.debug('Alternative results found: %s', len(film.subtitles)) - subtitles += self.parse_results(video, film) - else: - logger.debug('No alternative results found') - - # packs - if video.season_fully_aired: - term = u"%s S%02i" % (video.series, video.season) - logger.debug('Searching for packs: %s', term) + for series in [video.series] + video.alternative_series: + term = u"%s - %s Season" % (series, p.number_to_words("%sth" % video.season).capitalize()) time.sleep(self.search_throttle) - film = search(term, session=self.session) + logger.debug('Searching for alternative results: %s', term) + film = search(term, session=self.session, release=False) if film and film.subtitles: - logger.debug('Pack results found: %s', len(film.subtitles)) + logger.debug('Alternative results found: %s', len(film.subtitles)) subtitles += self.parse_results(video, film) else: - logger.debug('No pack results found') - else: - logger.debug("Not searching for packs, because the season hasn't fully aired") + logger.debug('No alternative results found') + + # packs + if video.season_fully_aired: + term = u"%s S%02i" % (series, video.season) + logger.debug('Searching for packs: %s', term) + time.sleep(self.search_throttle) + film = search(term, session=self.session) + if film and film.subtitles: + logger.debug('Pack results found: %s', len(film.subtitles)) + subtitles += self.parse_results(video, film) + else: + logger.debug('No pack results found') + else: + logger.debug("Not searching for packs, because the season hasn't fully aired") else: - logger.debug('Searching for movie results: %s', video.title) - film = search(video.title, year=video.year, session=self.session, limit_to=None, release=False) - if film and film.subtitles: - subtitles += self.parse_results(video, film) + for title in [video.title] + video.alternative_titles: + logger.debug('Searching for movie results: %s', title) + film = search(title, year=video.year, session=self.session, limit_to=None, release=False) + if film and film.subtitles: + subtitles += self.parse_results(video, film) logger.info("%s subtitles found" % len(subtitles)) return subtitles diff --git a/libs/subliminal_patch/subtitle.py b/libs/subliminal_patch/subtitle.py index 9a165fe4b..69a3c1e5b 100644 --- a/libs/subliminal_patch/subtitle.py +++ b/libs/subliminal_patch/subtitle.py @@ -38,6 +38,8 @@ class Subtitle(Subtitle_): plex_media_fps = None skip_wrong_fps = False wrong_fps = False + wrong_series = False + wrong_season_ep = False is_pack = False asked_for_release_group = None asked_for_episode = None @@ -356,7 +358,8 @@ def guess_matches(video, guess, partial=False): matches = set() if isinstance(video, Episode): # series - if video.series and 'title' in guess and sanitize(guess['title']) == sanitize(video.series): + if video.series and 'title' in guess and sanitize(guess['title']) in ( + sanitize(name) for name 
in [video.series] + video.alternative_series): matches.add('series') # title if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title): @@ -384,7 +387,8 @@ def guess_matches(video, guess, partial=False): if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') # title - if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title): + if video.title and 'title' in guess and sanitize(guess['title']) in ( + sanitize(name) for name in [video.title] + video.alternative_titles): matches.add('title') # release_group From 1bf40127a0193c7cdbf79b6045a7d59ce7937b91 Mon Sep 17 00:00:00 2001 From: panni Date: Thu, 4 Apr 2019 19:16:31 +0200 Subject: [PATCH 05/19] core: update to subliminal_patch:head; implicitly handle cf --- libs/subliminal_patch/http.py | 40 +++++++++++++++++++-- libs/subliminal_patch/providers/subscene.py | 16 --------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index d859cd31d..465d5555e 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -8,12 +8,19 @@ import requests import xmlrpclib import dns.resolver -from requests import Session, exceptions +from requests import exceptions from urllib3.util import connection from retry.api import retry_call from exceptions import APIThrottled +from dogpile.cache.api import NO_VALUE +from subliminal.cache import region from cfscrape import CloudflareScraper +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse + from subzero.lib.io import get_viable_encoding logger = logging.getLogger(__name__) @@ -45,10 +52,37 @@ class CertifiSession(CloudflareScraper): 'DNT': '1' }) - def request(self, *args, **kwargs): + def request(self, method, url, *args, **kwargs): if kwargs.get('timeout') is None: kwargs['timeout'] = self.timeout - return super(CertifiSession, self).request(*args, **kwargs) + + parsed_url = urlparse(url) + domain = parsed_url.netloc + + cache_key = "cf_data_%s" % domain + + if not self.cookies.get("__cfduid", "", domain=domain) or not self.cookies.get("cf_clearance", "", + domain=domain): + cf_data = region.get(cache_key) + if cf_data is not NO_VALUE: + cf_cookies, user_agent = cf_data + logger.debug("Trying to use old cf data for %s: %s", domain, cf_data) + for cookie, value in cf_cookies.iteritems(): + self.cookies.set(cookie, value, domain=domain) + + self.headers['User-Agent'] = user_agent + + ret = super(CertifiSession, self).request(method, url, *args, **kwargs) + try: + cf_data = self.get_live_tokens(domain) + except: + pass + else: + if cf_data != region.get(cache_key): + logger.debug("Storing cf data for %s: %s", domain, cf_data) + region.set(cache_key, cf_data) + + return ret class RetryingSession(CertifiSession): diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 7a17e1365..d6a294cdb 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -201,24 +201,8 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): vfn = get_video_filename(video) subtitles = [] logger.debug(u"Searching for: %s", vfn) - - cf_data = region.get("cf_data") - if cf_data is not NO_VALUE: - cf_cookies, user_agent = cf_data - logger.debug("Trying to use old cf cookies") - self.session.cookies.update(cf_cookies) - self.session.headers['User-Agent'] = user_agent - film = search(vfn, 
session=self.session) - try: - cf_data = self.session.get_live_tokens("subscene.com") - except: - pass - else: - logger.debug("Storing cf cookies") - region.set("cf_data", cf_data) - if film and film.subtitles: logger.debug('Release results found: %s', len(film.subtitles)) subtitles = self.parse_results(video, film) From f9aae9e10d28018161e57392d997960ead7a4f87 Mon Sep 17 00:00:00 2001 From: Halali Date: Thu, 4 Apr 2019 23:46:14 +0200 Subject: [PATCH 06/19] Add missing cfscrape lib --- libs/cfscrape.py | 279 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 libs/cfscrape.py diff --git a/libs/cfscrape.py b/libs/cfscrape.py new file mode 100644 index 000000000..15986f03a --- /dev/null +++ b/libs/cfscrape.py @@ -0,0 +1,279 @@ +import logging +import random +import time +import re + +# based off of https://gist.github.com/doko-desuka/58d9212461f62583f8df9bc6387fade2 +# and https://github.com/Anorov/cloudflare-scrape +# and https://github.com/VeNoMouS/cloudflare-scrape-js2py + +''''''''' +Disables InsecureRequestWarning: Unverified HTTPS request is being made warnings. +''''''''' +import requests +from requests.packages.urllib3.exceptions import InsecureRequestWarning + +requests.packages.urllib3.disable_warnings(InsecureRequestWarning) +'''''' +from requests.sessions import Session +from copy import deepcopy + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse + +DEFAULT_USER_AGENTS = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36", + "Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" +] + +DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS) + +BUG_REPORT = ( + "Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a " + "bug report at https://github.com/Anorov/cloudflare-scrape/issues.") + + +class CloudflareScraper(Session): + def __init__(self, *args, **kwargs): + super(CloudflareScraper, self).__init__(*args, **kwargs) + + if "requests" in self.headers["User-Agent"]: + # Spoof Firefox on Linux if no custom User-Agent has been set + self.headers["User-Agent"] = random.choice(DEFAULT_USER_AGENTS) + + def request(self, method, url, *args, **kwargs): + resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) + + # Check if Cloudflare anti-bot is on + if (resp.status_code in (503, 429) + and resp.headers.get("Server", "").startswith("cloudflare") + and b"jschl_vc" in resp.content + and b"jschl_answer" in resp.content + ): + return self.solve_cf_challenge(resp, **kwargs) + + # Otherwise, no Cloudflare anti-bot detected + return resp + + def solve_cf_challenge(self, resp, **original_kwargs): + body = resp.text + parsed_url = urlparse(resp.url) + domain = 
parsed_url.netloc + submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) + + cloudflare_kwargs = deepcopy(original_kwargs) + params = cloudflare_kwargs.setdefault("params", {}) + headers = cloudflare_kwargs.setdefault("headers", {}) + headers["Referer"] = resp.url + + try: + cf_delay = float(re.search('submit.*?(\d+)', body, re.DOTALL).group(1)) / 1000.0 + + form_index = body.find('id="challenge-form"') + if form_index == -1: + raise Exception('CF form not found') + sub_body = body[form_index:] + + s_match = re.search('name="s" value="(.+?)"', sub_body) + if s_match: + params["s"] = s_match.group(1) # On older variants this parameter is absent. + params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', sub_body).group(1) + params["pass"] = re.search(r'name="pass" value="(.+?)"', sub_body).group(1) + + if body.find('id="cf-dn-', form_index) != -1: + extra_div_expression = re.search('id="cf-dn-.*?>(.+?)<', sub_body).group(1) + + # Initial value. + js_answer = self.cf_parse_expression( + re.search('setTimeout\(function\(.*?:(.*?)}', body, re.DOTALL).group(1) + ) + # Extract the arithmetic operations. + builder = re.search("challenge-form'\);\s*;(.*);a.value", body, re.DOTALL).group(1) + # Remove a function semicolon before splitting on semicolons, else it messes the order. + lines = builder.replace(' return +(p)}();', '', 1).split(';') + + for line in lines: + if len(line) and '=' in line: + heading, expression = line.split('=', 1) + if 'eval(eval(atob' in expression: + # Uses the expression in an external
<div>
. + expression_value = self.cf_parse_expression(extra_div_expression) + elif '(function(p' in expression: + # Expression + domain sampling function. + expression_value = self.cf_parse_expression(expression, domain) + else: + expression_value = self.cf_parse_expression(expression) + js_answer = self.cf_arithmetic_op(heading[-1], js_answer, expression_value) + + if '+ t.length' in body: + js_answer += len(domain) # Only older variants add the domain length. + + params["jschl_answer"] = '%.10f' % js_answer + + except Exception as e: + # Something is wrong with the page. + # This may indicate Cloudflare has changed their anti-bot + # technique. If you see this and are running the latest version, + # please open a GitHub issue so I can update the code accordingly. + logging.error("[!] %s Unable to parse Cloudflare anti-bots page. " + "Try upgrading cloudflare-scrape, or submit a bug report " + "if you are running the latest version. Please read " + "https://github.com/Anorov/cloudflare-scrape#updates " + "before submitting a bug report." % e) + raise + + # Cloudflare requires a delay before solving the challenge. + # Always wait the full delay + 1s because of 'time.sleep()' imprecision. + time.sleep(cf_delay + 1.0) + + # Requests transforms any request into a GET after a redirect, + # so the redirect has to be handled manually here to allow for + # performing other types of requests even as the first request. + method = resp.request.method + cloudflare_kwargs["allow_redirects"] = False + + redirect = self.request(method, submit_url, **cloudflare_kwargs) + + if 'Location' in redirect.headers: + redirect_location = urlparse(redirect.headers["Location"]) + if not redirect_location.netloc: + redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path) + return self.request(method, redirect_url, **original_kwargs) + return self.request(method, redirect.headers["Location"], **original_kwargs) + else: + return redirect + + def cf_sample_domain_function(self, func_expression, domain): + parameter_start_index = func_expression.find('}(') + 2 + # Send the expression with the "+" char and enclosing parenthesis included, as they are + # stripped inside ".cf_parse_expression()'. + sample_index = self.cf_parse_expression( + func_expression[parameter_start_index: func_expression.rfind(')))')] + ) + return ord(domain[int(sample_index)]) + + def cf_arithmetic_op(self, op, a, b): + if op == '+': + return a + b + elif op == '/': + return a / float(b) + elif op == '*': + return a * float(b) + elif op == '-': + return a - b + else: + raise Exception('Unknown operation') + + def cf_parse_expression(self, expression, domain=None): + + def _get_jsfuck_number(section): + digit_expressions = section.replace('!+[]', '1').replace('+!![]', '1').replace('+[]', '0').split('+') + return int( + # Form a number string, with each digit as the sum of the values inside each parenthesis block. + ''.join( + str(sum(int(digit_char) for digit_char in digit_expression[1:-1])) # Strip the parenthesis. + for digit_expression in digit_expressions + ) + ) + + if '/' in expression: + dividend, divisor = expression.split('/') + dividend = dividend[2:-1] # Strip the leading '+' char and the enclosing parenthesis. + + if domain: + # 2019-04-02: At this moment, this extra domain sampling function always appears on the + # divisor side, at the end. + divisor_a, divisor_b = divisor.split('))+(') + divisor_a = _get_jsfuck_number(divisor_a[5:]) # Left-strip the sequence of "(+(+(". 
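+                # For illustration, _get_jsfuck_number above decodes JSFuck-style digit groups:
+                # "(!+[]+!![])+(!+[])" -> "(11)+(1)" -> per-group digit sums 2 and 1 -> 21.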
+ divisor_b = self.cf_sample_domain_function(divisor_b, domain) + return _get_jsfuck_number(dividend) / float(divisor_a + divisor_b) + else: + divisor = divisor[2:-1] + return _get_jsfuck_number(dividend) / float(_get_jsfuck_number(divisor)) + else: + return _get_jsfuck_number(expression[2:-1]) + + @classmethod + def create_scraper(cls, sess=None, **kwargs): + """ + Convenience function for creating a ready-to-go requests.Session (subclass) object. + """ + scraper = cls() + + if sess: + attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"] + for attr in attrs: + val = getattr(sess, attr, None) + if val: + setattr(scraper, attr, val) + + return scraper + + ## Functions for integrating cloudflare-scrape with other applications and scripts + + @classmethod + def get_tokens(cls, url, user_agent=None, **kwargs): + scraper = cls.create_scraper() + if user_agent: + scraper.headers["User-Agent"] = user_agent + + try: + resp = scraper.get(url, **kwargs) + resp.raise_for_status() + except Exception as e: + logging.error("'%s' returned an error. Could not collect tokens." % url) + raise + + domain = urlparse(resp.url).netloc + cookie_domain = None + + for d in scraper.cookies.list_domains(): + if d.startswith(".") and d in ("." + domain): + cookie_domain = d + break + else: + raise ValueError( + "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?") + + return ({ + "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain), + "cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain) + }, + scraper.headers["User-Agent"] + ) + + def get_live_tokens(self, domain): + for d in self.cookies.list_domains(): + if d.startswith(".") and d in ("." + domain): + cookie_domain = d + break + else: + raise ValueError( + "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?") + + return ({ + "__cfduid": self.cookies.get("__cfduid", "", domain=cookie_domain), + "cf_clearance": self.cookies.get("cf_clearance", "", domain=cookie_domain) + }, + self.headers["User-Agent"] + ) + + @classmethod + def get_cookie_string(cls, url, user_agent=None, **kwargs): + """ + Convenience function for building a Cookie HTTP header value. 
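+        Example (hypothetical URL; the returned User-Agent must be sent together with the cookies):
+
+            cookie_value, user_agent = CloudflareScraper.get_cookie_string("https://example.com/")
+            requests.get("https://example.com/",
+                         headers={"Cookie": cookie_value, "User-Agent": user_agent})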
+ """ + tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs) + return "; ".join("=".join(pair) for pair in tokens.items()), user_agent + + +create_scraper = CloudflareScraper.create_scraper +get_tokens = CloudflareScraper.get_tokens +get_cookie_string = CloudflareScraper.get_cookie_string From 2eeedd5efa9bcbf585c487ed3074dcc0ee007782 Mon Sep 17 00:00:00 2001 From: jonudewux Date: Fri, 5 Apr 2019 13:25:34 +0300 Subject: [PATCH 07/19] subssabbz: Small fix (#380) provider selects wrong link and raise a exception: ValueError: Not a valid archive --- libs/subliminal_patch/providers/subssabbz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/subliminal_patch/providers/subssabbz.py b/libs/subliminal_patch/providers/subssabbz.py index 17df5b975..d3d138884 100644 --- a/libs/subliminal_patch/providers/subssabbz.py +++ b/libs/subliminal_patch/providers/subssabbz.py @@ -118,7 +118,7 @@ class SubsSabBzProvider(Provider): for row in rows[:10]: a_element_wrapper = row.find('td', { 'class': 'c2field' }) if a_element_wrapper: - element = row.find('a') + element = a_element_wrapper.find('a') if element: link = element.get('href') logger.info('Found subtitle link %r', link) From 0d05000e97d95b9f73d983ae1e20acecf60205bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sat, 6 Apr 2019 08:25:27 -0400 Subject: [PATCH 08/19] Initial anticaptcha commit --- bazarr/init.py | 3 + libs/subliminal_patch/providers/addic7ed.py | 91 +++-- .../providers/greeksubtitles.py | 184 ---------- libs/subliminal_patch/providers/subs4free.py | 283 ---------------- .../subliminal_patch/providers/subs4series.py | 272 --------------- libs/subliminal_patch/providers/subssabbz.py | 159 --------- libs/subliminal_patch/providers/subsunacs.py | 161 --------- libs/subliminal_patch/providers/subz.py | 318 ------------------ libs/subliminal_patch/providers/xsubs.py | 302 ----------------- libs/subliminal_patch/refiners/omdb.py | 10 +- 10 files changed, 82 insertions(+), 1701 deletions(-) delete mode 100644 libs/subliminal_patch/providers/greeksubtitles.py delete mode 100644 libs/subliminal_patch/providers/subs4free.py delete mode 100644 libs/subliminal_patch/providers/subs4series.py delete mode 100644 libs/subliminal_patch/providers/subssabbz.py delete mode 100644 libs/subliminal_patch/providers/subsunacs.py delete mode 100644 libs/subliminal_patch/providers/subz.py delete mode 100644 libs/subliminal_patch/providers/xsubs.py diff --git a/bazarr/init.py b/bazarr/init.py index eb3af0ce3..7c13cc24d 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -17,6 +17,9 @@ from get_args import args # set subliminal_patch user agent os.environ["SZ_USER_AGENT"] = "Bazarr/1" +# set anticaptcha account key +os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.general.anticaptcha_key + # Check if args.config_dir exist if not os.path.exists(args.config_dir): # Create config_dir directory tree diff --git a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index 51913d887..086343e98 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -1,13 +1,16 @@ # coding=utf-8 import logging import re +import os import datetime import subliminal import time +import requests + from random import randint from dogpile.cache.api import NO_VALUE from requests import Session - +from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, 
AnticaptchaException from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError from subliminal.providers.addic7ed import Addic7edProvider as _Addic7edProvider, \ Addic7edSubtitle as _Addic7edSubtitle, ParserBeautifulSoup, show_cells_re @@ -15,7 +18,7 @@ from subliminal.cache import region from subliminal.subtitle import fix_line_ending from subliminal_patch.utils import sanitize from subliminal_patch.exceptions import TooManyRequests - +from subliminal_patch.pitcher import pitchers from subzero.language import Language logger = logging.getLogger(__name__) @@ -64,6 +67,7 @@ class Addic7edProvider(_Addic7edProvider): USE_ADDICTED_RANDOM_AGENTS = False hearing_impaired_verifiable = True subtitle_class = Addic7edSubtitle + server_url = 'https://www.addic7ed.com/' sanitize_characters = {'-', ':', '(', ')', '.', '/'} @@ -75,45 +79,90 @@ class Addic7edProvider(_Addic7edProvider): self.session = Session() self.session.headers['User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__ - if self.USE_ADDICTED_RANDOM_AGENTS: - from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST - logger.debug("Addic7ed: using random user agents") - self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] - self.session.headers['Referer'] = self.server_url + from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST + logger.debug("Addic7ed: using random user agents") + self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers['Referer'] = self.server_url # login if self.username and self.password: - ccks = region.get("addic7ed_cookies", expiration_time=86400) + ccks = region.get("addic7ed_data", expiration_time=15552000) # 6m if ccks != NO_VALUE: + cookies, user_agent = ccks + logger.debug("Addic7ed: Re-using previous user agent") + self.session.headers["User-Agent"] = user_agent try: - self.session.cookies._cookies.update(ccks) - r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10) + self.session.cookies._cookies.update(cookies) + r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10, + headers={"Referer": self.server_url}) if r.status_code == 302: logger.info('Addic7ed: Login expired') - region.delete("addic7ed_cookies") + region.delete("addic7ed_data") else: - logger.info('Addic7ed: Reusing old login') + logger.info('Addic7ed: Re-using old login') self.logged_in = True return except: pass logger.info('Addic7ed: Logging in') - data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} - r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, - headers={"Referer": self.server_url + "login.php"}) + data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 'url': '', + 'remember': 'true'} + + tries = 0 + while tries < 3: + r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url}) + if "grecaptcha" in r.content: + logger.info('Addic7ed: Solving captcha. 
This might take a couple of minutes, but should only ' + 'happen once every so often') + anticaptcha_key = os.environ.get("ANTICAPTCHA_ACCOUNT_KEY") + if not anticaptcha_key: + logger.error("AntiCaptcha key not given, exiting") + return + + anticaptcha_proxy = os.environ.get("ANTICAPTCHA_PROXY") + + site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1) + if not site_key: + logger.error("Addic7ed: Captcha site-key not found!") + return - if "relax, slow down" in r.content: - raise TooManyRequests(self.username) + #pitcher_cls = pitchers.get_pitcher("AntiCaptchaProxyLess") + #pitcher = pitcher_cls("Addic7ed", anticaptcha_key, self.server_url + 'login.php', site_key) + pitcher_cls = pitchers.get_pitcher("AntiCaptchaProxyLess") + pitcher = pitcher_cls("Addic7ed", anticaptcha_key, self.server_url + 'login.php', site_key, + user_agent=self.session.headers["User-Agent"], + cookies=self.session.cookies.get_dict(), + is_invisible=True) - if r.status_code != 302: - raise AuthenticationError(self.username) + result = pitcher.throw() + if not result: + raise Exception("Addic7ed: Couldn't solve captcha!") + + data["recaptcha_response"] = result + + r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, + headers={"Referer": self.server_url + "login.php"}) + + if "relax, slow down" in r.content: + raise TooManyRequests(self.username) + + if r.status_code != 302: + if "User doesn't exist" in r.content and tries <= 2: + logger.info("Addic7ed: Error, trying again. (%s/%s)", tries+1, 3) + tries += 1 + continue + + raise AuthenticationError(self.username) + break - region.set("addic7ed_cookies", self.session.cookies._cookies) + region.set("addic7ed_data", (self.session.cookies._cookies, self.session.headers["User-Agent"])) logger.debug('Addic7ed: Logged in') self.logged_in = True + def terminate(self): + pass @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) def _get_show_ids(self): @@ -140,7 +189,7 @@ class Addic7edProvider(_Addic7edProvider): # populate the show ids show_ids = {} - for show in soup.select('td.version > h3 > a[href^="/show/"]'): + for show in soup.select('td > h3 > a[href^="/show/"]'): show_clean = sanitize(show.text, default_characters=self.sanitize_characters) try: show_id = int(show['href'][6:]) diff --git a/libs/subliminal_patch/providers/greeksubtitles.py b/libs/subliminal_patch/providers/greeksubtitles.py deleted file mode 100644 index 98dfc289e..000000000 --- a/libs/subliminal_patch/providers/greeksubtitles.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: utf-8 -*- -import io -import logging -import os -import zipfile - -import rarfile -from subzero.language import Language -from guessit import guessit -from requests import Session -from six import text_type - -from subliminal import __short_version__ -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches -from subliminal.video import Episode, Movie - -logger = logging.getLogger(__name__) - - -class GreekSubtitlesSubtitle(Subtitle): - """GreekSubtitles Subtitle.""" - provider_name = 'greeksubtitles' - - def __init__(self, language, page_link, version, download_link): - super(GreekSubtitlesSubtitle, self).__init__(language, page_link=page_link) - self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'windows-1253' - - @property - def id(self): - return self.download_link - - def get_matches(self, 
video): - matches = set() - - # episode - if isinstance(video, Episode): - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) - # movie - elif isinstance(video, Movie): - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) - - return matches - - -class GreekSubtitlesProvider(Provider): - """GreekSubtitles Provider.""" - languages = {Language(l) for l in ['ell', 'eng']} - server_url = 'http://gr.greek-subtitles.com/' - search_url = 'search.php?name={}' - download_url = 'http://www.greeksubtitles.info/getp.php?id={:d}' - subtitle_class = GreekSubtitlesSubtitle - - def __init__(self): - self.session = None - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) - - def terminate(self): - self.session.close() - - def query(self, keyword, season=None, episode=None, year=None): - params = keyword - if season and episode: - params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode) - elif year: - params += ' {:4d}'.format(year) - - logger.debug('Searching subtitles %r', params) - subtitles = [] - search_link = self.server_url + text_type(self.search_url).format(params) - while True: - r = self.session.get(search_link, timeout=30) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) - - # loop over subtitles cells - for cell in soup.select('td.latest_name > a:nth-of-type(1)'): - # read the item - subtitle_id = int(cell['href'].rsplit('/', 2)[1]) - page_link = cell['href'] - language = Language.fromalpha2(cell.parent.find('img')['src'].split('/')[-1].split('.')[0]) - version = cell.text.strip() or None - if version is None: - version = "" - - subtitle = self.subtitle_class(language, page_link, version, self.download_url.format(subtitle_id)) - - logger.debug('Found subtitle %r', subtitle) - subtitles.append(subtitle) - - anchors = soup.select('td a') - next_page_available = False - for anchor in anchors: - if 'Next' in anchor.text and 'search.php' in anchor['href']: - search_link = self.server_url + anchor['href'] - next_page_available = True - break - if not next_page_available: - break - - return subtitles - - def list_subtitles(self, video, languages): - if isinstance(video, Episode): - titles = [video.series] + video.alternative_series - elif isinstance(video, Movie): - titles = [video.title] + video.alternative_titles - else: - titles = [] - - subtitles = [] - # query for subtitles with the show_id - for title in titles: - if isinstance(video, Episode): - subtitles += [s for s in self.query(title, season=video.season, episode=video.episode, - year=video.year) - if s.language in languages] - elif isinstance(video, Movie): - subtitles += [s for s in self.query(title, year=video.year) - if s.language in languages] - - return subtitles - - def download_subtitle(self, subtitle): - if isinstance(subtitle, GreekSubtitlesSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, - timeout=30) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. 
No data returned from provider') - return - - archive = _get_archive(r.content) - - subtitle_content = _get_subtitle_from_archive(archive) - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - else: - logger.debug('Could not extract subtitle from %r', archive) - - -def _get_archive(content): - # open the archive - archive_stream = io.BytesIO(content) - archive = None - if rarfile.is_rarfile(archive_stream): - logger.debug('Identified rar archive') - archive = rarfile.RarFile(archive_stream) - elif zipfile.is_zipfile(archive_stream): - logger.debug('Identified zip archive') - archive = zipfile.ZipFile(archive_stream) - - return archive - - -def _get_subtitle_from_archive(archive): - for name in archive.namelist(): - # discard hidden files - if os.path.split(name)[-1].startswith('.'): - continue - - # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): - continue - - return archive.read(name) - - return None diff --git a/libs/subliminal_patch/providers/subs4free.py b/libs/subliminal_patch/providers/subs4free.py deleted file mode 100644 index 181b99351..000000000 --- a/libs/subliminal_patch/providers/subs4free.py +++ /dev/null @@ -1,283 +0,0 @@ -# -*- coding: utf-8 -*- -# encoding=utf8 -import io -import logging -import os -import random - -import rarfile -import re -import zipfile - -from subzero.language import Language -from guessit import guessit -from requests import Session -from six import text_type - -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal import __short_version__ -from subliminal.cache import SHOW_EXPIRATION_TIME, region -from subliminal.score import get_equivalent_release_groups -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches -from subliminal.utils import sanitize, sanitize_release_group -from subliminal.video import Movie - -logger = logging.getLogger(__name__) - -year_re = re.compile(r'^\((\d{4})\)$') - - -class Subs4FreeSubtitle(Subtitle): - """Subs4Free Subtitle.""" - provider_name = 'subs4free' - - def __init__(self, language, page_link, title, year, version, download_link): - super(Subs4FreeSubtitle, self).__init__(language, page_link=page_link) - self.title = title - self.year = year - self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'utf8' - - @property - def id(self): - return self.download_link - - def get_matches(self, video): - matches = set() - - # movie - if isinstance(video, Movie): - # title - if video.title and (sanitize(self.title) in ( - sanitize(name) for name in [video.title] + video.alternative_titles)): - matches.add('title') - # year - if video.year and self.year == video.year: - matches.add('year') - - # release_group - if (video.release_group and self.version and - any(r in sanitize_release_group(self.version) - for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): - matches.add('release_group') - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) - - return matches - - -class Subs4FreeProvider(Provider): - """Subs4Free Provider.""" - languages = {Language(l) for l in ['ell', 'eng']} - video_types = (Movie,) - server_url = 'https://www.sf4-industry.com' - download_url = '/getSub.html' - search_url = '/search_report.php?search={}&searchType=1' - subtitle_class = Subs4FreeSubtitle - - def __init__(self): - self.session = None - - def initialize(self): - self.session = 
Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) - - def terminate(self): - self.session.close() - - def get_show_ids(self, title, year=None): - """Get the best matching show id for `series` and `year``. - - First search in the result of :meth:`_get_show_suggestions`. - - :param title: show title. - :param year: year of the show, if any. - :type year: int - :return: the show id, if found. - :rtype: str - - """ - title_sanitized = sanitize(title).lower() - show_ids = self._get_suggestions(title) - - matched_show_ids = [] - for show in show_ids: - show_id = None - show_title = sanitize(show['title']) - # attempt with year - if not show_id and year: - logger.debug('Getting show id with year') - show_id = show['link'].split('?p=')[-1] if show_title == '{title} {year:d}'.format( - title=title_sanitized, year=year) else None - - # attempt clean - if not show_id: - logger.debug('Getting show id') - show_id = show['link'].split('?p=')[-1] if show_title == title_sanitized else None - - if show_id: - matched_show_ids.append(show_id) - - return matched_show_ids - - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, - should_cache_fn=lambda value: value) - def _get_suggestions(self, title): - """Search the show or movie id from the `title` and `year`. - - :param str title: title of the show. - :return: the show suggestions found. - :rtype: dict - - """ - # make the search - logger.info('Searching show ids with %r', title) - r = self.session.get(self.server_url + text_type(self.search_url).format(title), - headers={'Referer': self.server_url}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return {} - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - suggestions = [{'link': l.attrs['value'], 'title': l.text} - for l in soup.select('select[name="Mov_sel"] > option[value]')] - logger.debug('Found suggestions: %r', suggestions) - - return suggestions - - def query(self, movie_id, title, year): - # get the season list of the show - logger.info('Getting the subtitle list of show id %s', movie_id) - if movie_id: - page_link = self.server_url + '/' + movie_id - else: - page_link = self.server_url + text_type(self.search_url).format(' '.join([title, str(year)])) - - r = self.session.get(page_link, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['html.parser']) - - year_num = None - year_element = soup.select_one('td#dates_header > table div') - matches = False - if year_element: - matches = year_re.match(str(year_element.contents[2]).strip()) - if matches: - year_num = int(matches.group(1)) - - title_element = soup.select_one('td#dates_header > table u') - show_title = str(title_element.contents[0]).strip() if title_element else None - - subtitles = [] - # loop over episode rows - for subtitle in soup.select('table.table_border div[align="center"] > div'): - # read common info - version = subtitle.find('b').text - download_link = self.server_url + subtitle.find('a')['href'] - language = Language.fromalpha2(subtitle.find('img')['src'].split('/')[-1].split('.')[0]) - - subtitle = self.subtitle_class(language, page_link, show_title, year_num, version, download_link) - - logger.debug('Found subtitle {!r}'.format(subtitle)) - subtitles.append(subtitle) - - return subtitles - - def list_subtitles(self, video, languages): - # lookup show_id - titles = 
[video.title] + video.alternative_titles if isinstance(video, Movie) else [] - - show_ids = None - for title in titles: - show_ids = self.get_show_ids(title, video.year) - if show_ids and len(show_ids) > 0: - break - - subtitles = [] - # query for subtitles with the show_id - if show_ids and len(show_ids) > 0: - for show_id in show_ids: - subtitles += [s for s in self.query(show_id, video.title, video.year) if s.language in languages] - else: - subtitles += [s for s in self.query(None, video.title, video.year) if s.language in languages] - - return subtitles - - def download_subtitle(self, subtitle): - if isinstance(subtitle, Subs4FreeSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - download_element = soup.select_one('input[name="id"]') - image_element = soup.select_one('input[type="image"]') - subtitle_id = download_element['value'] if download_element else None - width = int(str(image_element['width']).strip('px')) if image_element else 0 - height = int(str(image_element['height']).strip('px')) if image_element else 0 - - if not subtitle_id: - logger.debug('Unable to download subtitle. No download link found') - return - - download_url = self.server_url + self.download_url - r = self.session.post(download_url, data={'utf8': 1, 'id': subtitle_id, 'x': random.randint(0, width), - 'y': random.randint(0, height)}, - headers={'Referer': subtitle.download_link}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. 
No data returned from provider') - return - - archive = _get_archive(r.content) - - subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content - - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - else: - logger.debug('Could not extract subtitle from %r', archive) - - -def _get_archive(content): - # open the archive - archive_stream = io.BytesIO(content) - archive = None - if rarfile.is_rarfile(archive_stream): - logger.debug('Identified rar archive') - archive = rarfile.RarFile(archive_stream) - elif zipfile.is_zipfile(archive_stream): - logger.debug('Identified zip archive') - archive = zipfile.ZipFile(archive_stream) - - return archive - - -def _get_subtitle_from_archive(archive): - for name in archive.namelist(): - # discard hidden files - if os.path.split(name)[-1].startswith('.'): - continue - - # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): - continue - - return archive.read(name) - - return None diff --git a/libs/subliminal_patch/providers/subs4series.py b/libs/subliminal_patch/providers/subs4series.py deleted file mode 100644 index 5f381feeb..000000000 --- a/libs/subliminal_patch/providers/subs4series.py +++ /dev/null @@ -1,272 +0,0 @@ -# -*- coding: utf-8 -*- -import io -import logging -import os - -import rarfile -import re -import zipfile - -from subzero.language import Language -from guessit import guessit -from requests import Session -from six import text_type - -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal import __short_version__ -from subliminal.cache import SHOW_EXPIRATION_TIME, region -from subliminal.score import get_equivalent_release_groups -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches -from subliminal.utils import sanitize, sanitize_release_group -from subliminal.video import Episode - -logger = logging.getLogger(__name__) - -year_re = re.compile(r'^\((\d{4})\)$') - - -class Subs4SeriesSubtitle(Subtitle): - """Subs4Series Subtitle.""" - provider_name = 'subs4series' - - def __init__(self, language, page_link, series, year, version, download_link): - super(Subs4SeriesSubtitle, self).__init__(language, page_link=page_link) - self.series = series - self.year = year - self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'windows-1253' - - @property - def id(self): - return self.download_link - - def get_matches(self, video): - matches = set() - - # episode - if isinstance(video, Episode): - # series name - if video.series and sanitize(self.series) in ( - sanitize(name) for name in [video.series] + video.alternative_series): - matches.add('series') - # year - if video.original_series and self.year is None or video.year and video.year == self.year: - matches.add('year') - - # release_group - if (video.release_group and self.version and - any(r in sanitize_release_group(self.version) - for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): - matches.add('release_group') - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) - - return matches - - -class Subs4SeriesProvider(Provider): - """Subs4Series Provider.""" - languages = {Language(l) for l in ['ell', 'eng']} - video_types = (Episode,) - server_url = 'https://www.subs4series.com' - search_url = '/search_report.php?search={}&searchType=1' - episode_link = 
'/tv-series/{show_id}/season-{season:d}/episode-{episode:d}' - subtitle_class = Subs4SeriesSubtitle - - def __init__(self): - self.session = None - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) - - def terminate(self): - self.session.close() - - def get_show_ids(self, title, year=None): - """Get the best matching show id for `series` and `year`. - - First search in the result of :meth:`_get_show_suggestions`. - - :param title: show title. - :param year: year of the show, if any. - :type year: int - :return: the show id, if found. - :rtype: str - - """ - title_sanitized = sanitize(title).lower() - show_ids = self._get_suggestions(title) - - matched_show_ids = [] - for show in show_ids: - show_id = None - show_title = sanitize(show['title']) - # attempt with year - if not show_id and year: - logger.debug('Getting show id with year') - show_id = '/'.join(show['link'].rsplit('/', 2)[1:]) if show_title == '{title} {year:d}'.format( - title=title_sanitized, year=year) else None - - # attempt clean - if not show_id: - logger.debug('Getting show id') - show_id = '/'.join(show['link'].rsplit('/', 2)[1:]) if show_title == title_sanitized else None - - if show_id: - matched_show_ids.append(show_id) - - return matched_show_ids - - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, - should_cache_fn=lambda value: value) - def _get_suggestions(self, title): - """Search the show or movie id from the `title` and `year`. - - :param str title: title of the show. - :return: the show suggestions found. - :rtype: dict - - """ - # make the search - logger.info('Searching show ids with %r', title) - r = self.session.get(self.server_url + text_type(self.search_url).format(title), - headers={'Referer': self.server_url}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return {} - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - series = [{'link': l.attrs['value'], 'title': l.text} - for l in soup.select('select[name="Mov_sel"] > option[value]')] - logger.debug('Found suggestions: %r', series) - - return series - - def query(self, show_id, series, season, episode, title): - # get the season list of the show - logger.info('Getting the subtitle list of show id %s', show_id) - if all((show_id, season, episode)): - page_link = self.server_url + self.episode_link.format(show_id=show_id, season=season, episode=episode) - else: - return [] - - r = self.session.get(page_link, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - - year_num = None - matches = year_re.match(str(soup.select_one('#dates_header_br > table div').contents[2]).strip()) - if matches: - year_num = int(matches.group(1)) - show_title = str(soup.select_one('#dates_header_br > table u').contents[0]).strip() - - subtitles = [] - # loop over episode rows - for subtitle in soup.select('table.table_border div[align="center"] > div'): - # read common info - version = subtitle.find('b').text - download_link = self.server_url + subtitle.find('a')['href'] - language = Language.fromalpha2(subtitle.find('img')['src'].split('/')[-1].split('.')[0]) - - subtitle = self.subtitle_class(language, page_link, show_title, year_num, version, download_link) - - logger.debug('Found subtitle %r', subtitle) - subtitles.append(subtitle) - - return 
subtitles - - def list_subtitles(self, video, languages): - # lookup show_id - titles = [video.series] + video.alternative_series if isinstance(video, Episode) else [] - - show_ids = None - for title in titles: - show_ids = self.get_show_ids(title, video.year) - if show_ids and len(show_ids) > 0: - break - - subtitles = [] - # query for subtitles with the show_id - for show_id in show_ids: - subtitles += [s for s in self.query(show_id, video.series, video.season, video.episode, video.title) - if s.language in languages] - - return subtitles - - def download_subtitle(self, subtitle): - if isinstance(subtitle, Subs4SeriesSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - download_element = soup.select_one('a.style55ws') - if not download_element: - download_element = soup.select_one('form[method="post"]') - target = download_element['action'] if download_element else None - else: - target = download_element['href'] - - if not target: - logger.debug('Unable to download subtitle. No download link found') - return - - download_url = self.server_url + target - r = self.session.get(download_url, headers={'Referer': subtitle.download_link}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - archive = _get_archive(r.content) - subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content - - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - else: - logger.debug('Could not extract subtitle from %r', archive) - - -def _get_archive(content): - # open the archive - archive_stream = io.BytesIO(content) - archive = None - if rarfile.is_rarfile(archive_stream): - logger.debug('Identified rar archive') - archive = rarfile.RarFile(archive_stream) - elif zipfile.is_zipfile(archive_stream): - logger.debug('Identified zip archive') - archive = zipfile.ZipFile(archive_stream) - - return archive - - -def _get_subtitle_from_archive(archive): - for name in archive.namelist(): - # discard hidden files - if os.path.split(name)[-1].startswith('.'): - continue - - # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): - continue - - return archive.read(name) - - return None diff --git a/libs/subliminal_patch/providers/subssabbz.py b/libs/subliminal_patch/providers/subssabbz.py deleted file mode 100644 index d3d138884..000000000 --- a/libs/subliminal_patch/providers/subssabbz.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -import logging -import re -import io -import os -from random import randint -from bs4 import BeautifulSoup -from zipfile import ZipFile, is_zipfile -from rarfile import RarFile, is_rarfile -from requests import Session -from guessit import guessit -from subliminal_patch.providers import Provider -from subliminal_patch.subtitle import Subtitle -from subliminal_patch.utils import sanitize -from subliminal.exceptions import ProviderError -from subliminal.utils import sanitize_release_group -from subliminal.subtitle import guess_matches -from subliminal.video import Episode, Movie -from subliminal.subtitle import fix_line_ending -from subzero.language import Language -from .utils import 
FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST - -logger = logging.getLogger(__name__) - -class SubsSabBzSubtitle(Subtitle): - """SubsSabBz Subtitle.""" - provider_name = 'subssabbz' - - def __init__(self, langauge, filename, type): - super(SubsSabBzSubtitle, self).__init__(langauge) - self.langauge = langauge - self.filename = filename - self.type = type - - @property - def id(self): - return self.filename - - def get_matches(self, video): - matches = set() - - video_filename = video.name - video_filename = os.path.basename(video_filename) - video_filename, _ = os.path.splitext(video_filename) - video_filename = sanitize_release_group(video_filename) - - subtitle_filename = self.filename - subtitle_filename = os.path.basename(subtitle_filename) - subtitle_filename, _ = os.path.splitext(subtitle_filename) - subtitle_filename = sanitize_release_group(subtitle_filename) - - if video_filename == subtitle_filename: - matches.add('hash') - - matches |= guess_matches(video, guessit(self.filename, {'type': self.type})) - - matches.add(id(self)) - return matches - - -class SubsSabBzProvider(Provider): - """SubsSabBz Provider.""" - languages = {Language('por', 'BR')} | {Language(l) for l in [ - 'bul', 'eng' - ]} - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] - self.session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - self.session.headers["Accept-Language"] = "en-US,en;q=0.5" - self.session.headers["Accept-Encoding"] = "gzip, deflate, br" - self.session.headers["DNT"] = "1" - self.session.headers["Connection"] = "keep-alive" - self.session.headers["Upgrade-Insecure-Requests"] = "1" - self.session.headers["Cache-Control"] = "max-age=0" - - def terminate(self): - self.session.close() - - def query(self, language, video): - subtitles = [] - isEpisode = isinstance(video, Episode) - - params = { - 'act': 'search', - 'movie': '', - 'select-language': '2', - 'upldr': '', - 'yr': '', - 'release': '' - } - - if isEpisode: - params['movie'] = "%s %02d %02d" % (sanitize(video.series), video.season, video.episode) - else: - params['yr'] = video.year - params['movie'] = (video.title) - - if language == 'en' or language == 'eng': - params['select-language'] = 1 - - logger.info('Searching subtitle %r', params) - response = self.session.post('http://subs.sab.bz/index.php?', params=params, allow_redirects=False, timeout=10, headers={ - 'Referer': 'http://subs.sab.bz/', - }) - - response.raise_for_status() - - if response.status_code != 200: - logger.debug('No subtitles found') - return subtitles - - soup = BeautifulSoup(response.content, 'html.parser') - rows = soup.findAll('tr', {'class': 'subs-row'}) - - # Search on first 10 rows only - for row in rows[:10]: - a_element_wrapper = row.find('td', { 'class': 'c2field' }) - if a_element_wrapper: - element = a_element_wrapper.find('a') - if element: - link = element.get('href') - logger.info('Found subtitle link %r', link) - subtitles = subtitles + self.download_archive_and_add_subtitle_files(link, language, video) - - return subtitles - - def list_subtitles(self, video, languages): - return [s for l in languages for s in self.query(l, video)] - - def download_subtitle(self, subtitle): - pass - - def process_archive_subtitle_files(self, archiveStream, language, video): - subtitles = [] - type = 'episode' if isinstance(video, Episode) else 'movie' - for file_name in archiveStream.namelist(): - if file_name.lower().endswith(('.srt', 
'.sub')): - logger.info('Found subtitle file %r', file_name) - subtitle = SubsSabBzSubtitle(language, file_name, type) - subtitle.content = archiveStream.read(file_name) - subtitles.append(subtitle) - return subtitles - - def download_archive_and_add_subtitle_files(self, link, language, video ): - logger.info('Downloading subtitle %r', link) - request = self.session.get(link, headers={ - 'Referer': 'http://subs.sab.bz/index.php?' - }) - request.raise_for_status() - - archive_stream = io.BytesIO(request.content) - if is_rarfile(archive_stream): - return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) - elif is_zipfile(archive_stream): - return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) - else: - raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/subsunacs.py b/libs/subliminal_patch/providers/subsunacs.py deleted file mode 100644 index bbc41f520..000000000 --- a/libs/subliminal_patch/providers/subsunacs.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -import logging -import re -import io -import os -from random import randint -from bs4 import BeautifulSoup -from zipfile import ZipFile, is_zipfile -from rarfile import RarFile, is_rarfile -from requests import Session -from guessit import guessit -from subliminal_patch.providers import Provider -from subliminal_patch.subtitle import Subtitle -from subliminal_patch.utils import sanitize -from subliminal.exceptions import ProviderError -from subliminal.utils import sanitize_release_group -from subliminal.subtitle import guess_matches -from subliminal.video import Episode, Movie -from subliminal.subtitle import fix_line_ending -from subzero.language import Language -from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST - -logger = logging.getLogger(__name__) - -class SubsUnacsSubtitle(Subtitle): - """SubsUnacs Subtitle.""" - provider_name = 'subsunacs' - - def __init__(self, langauge, filename, type): - super(SubsUnacsSubtitle, self).__init__(langauge) - self.langauge = langauge - self.filename = filename - self.type = type - - @property - def id(self): - return self.filename - - def get_matches(self, video): - matches = set() - - video_filename = video.name - video_filename = os.path.basename(video_filename) - video_filename, _ = os.path.splitext(video_filename) - video_filename = sanitize_release_group(video_filename) - - subtitle_filename = self.filename - subtitle_filename = os.path.basename(subtitle_filename) - subtitle_filename, _ = os.path.splitext(subtitle_filename) - subtitle_filename = sanitize_release_group(subtitle_filename) - - if video_filename == subtitle_filename: - matches.add('hash') - - matches |= guess_matches(video, guessit(self.filename, {'type': self.type})) - - matches.add(id(self)) - return matches - - -class SubsUnacsProvider(Provider): - """SubsUnacs Provider.""" - languages = {Language('por', 'BR')} | {Language(l) for l in [ - 'bul', 'eng' - ]} - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] - self.session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - self.session.headers["Accept-Language"] = "en-US,en;q=0.5" - self.session.headers["Accept-Encoding"] = "gzip, deflate, br" - self.session.headers["DNT"] = "1" - self.session.headers["Connection"] = "keep-alive" - self.session.headers["Upgrade-Insecure-Requests"] = "1" - self.session.headers["Cache-Control"] = 
"max-age=0" - - def terminate(self): - self.session.close() - - def query(self, language, video): - subtitles = [] - isEpisode = isinstance(video, Episode) - - params = { - 'm': '', - 'l': 0, - 'c': '', - 'y': '', - 'action': " Търси ", - 'a': '', - 'd': '', - 'u': '', - 'g': '', - 't': '', - 'imdbcheck': 1} - - if isEpisode: - params['m'] = "%s %02d %02d" % (sanitize(video.series), video.season, video.episode) - else: - params['y'] = video.year - params['m'] = (video.title) - - if language == 'en' or language == 'eng': - params['l'] = 1 - - logger.info('Searching subtitle %r', params) - response = self.session.post('https://subsunacs.net/search.php', params=params, allow_redirects=False, timeout=10, headers={ - 'Referer': 'https://subsunacs.net/index.php', - }) - - response.raise_for_status() - - if response.status_code != 200: - logger.debug('No subtitles found') - return subtitles - - soup = BeautifulSoup(response.content, 'html.parser') - rows = soup.findAll('td', {'class': 'tdMovie'}) - - # Search on first 10 rows only - for row in rows[:10]: - element = row.find('a', {'class': 'tooltip'}) - if element: - link = element.get('href') - logger.info('Found subtitle link %r', link) - subtitles = subtitles + self.download_archive_and_add_subtitle_files('https://subsunacs.net' + link, language, video) - - return subtitles - - def list_subtitles(self, video, languages): - return [s for l in languages for s in self.query(l, video)] - - def download_subtitle(self, subtitle): - pass - - def process_archive_subtitle_files(self, archiveStream, language, video): - subtitles = [] - type = 'episode' if isinstance(video, Episode) else 'movie' - for file_name in archiveStream.namelist(): - if file_name.lower().endswith(('.srt', '.sub')): - logger.info('Found subtitle file %r', file_name) - subtitle = SubsUnacsSubtitle(language, file_name, type) - subtitle.content = archiveStream.read(file_name) - subtitles.append(subtitle) - return subtitles - - def download_archive_and_add_subtitle_files(self, link, language, video ): - logger.info('Downloading subtitle %r', link) - request = self.session.get(link, headers={ - 'Referer': 'https://subsunacs.net/search.php' - }) - request.raise_for_status() - - archive_stream = io.BytesIO(request.content) - if is_rarfile(archive_stream): - return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) - elif is_zipfile(archive_stream): - return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) - else: - raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/subz.py b/libs/subliminal_patch/providers/subz.py deleted file mode 100644 index dc95cb8d7..000000000 --- a/libs/subliminal_patch/providers/subz.py +++ /dev/null @@ -1,318 +0,0 @@ -# -*- coding: utf-8 -*- -import io -import json -import logging -import os - -import rarfile -import re -import zipfile - -from subzero.language import Language -from guessit import guessit -from requests import Session -from six import text_type - -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal import __short_version__ -from subliminal.cache import SHOW_EXPIRATION_TIME, region -from subliminal.score import get_equivalent_release_groups -from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches -from subliminal.utils import sanitize, sanitize_release_group -from subliminal.video import Episode, Movie - -logger = logging.getLogger(__name__) - -episode_re = re.compile(r'^S(\d{2})E(\d{2})$') 
- - -class SubzSubtitle(Subtitle): - """Subz Subtitle.""" - provider_name = 'subz' - - def __init__(self, language, page_link, series, season, episode, title, year, version, download_link): - super(SubzSubtitle, self).__init__(language, page_link=page_link) - self.series = series - self.season = season - self.episode = episode - self.title = title - self.year = year - self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'windows-1253' - - @property - def id(self): - return self.download_link - - def get_matches(self, video): - matches = set() - video_type = None - - # episode - if isinstance(video, Episode): - video_type = 'episode' - # series name - if video.series and sanitize(self.series) in ( - sanitize(name) for name in [video.series] + video.alternative_series): - matches.add('series') - # season - if video.season and self.season == video.season: - matches.add('season') - # episode - if video.episode and self.episode == video.episode: - matches.add('episode') - # title of the episode - if video.title and sanitize(self.title) == sanitize(video.title): - matches.add('title') - # year - if video.original_series and self.year is None or video.year and video.year == self.year: - matches.add('year') - # movie - elif isinstance(video, Movie): - video_type = 'movie' - # title - if video.title and (sanitize(self.title) in ( - sanitize(name) for name in [video.title] + video.alternative_titles)): - matches.add('title') - # year - if video.year and self.year == video.year: - matches.add('year') - - # release_group - if (video.release_group and self.version and - any(r in sanitize_release_group(self.version) - for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): - matches.add('release_group') - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': video_type}), partial=True) - - return matches - - -class SubzProvider(Provider): - """Subz Provider.""" - languages = {Language(l) for l in ['ell']} - server_url = 'https://subz.xyz' - sign_in_url = '/sessions' - sign_out_url = '/logout' - search_url = '/typeahead/{}' - episode_link = '/series/{show_id}/seasons/{season:d}/episodes/{episode:d}' - movie_link = '/movies/{}' - subtitle_class = SubzSubtitle - - def __init__(self): - self.logged_in = False - self.session = None - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) - - def terminate(self): - self.session.close() - - def get_show_ids(self, title, year=None, is_episode=True, country_code=None): - """Get the best matching show id for `series`, `year` and `country_code`. - - First search in the result of :meth:`_get_show_suggestions`. - - :param title: show title. - :param year: year of the show, if any. - :type year: int - :param is_episode: if the search is for episode. - :type is_episode: bool - :param country_code: country code of the show, if any. - :type country_code: str - :return: the show id, if found. 
- :rtype: str - - """ - title_sanitized = sanitize(title).lower() - show_ids = self._get_suggestions(title, is_episode) - - matched_show_ids = [] - for show in show_ids: - show_id = None - # attempt with country - if not show_id and country_code: - logger.debug('Getting show id with country') - if sanitize(show['title']) == text_type('{title} {country}').format(title=title_sanitized, - country=country_code.lower()): - show_id = show['link'].split('/')[-1] - - # attempt with year - if not show_id and year: - logger.debug('Getting show id with year') - if sanitize(show['title']) == text_type('{title} {year}').format(title=title_sanitized, year=year): - show_id = show['link'].split('/')[-1] - - # attempt clean - if not show_id: - logger.debug('Getting show id') - show_id = show['link'].split('/')[-1] if sanitize(show['title']) == title_sanitized else None - - if show_id: - matched_show_ids.append(show_id) - - return matched_show_ids - - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, - should_cache_fn=lambda value: value) - def _get_suggestions(self, title, is_episode=True): - """Search the show or movie id from the `title` and `year`. - - :param str title: title of the show. - :param is_episode: if the search is for episode. - :type is_episode: bool - :return: the show suggestions found. - :rtype: dict - - """ - # make the search - logger.info('Searching show ids with %r', title) - r = self.session.get(self.server_url + text_type(self.search_url).format(title), timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return {} - - show_type = 'series' if is_episode else 'movie' - parsed_suggestions = [s for s in json.loads(r.text) if 'type' in s and s['type'] == show_type] - logger.debug('Found suggestions: %r', parsed_suggestions) - - return parsed_suggestions - - def query(self, show_id, series, season, episode, title): - # get the season list of the show - logger.info('Getting the subtitle list of show id %s', show_id) - is_episode = False - if all((show_id, season, episode)): - is_episode = True - page_link = self.server_url + self.episode_link.format(show_id=show_id, season=season, episode=episode) - elif all((show_id, title)): - page_link = self.server_url + self.movie_link.format(show_id) - else: - return [] - - r = self.session.get(page_link, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - - year_num = None - if not is_episode: - year_num = int(soup.select_one('span.year').text) - show_title = str(soup.select_one('#summary-wrapper > div.summary h1').contents[0]).strip() - - subtitles = [] - # loop over episode rows - for subtitle in soup.select('div[id="subtitles"] tr[data-id]'): - # read common info - version = subtitle.find('td', {'class': 'name'}).text - download_link = subtitle.find('a', {'class': 'btn-success'})['href'].strip('\'') - - # read the episode info - if is_episode: - episode_numbers = soup.select_one('#summary-wrapper > div.container.summary span.main-title-sxe').text - season_num = None - episode_num = None - matches = episode_re.match(episode_numbers.strip()) - if matches: - season_num = int(matches.group(1)) - episode_num = int(matches.group(2)) - - episode_title = soup.select_one('#summary-wrapper > div.container.summary span.main-title').text - - subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, show_title, season_num, - 
episode_num, episode_title, year_num, version, download_link) - # read the movie info - else: - subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, None, None, None, show_title, - year_num, version, download_link) - - logger.debug('Found subtitle %r', subtitle) - subtitles.append(subtitle) - - return subtitles - - def list_subtitles(self, video, languages): - # lookup show_id - if isinstance(video, Episode): - titles = [video.series] + video.alternative_series - elif isinstance(video, Movie): - titles = [video.title] + video.alternative_titles - else: - titles = [] - - show_ids = None - for title in titles: - show_ids = self.get_show_ids(title, video.year, isinstance(video, Episode)) - if show_ids is not None and len(show_ids) > 0: - break - - subtitles = [] - # query for subtitles with the show_id - for show_id in show_ids: - if isinstance(video, Episode): - subtitles += [s for s in self.query(show_id, video.series, video.season, video.episode, video.title) - if s.language in languages and s.season == video.season and s.episode == video.episode] - elif isinstance(video, Movie): - subtitles += [s for s in self.query(show_id, None, None, None, video.title) - if s.language in languages and s.year == video.year] - - return subtitles - - def download_subtitle(self, subtitle): - if isinstance(subtitle, SubzSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - archive = _get_archive(r.content) - - subtitle_content = _get_subtitle_from_archive(archive) - if subtitle_content: - subtitle.content = fix_line_ending(subtitle_content) - else: - logger.debug('Could not extract subtitle from %r', archive) - - -def _get_archive(content): - # open the archive - archive_stream = io.BytesIO(content) - archive = None - if rarfile.is_rarfile(archive_stream): - logger.debug('Identified rar archive') - archive = rarfile.RarFile(archive_stream) - elif zipfile.is_zipfile(archive_stream): - logger.debug('Identified zip archive') - archive = zipfile.ZipFile(archive_stream) - - return archive - - -def _get_subtitle_from_archive(archive): - for name in archive.namelist(): - # discard hidden files - if os.path.split(name)[-1].startswith('.'): - continue - - # discard non-subtitle files - if not name.lower().endswith(SUBTITLE_EXTENSIONS): - continue - - return archive.read(name) - - return None diff --git a/libs/subliminal_patch/providers/xsubs.py b/libs/subliminal_patch/providers/xsubs.py deleted file mode 100644 index 102571dd9..000000000 --- a/libs/subliminal_patch/providers/xsubs.py +++ /dev/null @@ -1,302 +0,0 @@ -# -*- coding: utf-8 -*- -import logging -import re - -from subzero.language import Language -from guessit import guessit -from requests import Session - -from subliminal.providers import ParserBeautifulSoup, Provider -from subliminal import __short_version__ -from subliminal.cache import SHOW_EXPIRATION_TIME, region -from subliminal.exceptions import AuthenticationError, ConfigurationError -from subliminal.score import get_equivalent_release_groups -from subliminal.subtitle import Subtitle, fix_line_ending, guess_matches -from subliminal.utils import sanitize, sanitize_release_group -from subliminal.video import Episode - -logger = logging.getLogger(__name__) -article_re = re.compile(r'^([A-Za-z]{1,3}) (.*)$') - - -class 
XSubsSubtitle(Subtitle): - """XSubs Subtitle.""" - provider_name = 'xsubs' - - def __init__(self, language, page_link, series, season, episode, year, title, version, download_link): - super(XSubsSubtitle, self).__init__(language, page_link=page_link) - self.series = series - self.season = season - self.episode = episode - self.year = year - self.title = title - self.version = version - self.download_link = download_link - self.hearing_impaired = None - self.encoding = 'windows-1253' - - @property - def id(self): - return self.download_link - - def get_matches(self, video): - matches = set() - - if isinstance(video, Episode): - # series name - if video.series and sanitize(self.series) in ( - sanitize(name) for name in [video.series] + video.alternative_series): - matches.add('series') - # season - if video.season and self.season == video.season: - matches.add('season') - # episode - if video.episode and self.episode == video.episode: - matches.add('episode') - # title of the episode - if video.title and sanitize(self.title) == sanitize(video.title): - matches.add('title') - # year - if video.original_series and self.year is None or video.year and video.year == self.year: - matches.add('year') - # release_group - if (video.release_group and self.version and - any(r in sanitize_release_group(self.version) - for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): - matches.add('release_group') - # other properties - matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) - - return matches - - -class XSubsProvider(Provider): - """XSubs Provider.""" - languages = {Language(l) for l in ['ell']} - video_types = (Episode,) - server_url = 'http://xsubs.tv' - sign_in_url = '/xforum/account/signin/' - sign_out_url = '/xforum/account/signout/' - all_series_url = '/series/all.xml' - series_url = '/series/{:d}/main.xml' - season_url = '/series/{show_id:d}/{season:d}.xml' - page_link = '/ice/xsw.xml?srsid={show_id:d}#{season_id:d};{season:d}' - download_link = '/xthru/getsub/{:d}' - subtitle_class = XSubsSubtitle - - def __init__(self, username=None, password=None): - if any((username, password)) and not all((username, password)): - raise ConfigurationError('Username and password must be specified') - - self.username = username - self.password = password - self.logged_in = False - self.session = None - - def initialize(self): - self.session = Session() - self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) - - # login - if self.username and self.password: - logger.info('Logging in') - self.session.get(self.server_url + self.sign_in_url) - data = {'username': self.username, - 'password': self.password, - 'csrfmiddlewaretoken': self.session.cookies['csrftoken']} - r = self.session.post(self.server_url + self.sign_in_url, data, allow_redirects=False, timeout=10) - - if r.status_code != 302: - raise AuthenticationError(self.username) - - logger.debug('Logged in') - self.logged_in = True - - def terminate(self): - # logout - if self.logged_in: - logger.info('Logging out') - r = self.session.get(self.server_url + self.sign_out_url, timeout=10) - r.raise_for_status() - logger.debug('Logged out') - self.logged_in = False - - self.session.close() - - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value) - def _get_show_ids(self): - # get the shows page - logger.info('Getting show ids') - r = self.session.get(self.server_url + self.all_series_url, timeout=10) - 
r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - - # populate the show ids - show_ids = {} - for show_category in soup.findAll('seriesl'): - if show_category.attrs['category'] == u'Σειρές': - for show in show_category.findAll('series'): - show_ids[sanitize(show.text)] = int(show['srsid']) - break - logger.debug('Found %d show ids', len(show_ids)) - - return show_ids - - def get_show_id(self, series_names, year=None, country_code=None): - series_sanitized_names = [] - for name in series_names: - sanitized_name = sanitize(name) - series_sanitized_names.append(sanitized_name) - alternative_name = _get_alternative_name(sanitized_name) - if alternative_name: - series_sanitized_names.append(alternative_name) - - show_ids = self._get_show_ids() - show_id = None - - for series_sanitized in series_sanitized_names: - # attempt with country - if not show_id and country_code: - logger.debug('Getting show id with country') - show_id = show_ids.get('{series} {country}'.format(series=series_sanitized, - country=country_code.lower())) - - # attempt with year - if not show_id and year: - logger.debug('Getting show id with year') - show_id = show_ids.get('{series} {year:d}'.format(series=series_sanitized, year=year)) - - # attempt with article at the end - if not show_id and year: - logger.debug('Getting show id with year in brackets') - show_id = show_ids.get('{series} [{year:d}]'.format(series=series_sanitized, year=year)) - - # attempt clean - if not show_id: - logger.debug('Getting show id') - show_id = show_ids.get(series_sanitized) - - if show_id: - break - - return int(show_id) if show_id else None - - def query(self, show_id, series, season, year=None, country=None): - # get the season list of the show - logger.info('Getting the season list of show id %d', show_id) - r = self.session.get(self.server_url + self.series_url.format(show_id), timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - - series_title = soup.find('name').text - - # loop over season rows - seasons = soup.findAll('series_group') - season_id = None - - for season_row in seasons: - try: - parsed_season = int(season_row['ssnnum']) - if parsed_season == season: - season_id = int(season_row['ssnid']) - break - except (ValueError, TypeError): - continue - - if season_id is None: - logger.debug('Season not found in provider') - return [] - - # get the subtitle list of the season - logger.info('Getting the subtitle list of season %d', season) - r = self.session.get(self.server_url + self.season_url.format(show_id=show_id, season=season_id), timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('No data returned from provider') - return [] - - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - - subtitles = [] - # loop over episode rows - for episode in soup.findAll('subg'): - # read the episode info - etitle = episode.find('etitle') - if etitle is None: - continue - - episode_num = int(etitle['number'].split('-')[0]) - - sgt = episode.find('sgt') - if sgt is None: - continue - - season_num = int(sgt['ssnnum']) - - # filter out unreleased subtitles - for subtitle in episode.findAll('sr'): - if subtitle['published_on'] == '': - continue - - page_link = self.server_url + self.page_link.format(show_id=show_id, season_id=season_id, - season=season_num) - 
episode_title = etitle['title'] - version = subtitle.fmt.text + ' ' + subtitle.team.text - download_link = self.server_url + self.download_link.format(int(subtitle['rlsid'])) - - subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, series_title, season_num, - episode_num, year, episode_title, version, download_link) - logger.debug('Found subtitle %r', subtitle) - subtitles.append(subtitle) - - return subtitles - - def list_subtitles(self, video, languages): - if isinstance(video, Episode): - # lookup show_id - titles = [video.series] + video.alternative_series - show_id = self.get_show_id(titles, video.year) - - # query for subtitles with the show_id - if show_id: - subtitles = [s for s in self.query(show_id, video.series, video.season, video.year) - if s.language in languages and s.season == video.season and s.episode == video.episode] - if subtitles: - return subtitles - else: - logger.error('No show id found for %r (%r)', video.series, {'year': video.year}) - - return [] - - def download_subtitle(self, subtitle): - if isinstance(subtitle, XSubsSubtitle): - # download the subtitle - logger.info('Downloading subtitle %r', subtitle) - r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, - timeout=10) - r.raise_for_status() - - if not r.content: - logger.debug('Unable to download subtitle. No data returned from provider') - return - - subtitle.content = fix_line_ending(r.content) - - -def _get_alternative_name(series): - article_match = article_re.match(series) - if article_match: - return '{series} {article}'.format(series=article_match.group(2), article=article_match.group(1)) - - return None diff --git a/libs/subliminal_patch/refiners/omdb.py b/libs/subliminal_patch/refiners/omdb.py index 9ecb5155b..bef212f75 100644 --- a/libs/subliminal_patch/refiners/omdb.py +++ b/libs/subliminal_patch/refiners/omdb.py @@ -4,7 +4,7 @@ import subliminal import base64 import zlib from subliminal import __short_version__ -from subliminal.refiners.omdb import OMDBClient, refine +from subliminal.refiners.omdb import OMDBClient, refine as refine_orig, Episode, Movie class SZOMDBClient(OMDBClient): @@ -63,5 +63,13 @@ class SZOMDBClient(OMDBClient): return j +def refine(video, **kwargs): + refine_orig(video, **kwargs) + if isinstance(video, Episode) and video.series_imdb_id: + video.series_imdb_id = video.series_imdb_id.strip() + elif isinstance(video, Movie) and video.imdb_id: + video.imdb_id = video.imdb_id.strip() + + omdb_client = SZOMDBClient(headers={'User-Agent': 'Subliminal/%s' % __short_version__}) subliminal.refiners.omdb.omdb_client = omdb_client From afb2a868106090434af4eb5744cf50ab1b8ef70d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sat, 6 Apr 2019 08:26:42 -0400 Subject: [PATCH 09/19] Continuing development. 
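This patch vendors two third-party captcha-solving clients (deathbycaptcha and python_anticaptcha) plus a new subliminal_patch/pitcher.py that drives them, so providers sitting behind captchas can be unblocked automatically. For orientation, the shortest round trip through the bundled DBC client looks roughly like this (a sketch only, assuming libs/ is importable as elsewhere in Bazarr; the credentials and image path are placeholders; error handling omitted):

    # Sketch: solve a single image captcha via the bundled DBC HTTP client.
    from deathbycaptcha import HttpClient, DEFAULT_TIMEOUT

    client = HttpClient('dbc-username', 'dbc-password')      # placeholder credentials
    result = client.decode('captcha.png', DEFAULT_TIMEOUT)   # uploads, then polls until solved or timed out
    if result:
        print('CAPTCHA %d solved: %s' % (result['captcha'], result['text']))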
---
 libs/deathbycaptcha.py                | 516 ++++++++++++++++++++++++++
 libs/python_anticaptcha/__init__.py   |   7 +
 libs/python_anticaptcha/base.py       | 114 ++++++
 libs/python_anticaptcha/exceptions.py |  23 ++
 libs/python_anticaptcha/fields.py     | 199 ++++++++++
 libs/python_anticaptcha/proxy.py      |  28 ++
 libs/python_anticaptcha/tasks.py      | 128 +++++++
 libs/subliminal_patch/pitcher.py      | 212 +++++++++++
 8 files changed, 1227 insertions(+)
 create mode 100644 libs/deathbycaptcha.py
 create mode 100644 libs/python_anticaptcha/__init__.py
 create mode 100644 libs/python_anticaptcha/base.py
 create mode 100644 libs/python_anticaptcha/exceptions.py
 create mode 100644 libs/python_anticaptcha/fields.py
 create mode 100644 libs/python_anticaptcha/proxy.py
 create mode 100644 libs/python_anticaptcha/tasks.py
 create mode 100644 libs/subliminal_patch/pitcher.py

diff --git a/libs/deathbycaptcha.py b/libs/deathbycaptcha.py
new file mode 100644
index 000000000..3c2fafb77
--- /dev/null
+++ b/libs/deathbycaptcha.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+"""Death by Captcha HTTP and socket API clients.
+
+There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
+socket ones. Both offer the same functionality, with the socket API
+sporting faster responses and using far fewer connections.
+
+To access the socket API, use SocketClient class; for the HTTP API, use
+HttpClient class. Both are thread-safe. SocketClient keeps a persistent
+connection opened and serializes all API requests sent through it, thus
+it is advised to keep a pool of them if your script is heavily
+multithreaded.
+
+Both SocketClient and HttpClient give you the following methods:
+
+get_user()
+    Returns your DBC account details as a dict with the following keys:
+
+    "user": your account numeric ID; if login fails, it will be the only
+        item with the value of 0;
+    "rate": your CAPTCHA rate, i.e. how much you will be charged for one
+        solved CAPTCHA in US cents;
+    "balance": your DBC account balance in US cents;
+    "is_banned": flag indicating whether your account is suspended or not.
+
+get_balance()
+    Returns your DBC account balance in US cents.
+
+get_captcha(cid)
+    Returns an uploaded CAPTCHA's details as a dict with the following keys:
+
+    "captcha": the CAPTCHA numeric ID; if no such CAPTCHA is found, it will
+        be the only item with the value of 0;
+    "text": the CAPTCHA text, if solved, otherwise None;
+    "is_correct": flag indicating whether the CAPTCHA was solved correctly
+        (DBC can detect that in rare cases).
+
+    The only argument `cid` is the CAPTCHA numeric ID.
+
+get_text(cid)
+    Returns an uploaded CAPTCHA text (None if not solved). The only argument
+    `cid` is the CAPTCHA numeric ID.
+
+report(cid)
+    Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
+    CAPTCHA numeric ID. Returns True on success, False otherwise.
+
+upload(captcha)
+    Uploads a CAPTCHA. The only argument `captcha` can be either a file-like
+    object (any object with a `read` method defined, actually, so StringIO
+    will do) or a CAPTCHA image file name. On successful upload you'll get
+    the CAPTCHA details dict (see get_captcha() method).
+
+    NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
+    to poll for its status periodically using get_captcha() or get_text()
+    method until the CAPTCHA is solved and you get the text.
+
+decode(captcha, timeout=DEFAULT_TIMEOUT)
+    A convenient method that uploads a CAPTCHA and polls for its status
+    periodically, but no longer than `timeout` (defaults to 60 seconds).
+ If solved, you'll get the CAPTCHA details dict (see get_captcha() + method for details). See upload() method for details on `captcha` + argument. + +Visit http://www.deathbycaptcha.com/user/api for updates. + +""" + +import base64 +import binascii +import errno +import imghdr +import random +import os +import select +import socket +import sys +import threading +import time +import urllib +import urllib2 +try: + from json import read as json_decode, write as json_encode +except ImportError: + try: + from json import loads as json_decode, dumps as json_encode + except ImportError: + from simplejson import loads as json_decode, dumps as json_encode + + +# API version and unique software ID +API_VERSION = 'DBC/Python v4.6' + +# Default CAPTCHA timeout and decode() polling interval +DEFAULT_TIMEOUT = 60 +DEFAULT_TOKEN_TIMEOUT = 120 +POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2] +DFLT_POLL_INTERVAL = 3 + +# Base HTTP API url +HTTP_BASE_URL = 'http://api.dbcapi.me/api' + +# Preferred HTTP API server's response content type, do not change +HTTP_RESPONSE_TYPE = 'application/json' + +# Socket API server's host & ports range +SOCKET_HOST = 'api.dbcapi.me' +SOCKET_PORTS = range(8123, 8131) + + +def _load_image(captcha): + if hasattr(captcha, 'read'): + img = captcha.read() + elif type(captcha) == bytearray: + img = captcha + else: + img = '' + try: + captcha_file = open(captcha, 'rb') + except Exception: + raise + else: + img = captcha_file.read() + captcha_file.close() + if not len(img): + raise ValueError('CAPTCHA image is empty') + elif imghdr.what(None, img) is None: + raise TypeError('Unknown CAPTCHA image type') + else: + return img + + +class AccessDeniedException(Exception): + pass + + +class Client(object): + + """Death by Captcha API Client.""" + + def __init__(self, username, password): + self.is_verbose = False + self.userpwd = {'username': username, 'password': password} + + def _log(self, cmd, msg=''): + if self.is_verbose: + print '%d %s %s' % (time.time(), cmd, msg.rstrip()) + return self + + def close(self): + pass + + def connect(self): + pass + + def get_user(self): + """Fetch user details -- ID, balance, rate and banned status.""" + raise NotImplementedError() + + def get_balance(self): + """Fetch user balance (in US cents).""" + return self.get_user().get('balance') + + def get_captcha(self, cid): + """Fetch a CAPTCHA details -- ID, text and correctness flag.""" + raise NotImplementedError() + + def get_text(self, cid): + """Fetch a CAPTCHA text.""" + return self.get_captcha(cid).get('text') or None + + def report(self, cid): + """Report a CAPTCHA as incorrectly solved.""" + raise NotImplementedError() + + def upload(self, captcha): + """Upload a CAPTCHA. + + Accepts file names and file-like objects. Returns CAPTCHA details + dict on success. + + """ + raise NotImplementedError() + + def decode(self, captcha=None, timeout=None, **kwargs): + """ + Try to solve a CAPTCHA. + + See Client.upload() for arguments details. + + Uploads a CAPTCHA, polls for its status periodically with arbitrary + timeout (in seconds), returns CAPTCHA details if (correctly) solved. 
+ """ + if not timeout: + if not captcha: + timeout = DEFAULT_TOKEN_TIMEOUT + else: + timeout = DEFAULT_TIMEOUT + + deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT) + uploaded_captcha = self.upload(captcha, **kwargs) + if uploaded_captcha: + intvl_idx = 0 # POLL_INTERVAL index + while deadline > time.time() and not uploaded_captcha.get('text'): + intvl, intvl_idx = self._get_poll_interval(intvl_idx) + time.sleep(intvl) + pulled = self.get_captcha(uploaded_captcha['captcha']) + if pulled['captcha'] == uploaded_captcha['captcha']: + uploaded_captcha = pulled + if uploaded_captcha.get('text') and \ + uploaded_captcha.get('is_correct'): + return uploaded_captcha + + def _get_poll_interval(self, idx): + """Returns poll interval and next index depending on index provided""" + + if len(POLLS_INTERVAL) > idx: + intvl = POLLS_INTERVAL[idx] + else: + intvl = DFLT_POLL_INTERVAL + idx += 1 + + return intvl, idx + + +class HttpClient(Client): + + """Death by Captcha HTTP API client.""" + + def __init__(self, *args): + Client.__init__(self, *args) + self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler()) + + def _call(self, cmd, payload=None, headers=None): + if headers is None: + headers = {} + headers['Accept'] = HTTP_RESPONSE_TYPE + headers['User-Agent'] = API_VERSION + if hasattr(payload, 'items'): + payload = urllib.urlencode(payload) + self._log('SEND', '%s %d %s' % (cmd, len(payload), payload)) + else: + self._log('SEND', '%s' % cmd) + if payload is not None: + headers['Content-Length'] = len(payload) + try: + response = self.opener.open(urllib2.Request( + HTTP_BASE_URL + '/' + cmd.strip('/'), + data=payload, + headers=headers + )).read() + except urllib2.HTTPError, err: + if 403 == err.code: + raise AccessDeniedException('Access denied, please check' + ' your credentials and/or balance') + elif 400 == err.code or 413 == err.code: + raise ValueError("CAPTCHA was rejected by the service, check" + " if it's a valid image") + elif 503 == err.code: + raise OverflowError("CAPTCHA was rejected due to service" + " overload, try again later") + else: + raise err + else: + self._log('RECV', '%d %s' % (len(response), response)) + try: + return json_decode(response) + except Exception: + raise RuntimeError('Invalid API response') + return {} + + def get_user(self): + return self._call('user', self.userpwd.copy()) or {'user': 0} + + def get_captcha(self, cid): + return self._call('captcha/%d' % cid) or {'captcha': 0} + + def report(self, cid): + return not self._call('captcha/%d/report' % cid, + self.userpwd.copy()).get('is_correct') + + def upload(self, captcha=None, **kwargs): + boundary = binascii.hexlify(os.urandom(16)) + banner = kwargs.get('banner', '') + if banner: + kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner)) + body = '\r\n'.join(('\r\n'.join(( + '--%s' % boundary, + 'Content-Disposition: form-data; name="%s"' % k, + 'Content-Type: text/plain', + 'Content-Length: %d' % len(str(v)), + '', + str(v) + ))) for k, v in self.userpwd.items()) + + body += '\r\n'.join(('\r\n'.join(( + '--%s' % boundary, + 'Content-Disposition: form-data; name="%s"' % k, + 'Content-Type: text/plain', + 'Content-Length: %d' % len(str(v)), + '', + str(v) + ))) for k, v in kwargs.items()) + + if captcha: + img = _load_image(captcha) + body += '\r\n'.join(( + '', + '--%s' % boundary, + 'Content-Disposition: form-data; name="captchafile"; ' + 'filename="captcha"', + 'Content-Type: application/octet-stream', + 'Content-Length: %d' % len(img), + '', + img, + '--%s--' % boundary, + '' + 
)) + + response = self._call('captcha', body, { + 'Content-Type': 'multipart/form-data; boundary="%s"' % boundary + }) or {} + if response.get('captcha'): + return response + + +class SocketClient(Client): + + """Death by Captcha socket API client.""" + + TERMINATOR = '\r\n' + + def __init__(self, *args): + Client.__init__(self, *args) + self.socket_lock = threading.Lock() + self.socket = None + + def close(self): + if self.socket: + self._log('CLOSE') + try: + self.socket.shutdown(socket.SHUT_RDWR) + except socket.error: + pass + finally: + self.socket.close() + self.socket = None + + def connect(self): + if not self.socket: + self._log('CONN') + host = (socket.gethostbyname(SOCKET_HOST), + random.choice(SOCKET_PORTS)) + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.socket.settimeout(0) + try: + self.socket.connect(host) + except socket.error, err: + if (err.args[0] not in + (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): + self.close() + raise err + return self.socket + + def __del__(self): + self.close() + + def _sendrecv(self, sock, buf): + self._log('SEND', buf) + fds = [sock] + buf += self.TERMINATOR + response = '' + intvl_idx = 0 + while True: + intvl, intvl_idx = self._get_poll_interval(intvl_idx) + rds, wrs, exs = select.select((not buf and fds) or [], + (buf and fds) or [], + fds, + intvl) + if exs: + raise IOError('select() failed') + try: + if wrs: + while buf: + buf = buf[wrs[0].send(buf):] + elif rds: + while True: + s = rds[0].recv(256) + if not s: + raise IOError('recv(): connection lost') + else: + response += s + except socket.error, err: + if (err.args[0] not in + (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): + raise err + if response.endswith(self.TERMINATOR): + self._log('RECV', response) + return response.rstrip(self.TERMINATOR) + raise IOError('send/recv timed out') + + def _call(self, cmd, data=None): + if data is None: + data = {} + data['cmd'] = cmd + data['version'] = API_VERSION + request = json_encode(data) + + response = None + for _ in range(2): + if not self.socket and cmd != 'login': + self._call('login', self.userpwd.copy()) + self.socket_lock.acquire() + try: + sock = self.connect() + response = self._sendrecv(sock, request) + except IOError, err: + sys.stderr.write(str(err) + "\n") + self.close() + except socket.error, err: + sys.stderr.write(str(err) + "\n") + self.close() + raise IOError('Connection refused') + else: + break + finally: + self.socket_lock.release() + + if response is None: + raise IOError('Connection lost or timed out during API request') + + try: + response = json_decode(response) + except Exception: + raise RuntimeError('Invalid API response') + + if not response.get('error'): + return response + + error = response['error'] + if error in ('not-logged-in', 'invalid-credentials'): + raise AccessDeniedException('Access denied, check your credentials') + elif 'banned' == error: + raise AccessDeniedException('Access denied, account is suspended') + elif 'insufficient-funds' == error: + raise AccessDeniedException( + 'CAPTCHA was rejected due to low balance') + elif 'invalid-captcha' == error: + raise ValueError('CAPTCHA is not a valid image') + elif 'service-overload' == error: + raise OverflowError( + 'CAPTCHA was rejected due to service overload, try again later') + else: + self.socket_lock.acquire() + self.close() + self.socket_lock.release() + raise RuntimeError('API server error occured: %s' % error) + + def get_user(self): + return self._call('user') or {'user': 0} + + def get_captcha(self, 
cid): + return self._call('captcha', {'captcha': cid}) or {'captcha': 0} + + def upload(self, captcha=None, **kwargs): + data = {} + if captcha: + data['captcha'] = base64.b64encode(_load_image(captcha)) + if kwargs: + banner = kwargs.get('banner', '') + if banner: + kwargs['banner'] = base64.b64encode(_load_image(banner)) + data.update(kwargs) + response = self._call('upload', data) + if response.get('captcha'): + uploaded_captcha = dict( + (k, response.get(k)) + for k in ('captcha', 'text', 'is_correct') + ) + if not uploaded_captcha['text']: + uploaded_captcha['text'] = None + return uploaded_captcha + + def report(self, cid): + return not self._call('report', {'captcha': cid}).get('is_correct') + + +if '__main__' == __name__: + # Put your DBC username & password here: + # client = HttpClient(sys.argv[1], sys.argv[2]) + client = SocketClient(sys.argv[1], sys.argv[2]) + client.is_verbose = True + + print 'Your balance is %s US cents' % client.get_balance() + + for fn in sys.argv[3:]: + try: + # Put your CAPTCHA image file name or file-like object, and optional + # solving timeout (in seconds) here: + captcha = client.decode(fn, DEFAULT_TIMEOUT) + except Exception, e: + sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, )) + captcha = None + + if captcha: + print 'CAPTCHA %d solved: %s' % \ + (captcha['captcha'], captcha['text']) + + # Report as incorrectly solved if needed. Make sure the CAPTCHA was + # in fact incorrectly solved! + # try: + # client.report(captcha['captcha']) + # except Exception, e: + # sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, )) diff --git a/libs/python_anticaptcha/__init__.py b/libs/python_anticaptcha/__init__.py new file mode 100644 index 000000000..ac9f0550f --- /dev/null +++ b/libs/python_anticaptcha/__init__.py @@ -0,0 +1,7 @@ +from .base import AnticaptchaClient +from .tasks import NoCaptchaTask, NoCaptchaTaskProxylessTask, ImageToTextTask, FunCaptchaTask +from .proxy import Proxy +from .exceptions import AnticaptchaException +from .fields import SimpleText, Image, WebLink, TextInput, Textarea, Checkbox, Select, Radio, ImageUpload + +AnticatpchaException = AnticaptchaException \ No newline at end of file diff --git a/libs/python_anticaptcha/base.py b/libs/python_anticaptcha/base.py new file mode 100644 index 000000000..fca8cdf53 --- /dev/null +++ b/libs/python_anticaptcha/base.py @@ -0,0 +1,114 @@ +import requests +import time + +from six.moves.urllib_parse import urljoin +from .exceptions import AnticaptchaException + +SLEEP_EVERY_CHECK_FINISHED = 3 +MAXIMUM_JOIN_TIME = 60 * 5 + + +class Job(object): + client = None + task_id = None + _last_result = None + + def __init__(self, client, task_id): + self.client = client + self.task_id = task_id + + def _update(self): + self._last_result = self.client.getTaskResult(self.task_id) + + def check_is_ready(self): + self._update() + return self._last_result['status'] == 'ready' + + def get_solution_response(self): # Recaptcha + return self._last_result['solution']['gRecaptchaResponse'] + + def get_token_response(self): # Funcaptcha + return self._last_result['solution']['token'] + + def get_answers(self): + return self._last_result['solution']['answers'] + + def get_captcha_text(self): # Image + return self._last_result['solution']['text'] + + def report_incorrect(self): + return self.client.reportIncorrectImage(self.task_id) + + def join(self, maximum_time=None): + elapsed_time = 0 + maximum_time = maximum_time or MAXIMUM_JOIN_TIME + while not self.check_is_ready(): + 
time.sleep(SLEEP_EVERY_CHECK_FINISHED) + elapsed_time += SLEEP_EVERY_CHECK_FINISHED + if elapsed_time is not None and elapsed_time > maximum_time: + raise AnticaptchaException(None, 250, + "The execution time exceeded a maximum time of {} seconds. It takes {} seconds.".format( + maximum_time, elapsed_time)) + + +class AnticaptchaClient(object): + client_key = None + CREATE_TASK_URL = "/createTask" + TASK_RESULT_URL = "/getTaskResult" + BALANCE_URL = "/getBalance" + REPORT_IMAGE_URL = "/reportIncorrectImageCaptcha" + SOFT_ID = 847 + language_pool = "en" + + def __init__(self, client_key, language_pool="en", host="api.anti-captcha.com", use_ssl=True): + self.client_key = client_key + self.language_pool = language_pool + self.base_url = "{proto}://{host}/".format(proto="https" if use_ssl else "http", + host=host) + self.session = requests.Session() + + @property + def client_ip(self): + if not hasattr(self, '_client_ip'): + self._client_ip = self.session.get('http://httpbin.org/ip').json()['origin'] + return self._client_ip + + def _check_response(self, response): + if response.get('errorId', False) == 11: + response['errorDescription'] = "{} Your missing IP address is {}.".format(response['errorDescription'], + self.client_ip) + if response.get('errorId', False): + raise AnticaptchaException(response['errorId'], + response['errorCode'], + response['errorDescription']) + + def createTask(self, task): + request = {"clientKey": self.client_key, + "task": task.serialize(), + "softId": self.SOFT_ID, + "languagePool": self.language_pool, + } + response = self.session.post(urljoin(self.base_url, self.CREATE_TASK_URL), json=request).json() + self._check_response(response) + return Job(self, response['taskId']) + + def getTaskResult(self, task_id): + request = {"clientKey": self.client_key, + "taskId": task_id} + response = self.session.post(urljoin(self.base_url, self.TASK_RESULT_URL), json=request).json() + self._check_response(response) + return response + + def getBalance(self): + request = {"clientKey": self.client_key} + response = self.session.post(urljoin(self.base_url, self.BALANCE_URL), json=request).json() + self._check_response(response) + return response['balance'] + + def reportIncorrectImage(self, task_id): + request = {"clientKey": self.client_key, + "taskId": task_id + } + response = self.session.post(urljoin(self.base_url, self.REPORT_IMAGE_URL), json=request).json() + self._check_response(response) + return response.get('status', False) != False diff --git a/libs/python_anticaptcha/exceptions.py b/libs/python_anticaptcha/exceptions.py new file mode 100644 index 000000000..f37eb372c --- /dev/null +++ b/libs/python_anticaptcha/exceptions.py @@ -0,0 +1,23 @@ +class AnticaptchaException(Exception): + def __init__(self, error_id, error_code, error_description, *args): + super(AnticaptchaException, self).__init__("[{}:{}]{}".format(error_code, error_id, error_description)) + self.error_description = error_description + self.error_id = error_id + self.error_code = error_code + + +AnticatpchaException = AnticaptchaException + + +class InvalidWidthException(AnticaptchaException): + def __init__(self, width): + self.width = width + msg = 'Invalid width (%s). Can be one of these: 100, 50, 33, 25.' % (self.width,) + super(InvalidWidthException, self).__init__("AC-1", 1, msg) + + +class MissingNameException(AnticaptchaException): + def __init__(self, cls): + self.cls = cls + msg = 'Missing name data in {0}. 
Provide {0}.__init__(name="X") or {0}.serialize(name="X")'.format(str(self.cls)) + super(MissingNameException, self).__init__("AC-2", 2, msg) diff --git a/libs/python_anticaptcha/fields.py b/libs/python_anticaptcha/fields.py new file mode 100644 index 000000000..9e6245946 --- /dev/null +++ b/libs/python_anticaptcha/fields.py @@ -0,0 +1,199 @@ +import six +from python_anticaptcha.exceptions import InvalidWidthException, MissingNameException + + +class BaseField(object): + label = None + labelHint = None + + def serialize(self, name=None): + data = {} + if self.label: + data['label'] = self.label or False + if self.labelHint: + data['labelHint'] = self.labelHint or False + return data + + +class NameBaseField(BaseField): + name = None + + def serialize(self, name=None): + data = super(NameBaseField, self).serialize(name) + if name: + data['name'] = name + elif self.name: + data['name'] = self.name + else: + raise MissingNameException(cls=self.__class__) + return data + + +class SimpleText(BaseField): + contentType = 'text' + + def __init__(self, content, label=None, labelHint=None, width=None): + self.label = label + self.labelHint = labelHint + + self.content = content + self.width = width + + def serialize(self, name=None): + data = super(SimpleText, self).serialize(name) + data['contentType'] = self.contentType + data['content'] = self.content + + if self.width: + if self.width not in [100, 50, 33, 25]: + raise InvalidWidthException(self.width) + data['inputOptions'] = {} + data['width'] = self.width + return data + + +class Image(BaseField): + contentType = 'image' + + def __init__(self, imageUrl, label=None, labelHint=None): + self.label = label + self.labelHint = labelHint + self.imageUrl = imageUrl + + def serialize(self, name=None): + data = super(Image, self).serialize(name) + data['contentType'] = self.contentType + data['content'] = self.imageUrl + return data + + +class WebLink(BaseField): + contentType = 'link' + + def __init__(self, linkText, linkUrl, label=None, labelHint=None, width=None): + self.label = label + self.labelHint = labelHint + + self.linkText = linkText + self.linkUrl = linkUrl + + self.width = width + + def serialize(self, name=None): + data = super(WebLink, self).serialize(name) + data['contentType'] = self.contentType + + if self.width: + if self.width not in [100, 50, 33, 25]: + raise InvalidWidthException(self.width) + data['inputOptions'] = {} + data['width'] = self.width + + data.update({'content': {'url': self.linkUrl, + 'text': self.linkText}}) + + return data + + +class TextInput(NameBaseField): + def __init__(self, placeHolder=None, label=None, labelHint=None, width=None): + self.label = label + self.labelHint = labelHint + + self.placeHolder = placeHolder + + self.width = width + + def serialize(self, name=None): + data = super(TextInput, self).serialize(name) + data['inputType'] = 'text' + + data['inputOptions'] = {} + + if self.width: + if self.width not in [100, 50, 33, 25]: + raise InvalidWidthException(self.width) + + data['inputOptions']['width'] = str(self.width) + + if self.placeHolder: + data['inputOptions']['placeHolder'] = self.placeHolder + return data + + +class Textarea(NameBaseField): + def __init__(self, placeHolder=None, rows=None, label=None, width=None, labelHint=None): + self.label = label + self.labelHint = labelHint + + self.placeHolder = placeHolder + self.rows = rows + self.width = width + + def serialize(self, name=None): + data = super(Textarea, self).serialize(name) + data['inputType'] = 'textarea' + data['inputOptions'] = 
{} + if self.rows: + data['inputOptions']['rows'] = str(self.rows) + if self.placeHolder: + data['inputOptions']['placeHolder'] = self.placeHolder + if self.width: + data['inputOptions']['width'] = str(self.width) + return data + + +class Checkbox(NameBaseField): + def __init__(self, text, label=None, labelHint=None): + self.label = label + self.labelHint = labelHint + + self.text = text + + def serialize(self, name=None): + data = super(Checkbox, self).serialize(name) + data['inputType'] = 'checkbox' + data['inputOptions'] = {'label': self.text} + return data + + +class Select(NameBaseField): + type = 'select' + + def __init__(self, label=None, choices=None, labelHint=None): + self.label = label + self.labelHint = labelHint + self.choices = choices or () + + def get_choices(self): + for choice in self.choices: + if isinstance(choice, six.text_type): + yield choice, choice + else: + yield choice + + def serialize(self, name=None): + data = super(Select, self).serialize(name) + data['inputType'] = self.type + + data['inputOptions'] = [] + for value, caption in self.get_choices(): + data['inputOptions'].append({"value": value, + "caption": caption}) + + return data + + +class Radio(Select): + type = 'radio' + + +class ImageUpload(NameBaseField): + def __init__(self, label=None, labelHint=None): + self.label = label + self.labelHint = labelHint + + def serialize(self, name=None): + data = super(ImageUpload, self).serialize(name) + data['inputType'] = 'imageUpload' + return data diff --git a/libs/python_anticaptcha/proxy.py b/libs/python_anticaptcha/proxy.py new file mode 100644 index 000000000..907232f7e --- /dev/null +++ b/libs/python_anticaptcha/proxy.py @@ -0,0 +1,28 @@ +from six.moves.urllib_parse import urlparse + + +class Proxy(object): + def __init__(self, proxy_type, proxy_address, proxy_port, proxy_login, proxy_password): + self.proxyType = proxy_type + self.proxyAddress = proxy_address + self.proxyPort = proxy_port + self.proxyLogin = proxy_login + self.proxyPassword = proxy_password + + def serialize(self): + result = {'proxyType': self.proxyType, + 'proxyAddress': self.proxyAddress, + 'proxyPort': self.proxyPort} + if self.proxyLogin or self.proxyPassword: + result['proxyLogin'] = self.proxyLogin + result['proxyPassword'] = self.proxyPassword + return result + + @classmethod + def parse_url(cls, url): + parsed = urlparse(url) + return cls(proxy_type=parsed.scheme, + proxy_address=parsed.hostname, + proxy_port=parsed.port, + proxy_login=parsed.username, + proxy_password=parsed.password) diff --git a/libs/python_anticaptcha/tasks.py b/libs/python_anticaptcha/tasks.py new file mode 100644 index 000000000..57462763f --- /dev/null +++ b/libs/python_anticaptcha/tasks.py @@ -0,0 +1,128 @@ +import base64 +from .fields import BaseField + + +class BaseTask(object): + def serialize(self, **result): + return result + + +class ProxyMixin(BaseTask): + def __init__(self, *args, **kwargs): + self.proxy = kwargs.pop('proxy') + self.userAgent = kwargs.pop('user_agent') + self.cookies = kwargs.pop('cookies', '') + super(ProxyMixin, self).__init__(*args, **kwargs) + + def serialize(self, **result): + result = super(ProxyMixin, self).serialize(**result) + result.update(self.proxy.serialize()) + result['userAgent'] = self.userAgent + if self.cookies: + result['cookies'] = self.cookies + return result + + +class NoCaptchaTaskProxylessTask(BaseTask): + type = "NoCaptchaTaskProxyless" + websiteURL = None + websiteKey = None + websiteSToken = None + + def __init__(self, website_url, website_key, 
website_s_token=None, is_invisible=None): + self.websiteURL = website_url + self.websiteKey = website_key + self.websiteSToken = website_s_token + self.isInvisible = is_invisible + + def serialize(self): + data = {'type': self.type, + 'websiteURL': self.websiteURL, + 'websiteKey': self.websiteKey} + if self.websiteSToken is not None: + data['websiteSToken'] = self.websiteSToken + if self.isInvisible is not None: + data['isInvisible'] = self.isInvisible + return data + + +class FunCaptchaTask(ProxyMixin): + type = "FunCaptchaTask" + websiteURL = None + websiteKey = None + + def __init__(self, website_url, website_key, *args, **kwargs): + self.websiteURL = website_url + self.websiteKey = website_key + super(FunCaptchaTask, self).__init__(*args, **kwargs) + + def serialize(self, **result): + result = super(FunCaptchaTask, self).serialize(**result) + result.update({'type': self.type, + 'websiteURL': self.websiteURL, + 'websitePublicKey': self.websiteKey}) + return result + + +class NoCaptchaTask(ProxyMixin, NoCaptchaTaskProxylessTask): + type = "NoCaptchaTask" + + +class ImageToTextTask(object): + type = "ImageToTextTask" + fp = None + phrase = None + case = None + numeric = None + math = None + minLength = None + maxLength = None + + def __init__(self, fp, phrase=None, case=None, numeric=None, math=None, min_length=None, max_length=None): + self.fp = fp + self.phrase = phrase + self.case = case + self.numeric = numeric + self.math = math + self.minLength = min_length + self.maxLength = max_length + + def serialize(self): + return {'type': self.type, + 'body': base64.b64encode(self.fp.read()).decode('utf-8'), + 'phrase': self.phrase, + 'case': self.case, + 'numeric': self.numeric, + 'math': self.math, + 'minLength': self.minLength, + 'maxLength': self.maxLength} + + +class CustomCaptchaTask(BaseTask): + type = 'CustomCaptchaTask' + imageUrl = None + assignment = None + form = None + + def __init__(self, imageUrl, form=None, assignment=None): + self.imageUrl = imageUrl + self.form = form or {} + self.assignment = assignment + + def serialize(self): + data = super(CustomCaptchaTask, self).serialize() + data.update({'type': self.type, + 'imageUrl': self.imageUrl}) + if self.form: + forms = [] + for name, field in self.form.items(): + if isinstance(field, BaseField): + forms.append(field.serialize(name)) + else: + field = field.copy() + field['name'] = name + forms.append(field) + data['forms'] = forms + if self.assignment: + data['assignment'] = self.assignment + return data diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py new file mode 100644 index 000000000..12be90384 --- /dev/null +++ b/libs/subliminal_patch/pitcher.py @@ -0,0 +1,212 @@ +# coding=utf-8 + +import time +import logging +import json +import requests +from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\ + Proxy +from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT + + +logger = logging.getLogger(__name__) + + +class PitcherRegistry(object): + pitchers = {} + + def register(self, cls): + self.pitchers[cls.name] = cls + return cls + + def get_pitcher(self, name): + return self.pitchers[name] + + +registry = pitchers = PitcherRegistry() + + +class Pitcher(object): + name = None + tries = 3 + job = None + client = None + website_url = None + website_key = None + website_name = None + solve_time = None + success = False + + def __init__(self, website_name, website_url, website_key, tries=3, *args, **kwargs): + self.tries 
= tries + self.website_name = website_name + self.website_key = website_key + self.website_url = website_url + self.success = False + self.solve_time = None + + def get_client(self): + raise NotImplementedError + + def get_job(self): + raise NotImplementedError + + def _throw(self): + self.client = self.get_client() + self.job = self.get_job() + + def throw(self): + t = time.time() + data = self._throw() + if self.success: + self.solve_time = time.time() - t + logger.info("%s: Solving took %ss", self.website_name, int(self.solve_time)) + return data + + +@registry.register +class AntiCaptchaProxyLessPitcher(Pitcher): + name = "AntiCaptchaProxyLess" + host = "api.anti-captcha.com" + language_pool = "en" + client_key = None + use_ssl = True + is_invisible = False + + def __init__(self, website_name, client_key, website_url, website_key, tries=3, host=None, language_pool=None, + use_ssl=True, is_invisible=False, *args, **kwargs): + super(AntiCaptchaProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries, *args, + **kwargs) + self.client_key = client_key + self.host = host or self.host + self.language_pool = language_pool or self.language_pool + self.use_ssl = use_ssl + self.is_invisible = is_invisible + + def get_client(self): + return AnticaptchaClient(self.client_key, self.language_pool, self.host, self.use_ssl) + + def get_job(self): + task = NoCaptchaTaskProxylessTask(website_url=self.website_url, website_key=self.website_key, + is_invisible=self.is_invisible) + return self.client.createTask(task) + + def _throw(self): + for i in range(self.tries): + try: + super(AntiCaptchaProxyLessPitcher, self)._throw() + self.job.join() + ret = self.job.get_solution_response() + if ret: + self.success = True + return ret + except AnticaptchaException as e: + if i >= self.tries - 1: + logger.error("%s: Captcha solving finally failed. Exiting", self.website_name) + return + + if e.error_code == 'ERROR_ZERO_BALANCE': + logger.error("%s: No balance left on captcha solving service. 
Exiting", self.website_name) + return + + elif e.error_code == 'ERROR_NO_SLOT_AVAILABLE': + logger.info("%s: No captcha solving slot available, retrying", self.website_name) + time.sleep(5.0) + continue + + elif e.error_code == 'ERROR_KEY_DOES_NOT_EXIST': + logger.error("%s: Bad AntiCaptcha API key", self.website_name) + return + + elif e.error_id is None and e.error_code == 250: + # timeout + if i < self.tries: + logger.info("%s: Captcha solving timed out, retrying", self.website_name) + time.sleep(1.0) + continue + else: + logger.error("%s: Captcha solving timed out three times; bailing out", self.website_name) + return + raise + + +@registry.register +class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher): + name = "AntiCaptcha" + proxy = None + user_agent = None + cookies = None + + def __init__(self, *args, **kwargs): + self.proxy = Proxy.parse_url(kwargs.pop("proxy")) + print self.proxy.__dict__ + self.user_agent = kwargs.pop("user_agent") + cookies = kwargs.pop("cookies", {}) + if isinstance(cookies, dict): + self.cookies = ";".join(["%s=%s" % (k, v) for k, v in cookies.iteritems()]) + + super(AntiCaptchaPitcher, self).__init__(*args, **kwargs) + + def get_job(self): + task = NoCaptchaTask(website_url=self.website_url, website_key=self.website_key, proxy=self.proxy, + user_agent=self.user_agent, cookies=self.cookies, is_invisible=self.is_invisible) + return self.client.createTask(task) + + +@registry.register +class DBCProxyLessPitcher(Pitcher): + name = "DeathByCaptchaProxyLess" + username = None + password = None + + def __init__(self, website_name, client_key, website_url, website_key, + timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs): + super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries) + + self.username, self.password = client_key.split(":", 1) + self.timeout = timeout + + def get_client(self): + return DBCClient(self.username, self.password) + + def get_job(self): + pass + + @property + def payload_dict(self): + return { + "googlekey": self.website_key, + "pageurl": self.website_url + } + + def _throw(self): + super(DBCProxyLessPitcher, self)._throw() + payload = json.dumps(self.payload_dict) + try: + #balance = self.client.get_balance() + data = self.client.decode(timeout=self.timeout, type=4, token_params=payload) + if data and data["is_correct"]: + self.success = True + return data["text"] + except: + raise + + +@registry.register +class DBCPitcher(DBCProxyLessPitcher): + proxy = None + proxy_type = "HTTP" + + def __init__(self, *args, **kwargs): + self.proxy = kwargs.pop("proxy") + super(DBCPitcher, self).__init__(*args, **kwargs) + + @property + def payload_dict(self): + payload = super(DBCPitcher, self).payload_dict + payload.update({ + "proxytype": self.proxy_type, + "proxy": self.proxy + }) + return payload + From 18b9ce814c0a8309934054600168a0681bd073ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sat, 6 Apr 2019 23:13:42 -0400 Subject: [PATCH 10/19] Fix for form validation error when upgrade subs is off but number of days is empty. Added dependency. 
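
(For reference before moving on: a minimal usage sketch of the captcha pitcher
registry added above in libs/subliminal_patch/pitcher.py. The class name,
constructor argument order and throw() call are taken from that file; the site
name, URL and keys are hypothetical placeholders, not values from this series.)

    # Sketch only -- not part of the patches. Exercises the PitcherRegistry
    # defined above; every site-specific value here is made up.
    from subliminal_patch.pitcher import pitchers

    pitcher_cls = pitchers.get_pitcher("AntiCaptchaProxyLess")
    pitcher = pitcher_cls("ExampleSite",                  # website_name, used in log lines
                          "your-anticaptcha-api-key",     # client_key for anti-captcha.com
                          "https://example.com/login",    # page hosting the ReCaptcha
                          "example-recaptcha-site-key")   # the site's ReCaptcha key
    token = pitcher.throw()  # polls until solved; returns the g-recaptcha response or None
    if token:
        pass  # submit the login form with the solved token attached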
--- views/settings.tpl | 1 + 1 file changed, 1 insertion(+) diff --git a/views/settings.tpl b/views/settings.tpl index 29072aeff..90028c4be 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -2616,6 +2616,7 @@ ] }, settings_days_to_upgrade_subs : { + depends: 'settings_upgrade_subs', rules : [ { type : 'integer[1..30]' From 6c101282f9cbb7782fa088dfb3bb5f0a6eecb6c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 16:09:17 -0400 Subject: [PATCH 11/19] Continuing development. --- bazarr/config.py | 13 +++++- bazarr/init.py | 3 -- views/settings.tpl | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 5 deletions(-) diff --git a/bazarr/config.py b/bazarr/config.py index 2d3405f04..db2afd358 100644 --- a/bazarr/config.py +++ b/bazarr/config.py @@ -41,7 +41,8 @@ defaults = { 'subfolder_custom': '', 'upgrade_subs': 'True', 'days_to_upgrade_subs': '7', - 'upgrade_manual': 'True' + 'upgrade_manual': 'True', + 'anti_captcha_provider': 'anti-captcha' }, 'auth': { 'type': 'None', @@ -98,7 +99,15 @@ defaults = { }, 'assrt': { 'token': '' - }} + }, + 'anticaptcha': { + 'anti_captcha_key': '' + }, + 'deathbycaptcha': { + 'username': '', + 'password': '' + } +} settings = simpleconfigparser(defaults=defaults) settings.read(os.path.join(args.config_dir, 'config', 'config.ini')) diff --git a/bazarr/init.py b/bazarr/init.py index 7c13cc24d..eb3af0ce3 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -17,9 +17,6 @@ from get_args import args # set subliminal_patch user agent os.environ["SZ_USER_AGENT"] = "Bazarr/1" -# set anticaptcha account key -os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.general.anticaptcha_key - # Check if args.config_dir exist if not os.path.exists(args.config_dir): # Create config_dir directory tree diff --git a/views/settings.tpl b/views/settings.tpl index 29072aeff..f6068244e 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -1228,6 +1228,78 @@
+            <!-- [The added markup for this section was lost in text extraction.
+            From the surviving heading and the script further down, this hunk adds
+            an "Anti-captcha options" section: a provider dropdown (None /
+            Anti-Captcha / Death by Captcha), an input for the Anti-Captcha
+            account key, and Death by Captcha username/password inputs, wrapped
+            in the .anticaptcha / .deathbycaptcha classes that the script toggles.] -->
             Subtitles providers
@@ -2235,6 +2307,30 @@
                 }
             });
 
+            if ($('#settings_anti_captcha_provider').val() === "None") {
+                $('.anticaptcha').hide();
+                $('.deathbycaptcha').hide();
+            } else if ($('#settings_anti_captcha_provider').val() === "anti-captcha") {
+                $('.anticaptcha').show();
+                $('.deathbycaptcha').hide();
+            } else if ($('#settings_anti_captcha_provider').val() === "death-by-captcha") {
+                $('.deathbycaptcha').show();
+                $('.anticaptcha').hide();
+            }
+
+            $('#settings_anti_captcha_provider').dropdown('setting', 'onChange', function(){
+                if ($('#settings_anti_captcha_provider').val() === "None") {
+                    $('.anticaptcha').hide();
+                    $('.deathbycaptcha').hide();
+                } else if ($('#settings_anti_captcha_provider').val() === "anti-captcha") {
+                    $('.anticaptcha').show();
+                    $('.deathbycaptcha').hide();
+                } else if ($('#settings_anti_captcha_provider').val() === "death-by-captcha") {
+                    $('.deathbycaptcha').show();
+                    $('.anticaptcha').hide();
+                }
+            });
+
             if ($('#settings_use_postprocessing').data("postprocessing") === "True") {
                 $('.postprocessing').show();
             } else {
@@ -2445,6 +2541,8 @@
         $('#settings_page_size').dropdown('set selected','{{!settings.general.page_size}}');
         $('#settings_subfolder').dropdown('clear');
         $('#settings_subfolder').dropdown('set selected', '{{!settings.general.subfolder}}');
+        $('#settings_anti_captcha_provider').dropdown('clear');
+        $('#settings_anti_captcha_provider').dropdown('set selected', '{{!settings.general.anti_captcha_provider}}');
         $('#settings_proxy_type').dropdown('clear');
         $('#settings_proxy_type').dropdown('set selected','{{!settings.proxy.type}}');
         $('#settings_providers').dropdown('clear');

From 24833509369b68200819fab4b6a6500e263be80f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20V=C3=A9zina?=
 <5130500+morpheus65535@users.noreply.github.com>
Date: Sun, 7 Apr 2019 16:10:11 -0400
Subject: [PATCH 12/19] Continuing development.

---
 bazarr/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bazarr/config.py b/bazarr/config.py
index db2afd358..07a8d6965 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -42,7 +42,7 @@ defaults = {
         'upgrade_subs': 'True',
         'days_to_upgrade_subs': '7',
         'upgrade_manual': 'True',
-        'anti_captcha_provider': 'anti-captcha'
+        'anti_captcha_provider': 'None'
     },
     'auth': {
         'type': 'None',

From f5d9a868a109715750ac8fc8aaac64e066310111 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20V=C3=A9zina?=
 <5130500+morpheus65535@users.noreply.github.com>
Date: Sun, 7 Apr 2019 17:54:31 -0400
Subject: [PATCH 13/19] Continuing development.
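
(A minimal sketch of the intended wiring, not code from this patch: the
pitchers in libs/subliminal_patch/pitcher.py, as updated in the next patch,
read ANTICAPTCHA_CLASS and ANTICAPTCHA_ACCOUNT_KEY from the environment, so
the values saved here would plausibly be exported along these lines. The
username:password format for Death by Captcha comes from pitcher.py; the
mapping itself is an assumption.)

    # Sketch only -- assumed glue between config.py and pitcher.py; the
    # env-var names and the username:password key format come from pitcher.py.
    import os
    from config import settings

    provider = settings.general.anti_captcha_provider
    if provider == 'anti-captcha':
        os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess'
        os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = str(settings.anticaptcha.anti_captcha_key)
    elif provider == 'death-by-captcha':
        os.environ["ANTICAPTCHA_CLASS"] = 'DeathByCaptchaProxyLess'
        os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = '%s:%s' % (settings.deathbycaptcha.username,
                                                           settings.deathbycaptcha.password)
    else:
        os.environ.pop("ANTICAPTCHA_CLASS", None)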
--- bazarr/main.py | 8 ++++++++ views/settings.tpl | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bazarr/main.py b/bazarr/main.py index ea3e4414a..90e5d7931 100644 --- a/bazarr/main.py +++ b/bazarr/main.py @@ -1275,6 +1275,10 @@ def save_settings(): settings_upgrade_manual = 'False' else: settings_upgrade_manual = 'True' + settings_anti_captcha_provider = request.forms.get('settings_anti_captcha_provider') + settings_anti_captcha_key = request.forms.get('settings_anti_captcha_key') + settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username') + settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password') before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url), unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')), @@ -1306,6 +1310,10 @@ def save_settings(): settings.general.upgrade_subs = text_type(settings_upgrade_subs) settings.general.days_to_upgrade_subs = text_type(settings_days_to_upgrade_subs) settings.general.upgrade_manual = text_type(settings_upgrade_manual) + settings.general.anti_captcha_provider = text_type(settings_anti_captcha_provider) + settings.anticaptcha.anti_captcha_key = text_type(settings_anti_captcha_key) + settings.deathbycaptcha.username = text_type(settings_death_by_captcha_username) + settings.deathbycaptcha.password = text_type(settings_death_by_captcha_password) settings.general.minimum_score_movie = text_type(settings_general_minimum_score_movies) settings.general.use_embedded_subs = text_type(settings_general_embedded) settings.general.adaptive_searching = text_type(settings_general_adaptive_searching) diff --git a/views/settings.tpl b/views/settings.tpl index 29760a479..d3af01107 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -1305,7 +1305,7 @@
-            [settings.tpl line garbled in extraction]
+            [settings.tpl line garbled in extraction]
@@ -1775,7 +1775,7 @@
-            [settings.tpl line garbled in extraction]
+            [settings.tpl line garbled in extraction]
From 8aef7bc0d3575d5eae50e5cae7bf9c81d9d846de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 19:31:39 -0400 Subject: [PATCH 14/19] Continuing development. --- libs/subliminal_patch/http.py | 6 +- libs/subliminal_patch/pitcher.py | 85 +++-- libs/subliminal_patch/providers/addic7ed.py | 62 ++-- .../providers/greeksubtitles.py | 184 ++++++++++ libs/subliminal_patch/providers/subs4free.py | 283 ++++++++++++++++ .../subliminal_patch/providers/subs4series.py | 272 +++++++++++++++ libs/subliminal_patch/providers/subssabbz.py | 159 +++++++++ libs/subliminal_patch/providers/subsunacs.py | 161 +++++++++ libs/subliminal_patch/providers/subz.py | 318 ++++++++++++++++++ libs/subliminal_patch/providers/titlovi.py | 239 +++++++------ libs/subliminal_patch/providers/xsubs.py | 302 +++++++++++++++++ 11 files changed, 1907 insertions(+), 164 deletions(-) create mode 100644 libs/subliminal_patch/providers/greeksubtitles.py create mode 100644 libs/subliminal_patch/providers/subs4free.py create mode 100644 libs/subliminal_patch/providers/subs4series.py create mode 100644 libs/subliminal_patch/providers/subssabbz.py create mode 100644 libs/subliminal_patch/providers/subsunacs.py create mode 100644 libs/subliminal_patch/providers/subz.py create mode 100644 libs/subliminal_patch/providers/xsubs.py diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index 465d5555e..c813f5585 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -61,8 +61,7 @@ class CertifiSession(CloudflareScraper): cache_key = "cf_data_%s" % domain - if not self.cookies.get("__cfduid", "", domain=domain) or not self.cookies.get("cf_clearance", "", - domain=domain): + if not self.cookies.get("__cfduid", "", domain=domain): cf_data = region.get(cache_key) if cf_data is not NO_VALUE: cf_cookies, user_agent = cf_data @@ -78,7 +77,8 @@ class CertifiSession(CloudflareScraper): except: pass else: - if cf_data != region.get(cache_key): + if cf_data != region.get(cache_key) and self.cookies.get("__cfduid", "", domain=domain)\ + and self.cookies.get("cf_clearance", "", domain=domain): logger.debug("Storing cf data for %s: %s", domain, cf_data) region.set(cache_key, cf_data) diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py index 12be90384..b2cef63b3 100644 --- a/libs/subliminal_patch/pitcher.py +++ b/libs/subliminal_patch/pitcher.py @@ -1,9 +1,11 @@ # coding=utf-8 +import os import time import logging import json -import requests +from subliminal.cache import region +from dogpile.cache.api import NO_VALUE from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\ Proxy from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT @@ -13,14 +15,29 @@ logger = logging.getLogger(__name__) class PitcherRegistry(object): - pitchers = {} + pitchers = [] + pitchers_by_key = {} def register(self, cls): - self.pitchers[cls.name] = cls + idx = len(self.pitchers) + self.pitchers.append(cls) + key = "%s_%s" % (cls.name, cls.needs_proxy) + key_by_source = "%s_%s" % (cls.source, cls.needs_proxy) + self.pitchers_by_key[key] = idx + self.pitchers_by_key[key_by_source] = idx return cls - def get_pitcher(self, name): - return self.pitchers[name] + def get_pitcher(self, name_or_site=None, with_proxy=False): + name_or_site = name_or_site or os.environ.get("ANTICAPTCHA_CLASS") + if not name_or_site: + raise Exception("AntiCaptcha class not 
given, exiting") + + key = "%s_%s" % (name_or_site, with_proxy) + + if key not in self.pitchers_by_key: + raise Exception("Pitcher %s not found (proxy: %s)" % (name_or_site, with_proxy)) + + return self.pitchers[self.pitchers_by_key.get(key)] registry = pitchers = PitcherRegistry() @@ -28,17 +45,24 @@ registry = pitchers = PitcherRegistry() class Pitcher(object): name = None + source = None + needs_proxy = False tries = 3 job = None client = None + client_key = None website_url = None website_key = None website_name = None solve_time = None success = False - def __init__(self, website_name, website_url, website_key, tries=3, *args, **kwargs): + def __init__(self, website_name, website_url, website_key, tries=3, client_key=None, *args, **kwargs): self.tries = tries + self.client_key = client_key or os.environ.get("ANTICAPTCHA_ACCOUNT_KEY") + if not self.client_key: + raise Exception("AntiCaptcha key not given, exiting") + self.website_name = website_name self.website_key = website_key self.website_url = website_url @@ -67,17 +91,17 @@ class Pitcher(object): @registry.register class AntiCaptchaProxyLessPitcher(Pitcher): name = "AntiCaptchaProxyLess" + source = "anti-captcha.com" host = "api.anti-captcha.com" language_pool = "en" - client_key = None + tries = 5 use_ssl = True is_invisible = False - def __init__(self, website_name, client_key, website_url, website_key, tries=3, host=None, language_pool=None, + def __init__(self, website_name, website_url, website_key, tries=3, host=None, language_pool=None, use_ssl=True, is_invisible=False, *args, **kwargs): super(AntiCaptchaProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries, *args, **kwargs) - self.client_key = client_key self.host = host or self.host self.language_pool = language_pool or self.language_pool self.use_ssl = use_ssl @@ -134,12 +158,12 @@ class AntiCaptchaProxyLessPitcher(Pitcher): class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher): name = "AntiCaptcha" proxy = None + needs_proxy = True user_agent = None cookies = None def __init__(self, *args, **kwargs): self.proxy = Proxy.parse_url(kwargs.pop("proxy")) - print self.proxy.__dict__ self.user_agent = kwargs.pop("user_agent") cookies = kwargs.pop("cookies", {}) if isinstance(cookies, dict): @@ -156,14 +180,15 @@ class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher): @registry.register class DBCProxyLessPitcher(Pitcher): name = "DeathByCaptchaProxyLess" + source = "deathbycaptcha.com" username = None password = None - def __init__(self, website_name, client_key, website_url, website_key, + def __init__(self, website_name, website_url, website_key, timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs): super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries) - self.username, self.password = client_key.split(":", 1) + self.username, self.password = self.client_key.split(":", 1) self.timeout = timeout def get_client(self): @@ -182,19 +207,22 @@ class DBCProxyLessPitcher(Pitcher): def _throw(self): super(DBCProxyLessPitcher, self)._throw() payload = json.dumps(self.payload_dict) - try: - #balance = self.client.get_balance() - data = self.client.decode(timeout=self.timeout, type=4, token_params=payload) - if data and data["is_correct"]: - self.success = True - return data["text"] - except: - raise + for i in range(self.tries): + try: + #balance = self.client.get_balance() + data = self.client.decode(timeout=self.timeout, type=4, token_params=payload) + if data and data["is_correct"] and data["text"]: + 
self.success = True + return data["text"] + except: + raise @registry.register class DBCPitcher(DBCProxyLessPitcher): + name = "DeathByCaptcha" proxy = None + needs_proxy = True proxy_type = "HTTP" def __init__(self, *args, **kwargs): @@ -210,3 +238,20 @@ class DBCPitcher(DBCProxyLessPitcher): }) return payload + +def load_verification(site_name, session, callback=lambda x: None): + ccks = region.get("%s_data" % site_name, expiration_time=15552000) # 6m + if ccks != NO_VALUE: + cookies, user_agent = ccks + logger.debug("%s: Re-using previous user agent: %s", site_name.capitalize(), user_agent) + session.headers["User-Agent"] = user_agent + try: + session.cookies._cookies.update(cookies) + return callback(region) + except: + return False + return False + + +def store_verification(site_name, session): + region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"])) diff --git a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index 086343e98..2d556d877 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -1,24 +1,20 @@ # coding=utf-8 import logging import re -import os import datetime import subliminal import time -import requests from random import randint -from dogpile.cache.api import NO_VALUE from requests import Session -from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException -from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError +from subliminal.cache import region +from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError from subliminal.providers.addic7ed import Addic7edProvider as _Addic7edProvider, \ Addic7edSubtitle as _Addic7edSubtitle, ParserBeautifulSoup, show_cells_re -from subliminal.cache import region from subliminal.subtitle import fix_line_ending from subliminal_patch.utils import sanitize from subliminal_patch.exceptions import TooManyRequests -from subliminal_patch.pitcher import pitchers +from subliminal_patch.pitcher import pitchers, load_verification, store_verification from subzero.language import Language logger = logging.getLogger(__name__) @@ -86,24 +82,19 @@ class Addic7edProvider(_Addic7edProvider): # login if self.username and self.password: - ccks = region.get("addic7ed_data", expiration_time=15552000) # 6m - if ccks != NO_VALUE: - cookies, user_agent = ccks - logger.debug("Addic7ed: Re-using previous user agent") - self.session.headers["User-Agent"] = user_agent - try: - self.session.cookies._cookies.update(cookies) - r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10, - headers={"Referer": self.server_url}) - if r.status_code == 302: - logger.info('Addic7ed: Login expired') - region.delete("addic7ed_data") - else: - logger.info('Addic7ed: Re-using old login') - self.logged_in = True - return - except: - pass + def check_verification(cache_region): + rr = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10, + headers={"Referer": self.server_url}) + if rr.status_code == 302: + logger.info('Addic7ed: Login expired') + cache_region.delete("addic7ed_data") + else: + logger.info('Addic7ed: Re-using old login') + self.logged_in = True + return True + + if load_verification("addic7ed", self.session, callback=check_verification): + return logger.info('Addic7ed: Logging in') data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 
'url': '', @@ -115,25 +106,16 @@ class Addic7edProvider(_Addic7edProvider): if "grecaptcha" in r.content: logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only ' 'happen once every so often') - anticaptcha_key = os.environ.get("ANTICAPTCHA_ACCOUNT_KEY") - if not anticaptcha_key: - logger.error("AntiCaptcha key not given, exiting") - return - - anticaptcha_proxy = os.environ.get("ANTICAPTCHA_PROXY") site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1) if not site_key: logger.error("Addic7ed: Captcha site-key not found!") return - #pitcher_cls = pitchers.get_pitcher("AntiCaptchaProxyLess") - #pitcher = pitcher_cls("Addic7ed", anticaptcha_key, self.server_url + 'login.php', site_key) - pitcher_cls = pitchers.get_pitcher("AntiCaptchaProxyLess") - pitcher = pitcher_cls("Addic7ed", anticaptcha_key, self.server_url + 'login.php', site_key, - user_agent=self.session.headers["User-Agent"], - cookies=self.session.cookies.get_dict(), - is_invisible=True) + pitcher = pitchers.get_pitcher()("Addic7ed", self.server_url + 'login.php', site_key, + user_agent=self.session.headers["User-Agent"], + cookies=self.session.cookies.get_dict(), + is_invisible=True) result = pitcher.throw() if not result: @@ -156,13 +138,13 @@ class Addic7edProvider(_Addic7edProvider): raise AuthenticationError(self.username) break - region.set("addic7ed_data", (self.session.cookies._cookies, self.session.headers["User-Agent"])) + store_verification("addic7ed", self.session) logger.debug('Addic7ed: Logged in') self.logged_in = True def terminate(self): - pass + self.session.close() @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) def _get_show_ids(self): diff --git a/libs/subliminal_patch/providers/greeksubtitles.py b/libs/subliminal_patch/providers/greeksubtitles.py new file mode 100644 index 000000000..98dfc289e --- /dev/null +++ b/libs/subliminal_patch/providers/greeksubtitles.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- +import io +import logging +import os +import zipfile + +import rarfile +from subzero.language import Language +from guessit import guessit +from requests import Session +from six import text_type + +from subliminal import __short_version__ +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.video import Episode, Movie + +logger = logging.getLogger(__name__) + + +class GreekSubtitlesSubtitle(Subtitle): + """GreekSubtitles Subtitle.""" + provider_name = 'greeksubtitles' + + def __init__(self, language, page_link, version, download_link): + super(GreekSubtitlesSubtitle, self).__init__(language, page_link=page_link) + self.version = version + self.download_link = download_link + self.hearing_impaired = None + self.encoding = 'windows-1253' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + # episode + if isinstance(video, Episode): + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) + # movie + elif isinstance(video, Movie): + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) + + return matches + + +class GreekSubtitlesProvider(Provider): + """GreekSubtitles Provider.""" + languages = {Language(l) for l in ['ell', 'eng']} + server_url = 'http://gr.greek-subtitles.com/' + search_url = 'search.php?name={}' + download_url = 
'http://www.greeksubtitles.info/getp.php?id={:d}' + subtitle_class = GreekSubtitlesSubtitle + + def __init__(self): + self.session = None + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + def terminate(self): + self.session.close() + + def query(self, keyword, season=None, episode=None, year=None): + params = keyword + if season and episode: + params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode) + elif year: + params += ' {:4d}'.format(year) + + logger.debug('Searching subtitles %r', params) + subtitles = [] + search_link = self.server_url + text_type(self.search_url).format(params) + while True: + r = self.session.get(search_link, timeout=30) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) + + # loop over subtitles cells + for cell in soup.select('td.latest_name > a:nth-of-type(1)'): + # read the item + subtitle_id = int(cell['href'].rsplit('/', 2)[1]) + page_link = cell['href'] + language = Language.fromalpha2(cell.parent.find('img')['src'].split('/')[-1].split('.')[0]) + version = cell.text.strip() or None + if version is None: + version = "" + + subtitle = self.subtitle_class(language, page_link, version, self.download_url.format(subtitle_id)) + + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + anchors = soup.select('td a') + next_page_available = False + for anchor in anchors: + if 'Next' in anchor.text and 'search.php' in anchor['href']: + search_link = self.server_url + anchor['href'] + next_page_available = True + break + if not next_page_available: + break + + return subtitles + + def list_subtitles(self, video, languages): + if isinstance(video, Episode): + titles = [video.series] + video.alternative_series + elif isinstance(video, Movie): + titles = [video.title] + video.alternative_titles + else: + titles = [] + + subtitles = [] + # query for subtitles with the show_id + for title in titles: + if isinstance(video, Episode): + subtitles += [s for s in self.query(title, season=video.season, episode=video.episode, + year=video.year) + if s.language in languages] + elif isinstance(video, Movie): + subtitles += [s for s in self.query(title, year=video.year) + if s.language in languages] + + return subtitles + + def download_subtitle(self, subtitle): + if isinstance(subtitle, GreekSubtitlesSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, + timeout=30) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. 
No data returned from provider') + return + + archive = _get_archive(r.content) + + subtitle_content = _get_subtitle_from_archive(archive) + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug('Could not extract subtitle from %r', archive) + + +def _get_archive(content): + # open the archive + archive_stream = io.BytesIO(content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive = rarfile.RarFile(archive_stream) + elif zipfile.is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive = zipfile.ZipFile(archive_stream) + + return archive + + +def _get_subtitle_from_archive(archive): + for name in archive.namelist(): + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + return archive.read(name) + + return None diff --git a/libs/subliminal_patch/providers/subs4free.py b/libs/subliminal_patch/providers/subs4free.py new file mode 100644 index 000000000..181b99351 --- /dev/null +++ b/libs/subliminal_patch/providers/subs4free.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- +# encoding=utf8 +import io +import logging +import os +import random + +import rarfile +import re +import zipfile + +from subzero.language import Language +from guessit import guessit +from requests import Session +from six import text_type + +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal import __short_version__ +from subliminal.cache import SHOW_EXPIRATION_TIME, region +from subliminal.score import get_equivalent_release_groups +from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.utils import sanitize, sanitize_release_group +from subliminal.video import Movie + +logger = logging.getLogger(__name__) + +year_re = re.compile(r'^\((\d{4})\)$') + + +class Subs4FreeSubtitle(Subtitle): + """Subs4Free Subtitle.""" + provider_name = 'subs4free' + + def __init__(self, language, page_link, title, year, version, download_link): + super(Subs4FreeSubtitle, self).__init__(language, page_link=page_link) + self.title = title + self.year = year + self.version = version + self.download_link = download_link + self.hearing_impaired = None + self.encoding = 'utf8' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + # movie + if isinstance(video, Movie): + # title + if video.title and (sanitize(self.title) in ( + sanitize(name) for name in [video.title] + video.alternative_titles)): + matches.add('title') + # year + if video.year and self.year == video.year: + matches.add('year') + + # release_group + if (video.release_group and self.version and + any(r in sanitize_release_group(self.version) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): + matches.add('release_group') + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) + + return matches + + +class Subs4FreeProvider(Provider): + """Subs4Free Provider.""" + languages = {Language(l) for l in ['ell', 'eng']} + video_types = (Movie,) + server_url = 'https://www.sf4-industry.com' + download_url = '/getSub.html' + search_url = '/search_report.php?search={}&searchType=1' + subtitle_class = Subs4FreeSubtitle + + def __init__(self): + self.session = None + + def initialize(self): + self.session = 
Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + def terminate(self): + self.session.close() + + def get_show_ids(self, title, year=None): + """Get the best matching show id for `series` and `year``. + + First search in the result of :meth:`_get_show_suggestions`. + + :param title: show title. + :param year: year of the show, if any. + :type year: int + :return: the show id, if found. + :rtype: str + + """ + title_sanitized = sanitize(title).lower() + show_ids = self._get_suggestions(title) + + matched_show_ids = [] + for show in show_ids: + show_id = None + show_title = sanitize(show['title']) + # attempt with year + if not show_id and year: + logger.debug('Getting show id with year') + show_id = show['link'].split('?p=')[-1] if show_title == '{title} {year:d}'.format( + title=title_sanitized, year=year) else None + + # attempt clean + if not show_id: + logger.debug('Getting show id') + show_id = show['link'].split('?p=')[-1] if show_title == title_sanitized else None + + if show_id: + matched_show_ids.append(show_id) + + return matched_show_ids + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, + should_cache_fn=lambda value: value) + def _get_suggestions(self, title): + """Search the show or movie id from the `title` and `year`. + + :param str title: title of the show. + :return: the show suggestions found. + :rtype: dict + + """ + # make the search + logger.info('Searching show ids with %r', title) + r = self.session.get(self.server_url + text_type(self.search_url).format(title), + headers={'Referer': self.server_url}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return {} + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + suggestions = [{'link': l.attrs['value'], 'title': l.text} + for l in soup.select('select[name="Mov_sel"] > option[value]')] + logger.debug('Found suggestions: %r', suggestions) + + return suggestions + + def query(self, movie_id, title, year): + # get the season list of the show + logger.info('Getting the subtitle list of show id %s', movie_id) + if movie_id: + page_link = self.server_url + '/' + movie_id + else: + page_link = self.server_url + text_type(self.search_url).format(' '.join([title, str(year)])) + + r = self.session.get(page_link, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['html.parser']) + + year_num = None + year_element = soup.select_one('td#dates_header > table div') + matches = False + if year_element: + matches = year_re.match(str(year_element.contents[2]).strip()) + if matches: + year_num = int(matches.group(1)) + + title_element = soup.select_one('td#dates_header > table u') + show_title = str(title_element.contents[0]).strip() if title_element else None + + subtitles = [] + # loop over episode rows + for subtitle in soup.select('table.table_border div[align="center"] > div'): + # read common info + version = subtitle.find('b').text + download_link = self.server_url + subtitle.find('a')['href'] + language = Language.fromalpha2(subtitle.find('img')['src'].split('/')[-1].split('.')[0]) + + subtitle = self.subtitle_class(language, page_link, show_title, year_num, version, download_link) + + logger.debug('Found subtitle {!r}'.format(subtitle)) + subtitles.append(subtitle) + + return subtitles + + def list_subtitles(self, video, languages): + # lookup show_id + titles = 
[video.title] + video.alternative_titles if isinstance(video, Movie) else [] + + show_ids = None + for title in titles: + show_ids = self.get_show_ids(title, video.year) + if show_ids and len(show_ids) > 0: + break + + subtitles = [] + # query for subtitles with the show_id + if show_ids and len(show_ids) > 0: + for show_id in show_ids: + subtitles += [s for s in self.query(show_id, video.title, video.year) if s.language in languages] + else: + subtitles += [s for s in self.query(None, video.title, video.year) if s.language in languages] + + return subtitles + + def download_subtitle(self, subtitle): + if isinstance(subtitle, Subs4FreeSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + download_element = soup.select_one('input[name="id"]') + image_element = soup.select_one('input[type="image"]') + subtitle_id = download_element['value'] if download_element else None + width = int(str(image_element['width']).strip('px')) if image_element else 0 + height = int(str(image_element['height']).strip('px')) if image_element else 0 + + if not subtitle_id: + logger.debug('Unable to download subtitle. No download link found') + return + + download_url = self.server_url + self.download_url + r = self.session.post(download_url, data={'utf8': 1, 'id': subtitle_id, 'x': random.randint(0, width), + 'y': random.randint(0, height)}, + headers={'Referer': subtitle.download_link}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. 
No data returned from provider') + return + + archive = _get_archive(r.content) + + subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content + + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug('Could not extract subtitle from %r', archive) + + +def _get_archive(content): + # open the archive + archive_stream = io.BytesIO(content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive = rarfile.RarFile(archive_stream) + elif zipfile.is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive = zipfile.ZipFile(archive_stream) + + return archive + + +def _get_subtitle_from_archive(archive): + for name in archive.namelist(): + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + return archive.read(name) + + return None diff --git a/libs/subliminal_patch/providers/subs4series.py b/libs/subliminal_patch/providers/subs4series.py new file mode 100644 index 000000000..5f381feeb --- /dev/null +++ b/libs/subliminal_patch/providers/subs4series.py @@ -0,0 +1,272 @@ +# -*- coding: utf-8 -*- +import io +import logging +import os + +import rarfile +import re +import zipfile + +from subzero.language import Language +from guessit import guessit +from requests import Session +from six import text_type + +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal import __short_version__ +from subliminal.cache import SHOW_EXPIRATION_TIME, region +from subliminal.score import get_equivalent_release_groups +from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.utils import sanitize, sanitize_release_group +from subliminal.video import Episode + +logger = logging.getLogger(__name__) + +year_re = re.compile(r'^\((\d{4})\)$') + + +class Subs4SeriesSubtitle(Subtitle): + """Subs4Series Subtitle.""" + provider_name = 'subs4series' + + def __init__(self, language, page_link, series, year, version, download_link): + super(Subs4SeriesSubtitle, self).__init__(language, page_link=page_link) + self.series = series + self.year = year + self.version = version + self.download_link = download_link + self.hearing_impaired = None + self.encoding = 'windows-1253' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + # episode + if isinstance(video, Episode): + # series name + if video.series and sanitize(self.series) in ( + sanitize(name) for name in [video.series] + video.alternative_series): + matches.add('series') + # year + if video.original_series and self.year is None or video.year and video.year == self.year: + matches.add('year') + + # release_group + if (video.release_group and self.version and + any(r in sanitize_release_group(self.version) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): + matches.add('release_group') + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) + + return matches + + +class Subs4SeriesProvider(Provider): + """Subs4Series Provider.""" + languages = {Language(l) for l in ['ell', 'eng']} + video_types = (Episode,) + server_url = 'https://www.subs4series.com' + search_url = '/search_report.php?search={}&searchType=1' + episode_link = 
'/tv-series/{show_id}/season-{season:d}/episode-{episode:d}' + subtitle_class = Subs4SeriesSubtitle + + def __init__(self): + self.session = None + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + def terminate(self): + self.session.close() + + def get_show_ids(self, title, year=None): + """Get the best matching show id for `series` and `year`. + + First search in the result of :meth:`_get_show_suggestions`. + + :param title: show title. + :param year: year of the show, if any. + :type year: int + :return: the show id, if found. + :rtype: str + + """ + title_sanitized = sanitize(title).lower() + show_ids = self._get_suggestions(title) + + matched_show_ids = [] + for show in show_ids: + show_id = None + show_title = sanitize(show['title']) + # attempt with year + if not show_id and year: + logger.debug('Getting show id with year') + show_id = '/'.join(show['link'].rsplit('/', 2)[1:]) if show_title == '{title} {year:d}'.format( + title=title_sanitized, year=year) else None + + # attempt clean + if not show_id: + logger.debug('Getting show id') + show_id = '/'.join(show['link'].rsplit('/', 2)[1:]) if show_title == title_sanitized else None + + if show_id: + matched_show_ids.append(show_id) + + return matched_show_ids + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, + should_cache_fn=lambda value: value) + def _get_suggestions(self, title): + """Search the show or movie id from the `title` and `year`. + + :param str title: title of the show. + :return: the show suggestions found. + :rtype: dict + + """ + # make the search + logger.info('Searching show ids with %r', title) + r = self.session.get(self.server_url + text_type(self.search_url).format(title), + headers={'Referer': self.server_url}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return {} + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + series = [{'link': l.attrs['value'], 'title': l.text} + for l in soup.select('select[name="Mov_sel"] > option[value]')] + logger.debug('Found suggestions: %r', series) + + return series + + def query(self, show_id, series, season, episode, title): + # get the season list of the show + logger.info('Getting the subtitle list of show id %s', show_id) + if all((show_id, season, episode)): + page_link = self.server_url + self.episode_link.format(show_id=show_id, season=season, episode=episode) + else: + return [] + + r = self.session.get(page_link, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + year_num = None + matches = year_re.match(str(soup.select_one('#dates_header_br > table div').contents[2]).strip()) + if matches: + year_num = int(matches.group(1)) + show_title = str(soup.select_one('#dates_header_br > table u').contents[0]).strip() + + subtitles = [] + # loop over episode rows + for subtitle in soup.select('table.table_border div[align="center"] > div'): + # read common info + version = subtitle.find('b').text + download_link = self.server_url + subtitle.find('a')['href'] + language = Language.fromalpha2(subtitle.find('img')['src'].split('/')[-1].split('.')[0]) + + subtitle = self.subtitle_class(language, page_link, show_title, year_num, version, download_link) + + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return 
subtitles + + def list_subtitles(self, video, languages): + # lookup show_id + titles = [video.series] + video.alternative_series if isinstance(video, Episode) else [] + + show_ids = None + for title in titles: + show_ids = self.get_show_ids(title, video.year) + if show_ids and len(show_ids) > 0: + break + + subtitles = [] + # query for subtitles with the show_id + for show_id in show_ids: + subtitles += [s for s in self.query(show_id, video.series, video.season, video.episode, video.title) + if s.language in languages] + + return subtitles + + def download_subtitle(self, subtitle): + if isinstance(subtitle, Subs4SeriesSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + download_element = soup.select_one('a.style55ws') + if not download_element: + download_element = soup.select_one('form[method="post"]') + target = download_element['action'] if download_element else None + else: + target = download_element['href'] + + if not target: + logger.debug('Unable to download subtitle. No download link found') + return + + download_url = self.server_url + target + r = self.session.get(download_url, headers={'Referer': subtitle.download_link}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + archive = _get_archive(r.content) + subtitle_content = _get_subtitle_from_archive(archive) if archive else r.content + + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug('Could not extract subtitle from %r', archive) + + +def _get_archive(content): + # open the archive + archive_stream = io.BytesIO(content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive = rarfile.RarFile(archive_stream) + elif zipfile.is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive = zipfile.ZipFile(archive_stream) + + return archive + + +def _get_subtitle_from_archive(archive): + for name in archive.namelist(): + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + return archive.read(name) + + return None diff --git a/libs/subliminal_patch/providers/subssabbz.py b/libs/subliminal_patch/providers/subssabbz.py new file mode 100644 index 000000000..d3d138884 --- /dev/null +++ b/libs/subliminal_patch/providers/subssabbz.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +import logging +import re +import io +import os +from random import randint +from bs4 import BeautifulSoup +from zipfile import ZipFile, is_zipfile +from rarfile import RarFile, is_rarfile +from requests import Session +from guessit import guessit +from subliminal_patch.providers import Provider +from subliminal_patch.subtitle import Subtitle +from subliminal_patch.utils import sanitize +from subliminal.exceptions import ProviderError +from subliminal.utils import sanitize_release_group +from subliminal.subtitle import guess_matches +from subliminal.video import Episode, Movie +from subliminal.subtitle import fix_line_ending +from subzero.language import Language +from .utils import 
FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST + +logger = logging.getLogger(__name__) + +class SubsSabBzSubtitle(Subtitle): + """SubsSabBz Subtitle.""" + provider_name = 'subssabbz' + + def __init__(self, langauge, filename, type): + super(SubsSabBzSubtitle, self).__init__(langauge) + self.langauge = langauge + self.filename = filename + self.type = type + + @property + def id(self): + return self.filename + + def get_matches(self, video): + matches = set() + + video_filename = video.name + video_filename = os.path.basename(video_filename) + video_filename, _ = os.path.splitext(video_filename) + video_filename = sanitize_release_group(video_filename) + + subtitle_filename = self.filename + subtitle_filename = os.path.basename(subtitle_filename) + subtitle_filename, _ = os.path.splitext(subtitle_filename) + subtitle_filename = sanitize_release_group(subtitle_filename) + + if video_filename == subtitle_filename: + matches.add('hash') + + matches |= guess_matches(video, guessit(self.filename, {'type': self.type})) + + matches.add(id(self)) + return matches + + +class SubsSabBzProvider(Provider): + """SubsSabBz Provider.""" + languages = {Language('por', 'BR')} | {Language(l) for l in [ + 'bul', 'eng' + ]} + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + self.session.headers["Accept-Language"] = "en-US,en;q=0.5" + self.session.headers["Accept-Encoding"] = "gzip, deflate, br" + self.session.headers["DNT"] = "1" + self.session.headers["Connection"] = "keep-alive" + self.session.headers["Upgrade-Insecure-Requests"] = "1" + self.session.headers["Cache-Control"] = "max-age=0" + + def terminate(self): + self.session.close() + + def query(self, language, video): + subtitles = [] + isEpisode = isinstance(video, Episode) + + params = { + 'act': 'search', + 'movie': '', + 'select-language': '2', + 'upldr': '', + 'yr': '', + 'release': '' + } + + if isEpisode: + params['movie'] = "%s %02d %02d" % (sanitize(video.series), video.season, video.episode) + else: + params['yr'] = video.year + params['movie'] = (video.title) + + if language == 'en' or language == 'eng': + params['select-language'] = 1 + + logger.info('Searching subtitle %r', params) + response = self.session.post('http://subs.sab.bz/index.php?', params=params, allow_redirects=False, timeout=10, headers={ + 'Referer': 'http://subs.sab.bz/', + }) + + response.raise_for_status() + + if response.status_code != 200: + logger.debug('No subtitles found') + return subtitles + + soup = BeautifulSoup(response.content, 'html.parser') + rows = soup.findAll('tr', {'class': 'subs-row'}) + + # Search on first 10 rows only + for row in rows[:10]: + a_element_wrapper = row.find('td', { 'class': 'c2field' }) + if a_element_wrapper: + element = a_element_wrapper.find('a') + if element: + link = element.get('href') + logger.info('Found subtitle link %r', link) + subtitles = subtitles + self.download_archive_and_add_subtitle_files(link, language, video) + + return subtitles + + def list_subtitles(self, video, languages): + return [s for l in languages for s in self.query(l, video)] + + def download_subtitle(self, subtitle): + pass + + def process_archive_subtitle_files(self, archiveStream, language, video): + subtitles = [] + type = 'episode' if isinstance(video, Episode) else 'movie' + for file_name in archiveStream.namelist(): + if file_name.lower().endswith(('.srt', 
'.sub')): + logger.info('Found subtitle file %r', file_name) + subtitle = SubsSabBzSubtitle(language, file_name, type) + subtitle.content = archiveStream.read(file_name) + subtitles.append(subtitle) + return subtitles + + def download_archive_and_add_subtitle_files(self, link, language, video ): + logger.info('Downloading subtitle %r', link) + request = self.session.get(link, headers={ + 'Referer': 'http://subs.sab.bz/index.php?' + }) + request.raise_for_status() + + archive_stream = io.BytesIO(request.content) + if is_rarfile(archive_stream): + return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) + elif is_zipfile(archive_stream): + return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) + else: + raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/subsunacs.py b/libs/subliminal_patch/providers/subsunacs.py new file mode 100644 index 000000000..bbc41f520 --- /dev/null +++ b/libs/subliminal_patch/providers/subsunacs.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +import logging +import re +import io +import os +from random import randint +from bs4 import BeautifulSoup +from zipfile import ZipFile, is_zipfile +from rarfile import RarFile, is_rarfile +from requests import Session +from guessit import guessit +from subliminal_patch.providers import Provider +from subliminal_patch.subtitle import Subtitle +from subliminal_patch.utils import sanitize +from subliminal.exceptions import ProviderError +from subliminal.utils import sanitize_release_group +from subliminal.subtitle import guess_matches +from subliminal.video import Episode, Movie +from subliminal.subtitle import fix_line_ending +from subzero.language import Language +from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST + +logger = logging.getLogger(__name__) + +class SubsUnacsSubtitle(Subtitle): + """SubsUnacs Subtitle.""" + provider_name = 'subsunacs' + + def __init__(self, langauge, filename, type): + super(SubsUnacsSubtitle, self).__init__(langauge) + self.langauge = langauge + self.filename = filename + self.type = type + + @property + def id(self): + return self.filename + + def get_matches(self, video): + matches = set() + + video_filename = video.name + video_filename = os.path.basename(video_filename) + video_filename, _ = os.path.splitext(video_filename) + video_filename = sanitize_release_group(video_filename) + + subtitle_filename = self.filename + subtitle_filename = os.path.basename(subtitle_filename) + subtitle_filename, _ = os.path.splitext(subtitle_filename) + subtitle_filename = sanitize_release_group(subtitle_filename) + + if video_filename == subtitle_filename: + matches.add('hash') + + matches |= guess_matches(video, guessit(self.filename, {'type': self.type})) + + matches.add(id(self)) + return matches + + +class SubsUnacsProvider(Provider): + """SubsUnacs Provider.""" + languages = {Language('por', 'BR')} | {Language(l) for l in [ + 'bul', 'eng' + ]} + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + self.session.headers["Accept-Language"] = "en-US,en;q=0.5" + self.session.headers["Accept-Encoding"] = "gzip, deflate, br" + self.session.headers["DNT"] = "1" + self.session.headers["Connection"] = "keep-alive" + self.session.headers["Upgrade-Insecure-Requests"] = "1" + self.session.headers["Cache-Control"] = "max-age=0" + 
+ def terminate(self): + self.session.close() + + def query(self, language, video): + subtitles = [] + isEpisode = isinstance(video, Episode) + + params = { + 'm': '', + 'l': 0, + 'c': '', + 'y': '', + 'action': " Търси ", + 'a': '', + 'd': '', + 'u': '', + 'g': '', + 't': '', + 'imdbcheck': 1} + + if isEpisode: + params['m'] = "%s %02d %02d" % (sanitize(video.series), video.season, video.episode) + else: + params['y'] = video.year + params['m'] = (video.title) + + if language == 'en' or language == 'eng': + params['l'] = 1 + + logger.info('Searching subtitle %r', params) + response = self.session.post('https://subsunacs.net/search.php', params=params, allow_redirects=False, timeout=10, headers={ + 'Referer': 'https://subsunacs.net/index.php', + }) + + response.raise_for_status() + + if response.status_code != 200: + logger.debug('No subtitles found') + return subtitles + + soup = BeautifulSoup(response.content, 'html.parser') + rows = soup.findAll('td', {'class': 'tdMovie'}) + + # Search on first 10 rows only + for row in rows[:10]: + element = row.find('a', {'class': 'tooltip'}) + if element: + link = element.get('href') + logger.info('Found subtitle link %r', link) + subtitles = subtitles + self.download_archive_and_add_subtitle_files('https://subsunacs.net' + link, language, video) + + return subtitles + + def list_subtitles(self, video, languages): + return [s for l in languages for s in self.query(l, video)] + + def download_subtitle(self, subtitle): + pass + + def process_archive_subtitle_files(self, archiveStream, language, video): + subtitles = [] + type = 'episode' if isinstance(video, Episode) else 'movie' + for file_name in archiveStream.namelist(): + if file_name.lower().endswith(('.srt', '.sub')): + logger.info('Found subtitle file %r', file_name) + subtitle = SubsUnacsSubtitle(language, file_name, type) + subtitle.content = archiveStream.read(file_name) + subtitles.append(subtitle) + return subtitles + + def download_archive_and_add_subtitle_files(self, link, language, video ): + logger.info('Downloading subtitle %r', link) + request = self.session.get(link, headers={ + 'Referer': 'https://subsunacs.net/search.php' + }) + request.raise_for_status() + + archive_stream = io.BytesIO(request.content) + if is_rarfile(archive_stream): + return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) + elif is_zipfile(archive_stream): + return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) + else: + raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/subz.py b/libs/subliminal_patch/providers/subz.py new file mode 100644 index 000000000..dc95cb8d7 --- /dev/null +++ b/libs/subliminal_patch/providers/subz.py @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- +import io +import json +import logging +import os + +import rarfile +import re +import zipfile + +from subzero.language import Language +from guessit import guessit +from requests import Session +from six import text_type + +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal import __short_version__ +from subliminal.cache import SHOW_EXPIRATION_TIME, region +from subliminal.score import get_equivalent_release_groups +from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.utils import sanitize, sanitize_release_group +from subliminal.video import Episode, Movie + +logger = logging.getLogger(__name__) + +episode_re = re.compile(r'^S(\d{2})E(\d{2})$') + + +class 
SubzSubtitle(Subtitle): + """Subz Subtitle.""" + provider_name = 'subz' + + def __init__(self, language, page_link, series, season, episode, title, year, version, download_link): + super(SubzSubtitle, self).__init__(language, page_link=page_link) + self.series = series + self.season = season + self.episode = episode + self.title = title + self.year = year + self.version = version + self.download_link = download_link + self.hearing_impaired = None + self.encoding = 'windows-1253' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + video_type = None + + # episode + if isinstance(video, Episode): + video_type = 'episode' + # series name + if video.series and sanitize(self.series) in ( + sanitize(name) for name in [video.series] + video.alternative_series): + matches.add('series') + # season + if video.season and self.season == video.season: + matches.add('season') + # episode + if video.episode and self.episode == video.episode: + matches.add('episode') + # title of the episode + if video.title and sanitize(self.title) == sanitize(video.title): + matches.add('title') + # year + if video.original_series and self.year is None or video.year and video.year == self.year: + matches.add('year') + # movie + elif isinstance(video, Movie): + video_type = 'movie' + # title + if video.title and (sanitize(self.title) in ( + sanitize(name) for name in [video.title] + video.alternative_titles)): + matches.add('title') + # year + if video.year and self.year == video.year: + matches.add('year') + + # release_group + if (video.release_group and self.version and + any(r in sanitize_release_group(self.version) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): + matches.add('release_group') + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': video_type}), partial=True) + + return matches + + +class SubzProvider(Provider): + """Subz Provider.""" + languages = {Language(l) for l in ['ell']} + server_url = 'https://subz.xyz' + sign_in_url = '/sessions' + sign_out_url = '/logout' + search_url = '/typeahead/{}' + episode_link = '/series/{show_id}/seasons/{season:d}/episodes/{episode:d}' + movie_link = '/movies/{}' + subtitle_class = SubzSubtitle + + def __init__(self): + self.logged_in = False + self.session = None + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + def terminate(self): + self.session.close() + + def get_show_ids(self, title, year=None, is_episode=True, country_code=None): + """Get the best matching show id for `series`, `year` and `country_code`. + + First search in the result of :meth:`_get_show_suggestions`. + + :param title: show title. + :param year: year of the show, if any. + :type year: int + :param is_episode: if the search is for episode. + :type is_episode: bool + :param country_code: country code of the show, if any. + :type country_code: str + :return: the show id, if found. 
+ :rtype: str + + """ + title_sanitized = sanitize(title).lower() + show_ids = self._get_suggestions(title, is_episode) + + matched_show_ids = [] + for show in show_ids: + show_id = None + # attempt with country + if not show_id and country_code: + logger.debug('Getting show id with country') + if sanitize(show['title']) == text_type('{title} {country}').format(title=title_sanitized, + country=country_code.lower()): + show_id = show['link'].split('/')[-1] + + # attempt with year + if not show_id and year: + logger.debug('Getting show id with year') + if sanitize(show['title']) == text_type('{title} {year}').format(title=title_sanitized, year=year): + show_id = show['link'].split('/')[-1] + + # attempt clean + if not show_id: + logger.debug('Getting show id') + show_id = show['link'].split('/')[-1] if sanitize(show['title']) == title_sanitized else None + + if show_id: + matched_show_ids.append(show_id) + + return matched_show_ids + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, to_str=text_type, + should_cache_fn=lambda value: value) + def _get_suggestions(self, title, is_episode=True): + """Search the show or movie id from the `title` and `year`. + + :param str title: title of the show. + :param is_episode: if the search is for episode. + :type is_episode: bool + :return: the show suggestions found. + :rtype: dict + + """ + # make the search + logger.info('Searching show ids with %r', title) + r = self.session.get(self.server_url + text_type(self.search_url).format(title), timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return {} + + show_type = 'series' if is_episode else 'movie' + parsed_suggestions = [s for s in json.loads(r.text) if 'type' in s and s['type'] == show_type] + logger.debug('Found suggestions: %r', parsed_suggestions) + + return parsed_suggestions + + def query(self, show_id, series, season, episode, title): + # get the season list of the show + logger.info('Getting the subtitle list of show id %s', show_id) + is_episode = False + if all((show_id, season, episode)): + is_episode = True + page_link = self.server_url + self.episode_link.format(show_id=show_id, season=season, episode=episode) + elif all((show_id, title)): + page_link = self.server_url + self.movie_link.format(show_id) + else: + return [] + + r = self.session.get(page_link, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + year_num = None + if not is_episode: + year_num = int(soup.select_one('span.year').text) + show_title = str(soup.select_one('#summary-wrapper > div.summary h1').contents[0]).strip() + + subtitles = [] + # loop over episode rows + for subtitle in soup.select('div[id="subtitles"] tr[data-id]'): + # read common info + version = subtitle.find('td', {'class': 'name'}).text + download_link = subtitle.find('a', {'class': 'btn-success'})['href'].strip('\'') + + # read the episode info + if is_episode: + episode_numbers = soup.select_one('#summary-wrapper > div.container.summary span.main-title-sxe').text + season_num = None + episode_num = None + matches = episode_re.match(episode_numbers.strip()) + if matches: + season_num = int(matches.group(1)) + episode_num = int(matches.group(2)) + + episode_title = soup.select_one('#summary-wrapper > div.container.summary span.main-title').text + + subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, show_title, season_num, + 
episode_num, episode_title, year_num, version, download_link) + # read the movie info + else: + subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, None, None, None, show_title, + year_num, version, download_link) + + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return subtitles + + def list_subtitles(self, video, languages): + # lookup show_id + if isinstance(video, Episode): + titles = [video.series] + video.alternative_series + elif isinstance(video, Movie): + titles = [video.title] + video.alternative_titles + else: + titles = [] + + show_ids = None + for title in titles: + show_ids = self.get_show_ids(title, video.year, isinstance(video, Episode)) + if show_ids is not None and len(show_ids) > 0: + break + + subtitles = [] + # query for subtitles with the show_id + for show_id in show_ids: + if isinstance(video, Episode): + subtitles += [s for s in self.query(show_id, video.series, video.season, video.episode, video.title) + if s.language in languages and s.season == video.season and s.episode == video.episode] + elif isinstance(video, Movie): + subtitles += [s for s in self.query(show_id, None, None, None, video.title) + if s.language in languages and s.year == video.year] + + return subtitles + + def download_subtitle(self, subtitle): + if isinstance(subtitle, SubzSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + archive = _get_archive(r.content) + + subtitle_content = _get_subtitle_from_archive(archive) + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug('Could not extract subtitle from %r', archive) + + +def _get_archive(content): + # open the archive + archive_stream = io.BytesIO(content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive = rarfile.RarFile(archive_stream) + elif zipfile.is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive = zipfile.ZipFile(archive_stream) + + return archive + + +def _get_subtitle_from_archive(archive): + for name in archive.namelist(): + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + return archive.read(name) + + return None diff --git a/libs/subliminal_patch/providers/titlovi.py b/libs/subliminal_patch/providers/titlovi.py index ec339fef8..860932ca5 100644 --- a/libs/subliminal_patch/providers/titlovi.py +++ b/libs/subliminal_patch/providers/titlovi.py @@ -4,6 +4,7 @@ import io import logging import math import re +import time import rarfile @@ -23,6 +24,7 @@ from subliminal.utils import sanitize_release_group from subliminal.subtitle import guess_matches from subliminal.video import Episode, Movie from subliminal.subtitle import fix_line_ending +from subliminal_patch.pitcher import pitchers, load_verification, store_verification from subzero.language import Language from random import randint @@ -142,6 +144,7 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): logger.debug('User-Agent set to %s', self.session.headers['User-Agent']) self.session.headers['Referer'] = self.server_url logger.debug('Referer set to %s', self.session.headers['Referer']) + 
load_verification("titlovi", self.session) def terminate(self): self.session.close() @@ -182,110 +185,144 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): r = self.session.get(self.search_url, params=params, timeout=10) r.raise_for_status() except RequestException as e: - logger.exception('RequestException %s', e) - break - - try: - soup = BeautifulSoup(r.content, 'lxml') - - # number of results - result_count = int(soup.select_one('.results_count b').string) - except: - result_count = None - - # exit if no results - if not result_count: - if not subtitles: - logger.debug('No subtitles found') - else: - logger.debug("No more subtitles found") - break - - # number of pages with results - pages = int(math.ceil(result_count / float(items_per_page))) - - # get current page - if 'pg' in params: - current_page = int(params['pg']) - - try: - sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit') - for sub in sublist: - # subtitle id - sid = sub.find(attrs={'data-id': True}).attrs['data-id'] - # get download link - download_link = self.download_url + sid - # title and alternate title - match = title_re.search(sub.a.string) - if match: - _title = match.group('title') - alt_title = match.group('altitle') + captcha_passed = False + if e.response.status_code == 403 and "data-sitekey" in e.response.content: + logger.info('titlovi: Solving captcha. This might take a couple of minutes, but should only ' + 'happen once every so often') + + site_key = re.search(r'data-sitekey="(.+?)"', e.response.content).group(1) + challenge_s = re.search(r'type="hidden" name="s" value="(.+?)"', e.response.content).group(1) + challenge_ray = re.search(r'data-ray="(.+?)"', e.response.content).group(1) + if not all([site_key, challenge_s, challenge_ray]): + raise Exception("titlovi: Captcha site-key not found!") + + pitcher = pitchers.get_pitcher()("titlovi", e.request.url, site_key, + user_agent=self.session.headers["User-Agent"], + cookies=self.session.cookies.get_dict(), + is_invisible=True) + + result = pitcher.throw() + if not result: + raise Exception("titlovi: Couldn't solve captcha!") + + s_params = { + "s": challenge_s, + "id": challenge_ray, + "g-recaptcha-response": result, + } + r = self.session.get(self.server_url + "/cdn-cgi/l/chk_captcha", params=s_params, timeout=10, + allow_redirects=False) + r.raise_for_status() + r = self.session.get(self.search_url, params=params, timeout=10) + r.raise_for_status() + store_verification("titlovi", self.session) + captcha_passed = True + + if not captcha_passed: + logger.exception('RequestException %s', e) + break + else: + try: + soup = BeautifulSoup(r.content, 'lxml') + + # number of results + result_count = int(soup.select_one('.results_count b').string) + except: + result_count = None + + # exit if no results + if not result_count: + if not subtitles: + logger.debug('No subtitles found') else: - continue - - # page link - page_link = self.server_url + sub.a.attrs['href'] - # subtitle language - match = lang_re.search(sub.select_one('.lang').attrs['src']) - if match: - try: - # decode language - lang = Language.fromtitlovi(match.group('lang')+match.group('script')) - except ValueError: + logger.debug("No more subtitles found") + break + + # number of pages with results + pages = int(math.ceil(result_count / float(items_per_page))) + + # get current page + if 'pg' in params: + current_page = int(params['pg']) + + try: + sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit') + for sub in sublist: 
+ # subtitle id + sid = sub.find(attrs={'data-id': True}).attrs['data-id'] + # get download link + download_link = self.download_url + sid + # title and alternate title + match = title_re.search(sub.a.string) + if match: + _title = match.group('title') + alt_title = match.group('altitle') + else: continue - # relase year or series start year - match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string) - if match: - r_year = int(match.group('year')) - # fps - match = fps_re.search(sub.select_one('.fps').string) - if match: - fps = match.group('fps') - # releases - releases = str(sub.select_one('.fps').parent.contents[0].string) - - # handle movies and series separately - if is_episode: - # season and episode info - sxe = sub.select_one('.s0xe0y').string - r_season = None - r_episode = None - if sxe: - match = season_re.search(sxe) - if match: - r_season = int(match.group('season')) - match = episode_re.search(sxe) - if match: - r_episode = int(match.group('episode')) - - subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, - alt_title=alt_title, season=r_season, episode=r_episode, - year=r_year, fps=fps, - asked_for_release_group=video.release_group, - asked_for_episode=episode) - else: - subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, - alt_title=alt_title, year=r_year, fps=fps, - asked_for_release_group=video.release_group) - logger.debug('Found subtitle %r', subtitle) - - # prime our matches so we can use the values later - subtitle.get_matches(video) - - # add found subtitles - subtitles.append(subtitle) - - finally: - soup.decompose() - - # stop on last page - if current_page >= pages: - break - - # increment current page - params['pg'] = current_page + 1 - logger.debug('Getting page %d', params['pg']) + # page link + page_link = self.server_url + sub.a.attrs['href'] + # subtitle language + match = lang_re.search(sub.select_one('.lang').attrs['src']) + if match: + try: + # decode language + lang = Language.fromtitlovi(match.group('lang')+match.group('script')) + except ValueError: + continue + + # relase year or series start year + match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string) + if match: + r_year = int(match.group('year')) + # fps + match = fps_re.search(sub.select_one('.fps').string) + if match: + fps = match.group('fps') + # releases + releases = str(sub.select_one('.fps').parent.contents[0].string) + + # handle movies and series separately + if is_episode: + # season and episode info + sxe = sub.select_one('.s0xe0y').string + r_season = None + r_episode = None + if sxe: + match = season_re.search(sxe) + if match: + r_season = int(match.group('season')) + match = episode_re.search(sxe) + if match: + r_episode = int(match.group('episode')) + + subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, + alt_title=alt_title, season=r_season, episode=r_episode, + year=r_year, fps=fps, + asked_for_release_group=video.release_group, + asked_for_episode=episode) + else: + subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, + alt_title=alt_title, year=r_year, fps=fps, + asked_for_release_group=video.release_group) + logger.debug('Found subtitle %r', subtitle) + + # prime our matches so we can use the values later + subtitle.get_matches(video) + + # add found subtitles + subtitles.append(subtitle) + + finally: + soup.decompose() + + # stop on last page + if current_page >= pages: + break + + # increment current page + 
params['pg'] = current_page + 1 + logger.debug('Getting page %d', params['pg']) return subtitles diff --git a/libs/subliminal_patch/providers/xsubs.py b/libs/subliminal_patch/providers/xsubs.py new file mode 100644 index 000000000..102571dd9 --- /dev/null +++ b/libs/subliminal_patch/providers/xsubs.py @@ -0,0 +1,302 @@ +# -*- coding: utf-8 -*- +import logging +import re + +from subzero.language import Language +from guessit import guessit +from requests import Session + +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal import __short_version__ +from subliminal.cache import SHOW_EXPIRATION_TIME, region +from subliminal.exceptions import AuthenticationError, ConfigurationError +from subliminal.score import get_equivalent_release_groups +from subliminal.subtitle import Subtitle, fix_line_ending, guess_matches +from subliminal.utils import sanitize, sanitize_release_group +from subliminal.video import Episode + +logger = logging.getLogger(__name__) +article_re = re.compile(r'^([A-Za-z]{1,3}) (.*)$') + + +class XSubsSubtitle(Subtitle): + """XSubs Subtitle.""" + provider_name = 'xsubs' + + def __init__(self, language, page_link, series, season, episode, year, title, version, download_link): + super(XSubsSubtitle, self).__init__(language, page_link=page_link) + self.series = series + self.season = season + self.episode = episode + self.year = year + self.title = title + self.version = version + self.download_link = download_link + self.hearing_impaired = None + self.encoding = 'windows-1253' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + if isinstance(video, Episode): + # series name + if video.series and sanitize(self.series) in ( + sanitize(name) for name in [video.series] + video.alternative_series): + matches.add('series') + # season + if video.season and self.season == video.season: + matches.add('season') + # episode + if video.episode and self.episode == video.episode: + matches.add('episode') + # title of the episode + if video.title and sanitize(self.title) == sanitize(video.title): + matches.add('title') + # year + if video.original_series and self.year is None or video.year and video.year == self.year: + matches.add('year') + # release_group + if (video.release_group and self.version and + any(r in sanitize_release_group(self.version) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): + matches.add('release_group') + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) + + return matches + + +class XSubsProvider(Provider): + """XSubs Provider.""" + languages = {Language(l) for l in ['ell']} + video_types = (Episode,) + server_url = 'http://xsubs.tv' + sign_in_url = '/xforum/account/signin/' + sign_out_url = '/xforum/account/signout/' + all_series_url = '/series/all.xml' + series_url = '/series/{:d}/main.xml' + season_url = '/series/{show_id:d}/{season:d}.xml' + page_link = '/ice/xsw.xml?srsid={show_id:d}#{season_id:d};{season:d}' + download_link = '/xthru/getsub/{:d}' + subtitle_class = XSubsSubtitle + + def __init__(self, username=None, password=None): + if any((username, password)) and not all((username, password)): + raise ConfigurationError('Username and password must be specified') + + self.username = username + self.password = password + self.logged_in = False + self.session = None + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 
'Subliminal/{}'.format(__short_version__) + + # login + if self.username and self.password: + logger.info('Logging in') + self.session.get(self.server_url + self.sign_in_url) + data = {'username': self.username, + 'password': self.password, + 'csrfmiddlewaretoken': self.session.cookies['csrftoken']} + r = self.session.post(self.server_url + self.sign_in_url, data, allow_redirects=False, timeout=10) + + if r.status_code != 302: + raise AuthenticationError(self.username) + + logger.debug('Logged in') + self.logged_in = True + + def terminate(self): + # logout + if self.logged_in: + logger.info('Logging out') + r = self.session.get(self.server_url + self.sign_out_url, timeout=10) + r.raise_for_status() + logger.debug('Logged out') + self.logged_in = False + + self.session.close() + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value) + def _get_show_ids(self): + # get the shows page + logger.info('Getting show ids') + r = self.session.get(self.server_url + self.all_series_url, timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # populate the show ids + show_ids = {} + for show_category in soup.findAll('seriesl'): + if show_category.attrs['category'] == u'Σειρές': + for show in show_category.findAll('series'): + show_ids[sanitize(show.text)] = int(show['srsid']) + break + logger.debug('Found %d show ids', len(show_ids)) + + return show_ids + + def get_show_id(self, series_names, year=None, country_code=None): + series_sanitized_names = [] + for name in series_names: + sanitized_name = sanitize(name) + series_sanitized_names.append(sanitized_name) + alternative_name = _get_alternative_name(sanitized_name) + if alternative_name: + series_sanitized_names.append(alternative_name) + + show_ids = self._get_show_ids() + show_id = None + + for series_sanitized in series_sanitized_names: + # attempt with country + if not show_id and country_code: + logger.debug('Getting show id with country') + show_id = show_ids.get('{series} {country}'.format(series=series_sanitized, + country=country_code.lower())) + + # attempt with year + if not show_id and year: + logger.debug('Getting show id with year') + show_id = show_ids.get('{series} {year:d}'.format(series=series_sanitized, year=year)) + + # attempt with article at the end + if not show_id and year: + logger.debug('Getting show id with year in brackets') + show_id = show_ids.get('{series} [{year:d}]'.format(series=series_sanitized, year=year)) + + # attempt clean + if not show_id: + logger.debug('Getting show id') + show_id = show_ids.get(series_sanitized) + + if show_id: + break + + return int(show_id) if show_id else None + + def query(self, show_id, series, season, year=None, country=None): + # get the season list of the show + logger.info('Getting the season list of show id %d', show_id) + r = self.session.get(self.server_url + self.series_url.format(show_id), timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + series_title = soup.find('name').text + + # loop over season rows + seasons = soup.findAll('series_group') + season_id = None + + for season_row in seasons: + try: + parsed_season = int(season_row['ssnnum']) + if parsed_season == season: + season_id = int(season_row['ssnid']) + break + except (ValueError, TypeError): + 
continue + + if season_id is None: + logger.debug('Season not found in provider') + return [] + + # get the subtitle list of the season + logger.info('Getting the subtitle list of season %d', season) + r = self.session.get(self.server_url + self.season_url.format(show_id=show_id, season=season_id), timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + subtitles = [] + # loop over episode rows + for episode in soup.findAll('subg'): + # read the episode info + etitle = episode.find('etitle') + if etitle is None: + continue + + episode_num = int(etitle['number'].split('-')[0]) + + sgt = episode.find('sgt') + if sgt is None: + continue + + season_num = int(sgt['ssnnum']) + + # filter out unreleased subtitles + for subtitle in episode.findAll('sr'): + if subtitle['published_on'] == '': + continue + + page_link = self.server_url + self.page_link.format(show_id=show_id, season_id=season_id, + season=season_num) + episode_title = etitle['title'] + version = subtitle.fmt.text + ' ' + subtitle.team.text + download_link = self.server_url + self.download_link.format(int(subtitle['rlsid'])) + + subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link, series_title, season_num, + episode_num, year, episode_title, version, download_link) + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return subtitles + + def list_subtitles(self, video, languages): + if isinstance(video, Episode): + # lookup show_id + titles = [video.series] + video.alternative_series + show_id = self.get_show_id(titles, video.year) + + # query for subtitles with the show_id + if show_id: + subtitles = [s for s in self.query(show_id, video.series, video.season, video.year) + if s.language in languages and s.season == video.season and s.episode == video.episode] + if subtitles: + return subtitles + else: + logger.error('No show id found for %r (%r)', video.series, {'year': video.year}) + + return [] + + def download_subtitle(self, subtitle): + if isinstance(subtitle, XSubsSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, + timeout=10) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + subtitle.content = fix_line_ending(r.content) + + +def _get_alternative_name(series): + article_match = article_re.match(series) + if article_match: + return '{series} {article}'.format(series=article_match.group(2), article=article_match.group(1)) + + return None From a619bbbe99c5a4530ed06d0613fd3deac97fb77a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 21:27:41 -0400 Subject: [PATCH 15/19] Continuing development. 
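This patch wires the new anti-captcha settings into the ANTICAPTCHA_CLASS and
ANTICAPTCHA_ACCOUNT_KEY environment variables consumed by subliminal_patch's
pitcher module, which the titlovi Cloudflare handling earlier in this series
relies on. Roughly, the consumer side resolves them like the sketch below
(illustrative only -- the helper name is ours, and the real resolution lives
in subliminal_patch/pitcher.py, which is not part of this series):

    import os

    def resolve_solver():
        # ANTICAPTCHA_CLASS names the solver implementation; an empty or
        # unset value means no captcha solver is configured.
        cls_name = os.environ.get("ANTICAPTCHA_CLASS") or None
        # ANTICAPTCHA_ACCOUNT_KEY carries the account secret; for Death by
        # Captcha this series encodes it as one "username:password" value.
        account_key = os.environ.get("ANTICAPTCHA_ACCOUNT_KEY", "")
        return cls_name, account_key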
--- bazarr/init.py | 10 ++++++++++ views/settings.tpl | 22 +++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/bazarr/init.py b/bazarr/init.py index eb3af0ce3..09d4ad639 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -17,6 +17,16 @@ from get_args import args # set subliminal_patch user agent os.environ["SZ_USER_AGENT"] = "Bazarr/1" +# set anti-captcha provider and key +if settings.general.anti_captcha_provider == 'anti-captcha': + os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess' + os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key +elif settings.general.anti_captcha_provider == 'AntiCaptchaProxyLessPitcher': + os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess' + os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join(settings.deathbycaptcha.username, settings.deathbycaptcha.password) +else: + os.environ["ANTICAPTCHA_CLASS"] = None + # Check if args.config_dir exist if not os.path.exists(args.config_dir): # Create config_dir directory tree diff --git a/views/settings.tpl b/views/settings.tpl index d3af01107..37c51c674 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -1256,7 +1256,17 @@
                        [views/settings.tpl markup lost in extraction: this hunk
                        appears to add the anti-captcha provider dropdown and the
                        Anti-Captcha account key row to the settings form]
@@ -1268,6 +1278,16 @@
                        [views/settings.tpl markup lost in extraction: this hunk
                        appears to add the Death by Captcha username and password
                        rows that save_settings() reads back in PATCH 19]
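[note on the init.py hunk above: os.environ values must be strings, so
assigning None raises TypeError at startup, and str.join() takes a single
iterable rather than two positional arguments -- both issues resurface and
are patched in PATCH 17 and PATCH 18 below]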
From 7cc28a1da846b79c65458c9b48ede05dbb95be1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 21:30:54 -0400 Subject: [PATCH 16/19] Continuing development. --- views/settings.tpl | 2 +- views/wizard.tpl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/views/settings.tpl b/views/settings.tpl index 37c51c674..909b976c4 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -1266,7 +1266,7 @@
                        [views/settings.tpl markup lost in extraction: per the
                        diffstat, a one-line change to the anti-captcha provider
                        dropdown added in PATCH 15]

diff --git a/views/wizard.tpl b/views/wizard.tpl
index 49a215431..6619863d9 100644
--- a/views/wizard.tpl
+++ b/views/wizard.tpl
@@ -414,7 +414,7 @@
                        [views/wizard.tpl markup lost in extraction: one-line
                        change mirroring the settings.tpl dropdown]
@@ -884,7 +884,7 @@
                        [views/wizard.tpl markup lost in extraction: one-line
                        change mirroring the settings.tpl dropdown]
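For context, the provider selected here drives the Cloudflare challenge
handling added to the titlovi provider earlier in this series. Condensed to
its happy path, that round-trip looks roughly like the following sketch
(helper name and signature are ours; error handling and the retry of the
original search request are trimmed -- see the full titlovi diff for the
real flow, which, like the rest of this era of Bazarr, assumes Python 2
str/bytes semantics):

    import re
    from subliminal_patch.pitcher import pitchers, store_verification

    def solve_cloudflare(session, server_url, blocked_response):
        page = blocked_response.content
        # the 403 challenge page carries the reCAPTCHA site key plus the
        # Cloudflare "s" and "ray" tokens needed to submit a solution
        site_key = re.search(r'data-sitekey="(.+?)"', page).group(1)
        challenge_s = re.search(r'type="hidden" name="s" value="(.+?)"', page).group(1)
        challenge_ray = re.search(r'data-ray="(.+?)"', page).group(1)

        pitcher = pitchers.get_pitcher()("titlovi", blocked_response.request.url, site_key,
                                         user_agent=session.headers["User-Agent"],
                                         cookies=session.cookies.get_dict(),
                                         is_invisible=True)
        token = pitcher.throw()  # blocks until the solving service answers

        # hand the solution back to Cloudflare, then persist the clearance
        # cookies so the next run can skip the challenge entirely
        session.get(server_url + "/cdn-cgi/l/chk_captcha",
                    params={"s": challenge_s, "id": challenge_ray,
                            "g-recaptcha-response": token},
                    allow_redirects=False, timeout=10)
        store_verification("titlovi", session)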
From 7314f34eb259f46e01e7ef6e2d1dcc299772b35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 22:20:59 -0400 Subject: [PATCH 17/19] Continuing development. --- bazarr/init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bazarr/init.py b/bazarr/init.py index 09d4ad639..dacc3d6ee 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -25,7 +25,7 @@ elif settings.general.anti_captcha_provider == 'AntiCaptchaProxyLessPitcher': os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess' os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join(settings.deathbycaptcha.username, settings.deathbycaptcha.password) else: - os.environ["ANTICAPTCHA_CLASS"] = None + os.environ["ANTICAPTCHA_CLASS"] = '' # Check if args.config_dir exist if not os.path.exists(args.config_dir): From e1b8d0589e9129ba12d6230182976cfb105248ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 22:25:19 -0400 Subject: [PATCH 18/19] Continuing development. --- bazarr/init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bazarr/init.py b/bazarr/init.py index dacc3d6ee..284159c0a 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -23,7 +23,7 @@ if settings.general.anti_captcha_provider == 'anti-captcha': os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key elif settings.general.anti_captcha_provider == 'AntiCaptchaProxyLessPitcher': os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess' - os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join(settings.deathbycaptcha.username, settings.deathbycaptcha.password) + os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join({settings.deathbycaptcha.username, settings.deathbycaptcha.password}) else: os.environ["ANTICAPTCHA_CLASS"] = '' From d7bfa7a43849c1a087a954c08bd690c55b96213d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Sun, 7 Apr 2019 22:29:11 -0400 Subject: [PATCH 19/19] Continuing development. --- bazarr/main.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bazarr/main.py b/bazarr/main.py index 90e5d7931..435627082 100644 --- a/bazarr/main.py +++ b/bazarr/main.py @@ -1314,6 +1314,18 @@ def save_settings(): settings.anticaptcha.anti_captcha_key = text_type(settings_anti_captcha_key) settings.deathbycaptcha.username = text_type(settings_death_by_captcha_username) settings.deathbycaptcha.password = text_type(settings_death_by_captcha_password) + + # set anti-captcha provider and key + if settings.general.anti_captcha_provider == 'anti-captcha': + os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess' + os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key + elif settings.general.anti_captcha_provider == 'AntiCaptchaProxyLessPitcher': + os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess' + os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join( + {settings.deathbycaptcha.username, settings.deathbycaptcha.password}) + else: + os.environ["ANTICAPTCHA_CLASS"] = '' + settings.general.minimum_score_movie = text_type(settings_general_minimum_score_movies) settings.general.use_embedded_subs = text_type(settings_general_embedded) settings.general.adaptive_searching = text_type(settings_general_adaptive_searching)
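[closing note on PATCH 18/19: ':'.join({username, password}) builds the key
from a set, which is unordered and de-duplicating -- the result can come out
as "password:username", or collapse to a single element when the two values
happen to be equal. A tuple keeps the order deterministic, and since init.py
and save_settings() now carry the same mapping twice, it is a natural
candidate for a shared helper, sketched below (helper name is illustrative;
the comparison value 'AntiCaptchaProxyLessPitcher' is kept verbatim from the
series even though it reads like a class name rather than a provider name):]

    import os

    def set_anticaptcha_env(settings):
        # single home for the mapping duplicated in bazarr/init.py and in
        # bazarr/main.py:save_settings() across PATCH 15-19
        provider = settings.general.anti_captcha_provider
        if provider == 'anti-captcha':
            os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess'
            os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key
        elif provider == 'AntiCaptchaProxyLessPitcher':
            os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess'
            # a tuple, not a set: keeps username before password
            os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join((settings.deathbycaptcha.username,
                                                              settings.deathbycaptcha.password))
        else:
            # environment values must be strings, hence '' rather than None
            os.environ["ANTICAPTCHA_CLASS"] = ''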