diff --git a/bazarr/config.py b/bazarr/config.py
index 2d3405f04..07a8d6965 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -41,7 +41,8 @@ defaults = {
         'subfolder_custom': '',
         'upgrade_subs': 'True',
         'days_to_upgrade_subs': '7',
-        'upgrade_manual': 'True'
+        'upgrade_manual': 'True',
+        'anti_captcha_provider': 'None'
     },
     'auth': {
         'type': 'None',
@@ -98,7 +99,15 @@ defaults = {
     },
     'assrt': {
         'token': ''
-    }}
+    },
+    'anticaptcha': {
+        'anti_captcha_key': ''
+    },
+    'deathbycaptcha': {
+        'username': '',
+        'password': ''
+    }
+}
 
 settings = simpleconfigparser(defaults=defaults)
 settings.read(os.path.join(args.config_dir, 'config', 'config.ini'))
diff --git a/bazarr/init.py b/bazarr/init.py
index eb3af0ce3..284159c0a 100644
--- a/bazarr/init.py
+++ b/bazarr/init.py
@@ -17,6 +17,16 @@ from get_args import args
 # set subliminal_patch user agent
 os.environ["SZ_USER_AGENT"] = "Bazarr/1"
 
+# set anti-captcha provider and key
+if settings.general.anti_captcha_provider == 'anti-captcha':
+    os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess'
+    os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key
+elif settings.general.anti_captcha_provider == 'death-by-captcha':
+    os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess'
+    os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join((settings.deathbycaptcha.username, settings.deathbycaptcha.password))
+else:
+    os.environ["ANTICAPTCHA_CLASS"] = ''
+
 # Check if args.config_dir exist
 if not os.path.exists(args.config_dir):
     # Create config_dir directory tree
diff --git a/bazarr/logger.py b/bazarr/logger.py
index 3507767bd..e95d2259f 100644
--- a/bazarr/logger.py
+++ b/bazarr/logger.py
@@ -67,6 +67,8 @@ def configure_logging(debug=False):
     fh.setFormatter(f)
     fh.addFilter(BlacklistFilter())
     fh.addFilter(PublicIPFilter())
+    fh.setLevel(log_level)
+    logger.addHandler(fh)
 
     if debug:
         logging.getLogger("apscheduler").setLevel(logging.DEBUG)
@@ -90,8 +92,7 @@ def configure_logging(debug=False):
         logging.getLogger("rebulk").setLevel(logging.WARNING)
         logging.getLogger("stevedore.extension").setLevel(logging.CRITICAL)
         logging.getLogger("geventwebsocket.handler").setLevel(logging.WARNING)
-    fh.setLevel(log_level)
-    logger.addHandler(fh)
+
 
 class MyFilter(logging.Filter):
diff --git a/bazarr/main.py b/bazarr/main.py
index ea3e4414a..326daf65b 100644
--- a/bazarr/main.py
+++ b/bazarr/main.py
@@ -602,17 +602,17 @@ def search_json(query):
     search_list = []
 
     if settings.general.getboolean('use_sonarr'):
-        c.execute("SELECT title, sonarrSeriesId FROM table_shows WHERE title LIKE ? ORDER BY title",
+        c.execute("SELECT title, sonarrSeriesId, year FROM table_shows WHERE title LIKE ? ORDER BY title",
                   ('%' + query + '%',))
         series = c.fetchall()
 
         for serie in series:
-            search_list.append(dict([('name', serie[0]), ('url', base_url + 'episodes/' + str(serie[1]))]))
+            search_list.append(dict([('name', serie[0] + ' (' + serie[2] + ')'), ('url', base_url + 'episodes/' + str(serie[1]))]))
 
     if settings.general.getboolean('use_radarr'):
-        c.execute("SELECT title, radarrId FROM table_movies WHERE title LIKE ? ORDER BY title", ('%' + query + '%',))
+        c.execute("SELECT title, radarrId, year FROM table_movies WHERE title LIKE ? ORDER BY title", ('%' + query + '%',))
         movies = c.fetchall()
 
         for movie in movies:
-            search_list.append(dict([('name', movie[0]), ('url', base_url + 'movie/' + str(movie[1]))]))
+            search_list.append(dict([('name', movie[0] + ' (' + movie[2] + ')'), ('url', base_url + 'movie/' + str(movie[1]))]))
 
     c.close()
     response.content_type = 'application/json'
@@ -1275,6 +1275,10 @@ def save_settings():
         settings_upgrade_manual = 'False'
     else:
         settings_upgrade_manual = 'True'
+    settings_anti_captcha_provider = request.forms.get('settings_anti_captcha_provider')
+    settings_anti_captcha_key = request.forms.get('settings_anti_captcha_key')
+    settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
+    settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
 
     before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
               unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
@@ -1306,6 +1310,22 @@ def save_settings():
     settings.general.upgrade_subs = text_type(settings_upgrade_subs)
     settings.general.days_to_upgrade_subs = text_type(settings_days_to_upgrade_subs)
     settings.general.upgrade_manual = text_type(settings_upgrade_manual)
+    settings.general.anti_captcha_provider = text_type(settings_anti_captcha_provider)
+    settings.anticaptcha.anti_captcha_key = text_type(settings_anti_captcha_key)
+    settings.deathbycaptcha.username = text_type(settings_death_by_captcha_username)
+    settings.deathbycaptcha.password = text_type(settings_death_by_captcha_password)
+
+    # set anti-captcha provider and key
+    if settings.general.anti_captcha_provider == 'anti-captcha':
+        os.environ["ANTICAPTCHA_CLASS"] = 'AntiCaptchaProxyLess'
+        os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = settings.anticaptcha.anti_captcha_key
+    elif settings.general.anti_captcha_provider == 'death-by-captcha':
+        os.environ["ANTICAPTCHA_CLASS"] = 'DBCProxyLess'
+        os.environ["ANTICAPTCHA_ACCOUNT_KEY"] = ':'.join(
+            (settings.deathbycaptcha.username, settings.deathbycaptcha.password))
+    else:
+        os.environ["ANTICAPTCHA_CLASS"] = ''
+
     settings.general.minimum_score_movie = text_type(settings_general_minimum_score_movies)
     settings.general.use_embedded_subs = text_type(settings_general_embedded)
     settings.general.adaptive_searching = text_type(settings_general_adaptive_searching)
diff --git a/libs/cfscrape.py b/libs/cfscrape.py
new file mode 100644
index 000000000..15986f03a
--- /dev/null
+++ b/libs/cfscrape.py
@@ -0,0 +1,279 @@
+import logging
+import random
+import time
+import re
+
+# based off of https://gist.github.com/doko-desuka/58d9212461f62583f8df9bc6387fade2
+# and https://github.com/Anorov/cloudflare-scrape
+# and https://github.com/VeNoMouS/cloudflare-scrape-js2py
+
+'''''''''
+Disables InsecureRequestWarning: Unverified HTTPS request is being made warnings.
+'''''''''
+''''''''' +import requests +from requests.packages.urllib3.exceptions import InsecureRequestWarning + +requests.packages.urllib3.disable_warnings(InsecureRequestWarning) +'''''' +from requests.sessions import Session +from copy import deepcopy + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse + +DEFAULT_USER_AGENTS = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36", + "Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" +] + +DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS) + +BUG_REPORT = ( + "Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a " + "bug report at https://github.com/Anorov/cloudflare-scrape/issues.") + + +class CloudflareScraper(Session): + def __init__(self, *args, **kwargs): + super(CloudflareScraper, self).__init__(*args, **kwargs) + + if "requests" in self.headers["User-Agent"]: + # Spoof Firefox on Linux if no custom User-Agent has been set + self.headers["User-Agent"] = random.choice(DEFAULT_USER_AGENTS) + + def request(self, method, url, *args, **kwargs): + resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) + + # Check if Cloudflare anti-bot is on + if (resp.status_code in (503, 429) + and resp.headers.get("Server", "").startswith("cloudflare") + and b"jschl_vc" in resp.content + and b"jschl_answer" in resp.content + ): + return self.solve_cf_challenge(resp, **kwargs) + + # Otherwise, no Cloudflare anti-bot detected + return resp + + def solve_cf_challenge(self, resp, **original_kwargs): + body = resp.text + parsed_url = urlparse(resp.url) + domain = parsed_url.netloc + submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) + + cloudflare_kwargs = deepcopy(original_kwargs) + params = cloudflare_kwargs.setdefault("params", {}) + headers = cloudflare_kwargs.setdefault("headers", {}) + headers["Referer"] = resp.url + + try: + cf_delay = float(re.search('submit.*?(\d+)', body, re.DOTALL).group(1)) / 1000.0 + + form_index = body.find('id="challenge-form"') + if form_index == -1: + raise Exception('CF form not found') + sub_body = body[form_index:] + + s_match = re.search('name="s" value="(.+?)"', sub_body) + if s_match: + params["s"] = s_match.group(1) # On older variants this parameter is absent. + params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', sub_body).group(1) + params["pass"] = re.search(r'name="pass" value="(.+?)"', sub_body).group(1) + + if body.find('id="cf-dn-', form_index) != -1: + extra_div_expression = re.search('id="cf-dn-.*?>(.+?)<', sub_body).group(1) + + # Initial value. + js_answer = self.cf_parse_expression( + re.search('setTimeout\(function\(.*?:(.*?)}', body, re.DOTALL).group(1) + ) + # Extract the arithmetic operations. 
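+            # Each line produced by the split below has the shape
+            # "<name><op>=<JSFuck expression>"; heading[-1] picks out the
+            # operator character in front of the '='.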
+            builder = re.search("challenge-form'\);\s*;(.*);a.value", body, re.DOTALL).group(1)
+            # Remove a function semicolon before splitting on semicolons, else it messes the order.
+            lines = builder.replace(' return +(p)}();', '', 1).split(';')
+
+            for line in lines:
+                if len(line) and '=' in line:
+                    heading, expression = line.split('=', 1)
+                    if 'eval(eval(atob' in expression:
+                        # Uses the expression in an external div.
+                        expression_value = self.cf_parse_expression(extra_div_expression)
+                    elif '(function(p' in expression:
+                        # Expression + domain sampling function.
+                        expression_value = self.cf_parse_expression(expression, domain)
+                    else:
+                        expression_value = self.cf_parse_expression(expression)
+                    js_answer = self.cf_arithmetic_op(heading[-1], js_answer, expression_value)
+
+            if '+ t.length' in body:
+                js_answer += len(domain)  # Only older variants add the domain length.
+
+            params["jschl_answer"] = '%.10f' % js_answer
+
+        except Exception as e:
+            # Something is wrong with the page.
+            # This may indicate Cloudflare has changed their anti-bot
+            # technique. If you see this and are running the latest version,
+            # please open a GitHub issue so I can update the code accordingly.
+            logging.error("[!] %s Unable to parse Cloudflare anti-bots page. "
+                          "Try upgrading cloudflare-scrape, or submit a bug report "
+                          "if you are running the latest version. Please read "
+                          "https://github.com/Anorov/cloudflare-scrape#updates "
+                          "before submitting a bug report." % e)
+            raise
+
+        # Cloudflare requires a delay before solving the challenge.
+        # Always wait the full delay + 1s because of 'time.sleep()' imprecision.
+        time.sleep(cf_delay + 1.0)
+
+        # Requests transforms any request into a GET after a redirect,
+        # so the redirect has to be handled manually here to allow for
+        # performing other types of requests even as the first request.
+        method = resp.request.method
+        cloudflare_kwargs["allow_redirects"] = False
+
+        redirect = self.request(method, submit_url, **cloudflare_kwargs)
+
+        if 'Location' in redirect.headers:
+            redirect_location = urlparse(redirect.headers["Location"])
+            if not redirect_location.netloc:
+                redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path)
+                return self.request(method, redirect_url, **original_kwargs)
+            return self.request(method, redirect.headers["Location"], **original_kwargs)
+        else:
+            return redirect
+
+    def cf_sample_domain_function(self, func_expression, domain):
+        parameter_start_index = func_expression.find('}(') + 2
+        # Send the expression with the "+" char and enclosing parenthesis included, as they are
+        # stripped inside '.cf_parse_expression()'.
+        sample_index = self.cf_parse_expression(
+            func_expression[parameter_start_index: func_expression.rfind(')))')]
+        )
+        return ord(domain[int(sample_index)])
+
+    def cf_arithmetic_op(self, op, a, b):
+        if op == '+':
+            return a + b
+        elif op == '/':
+            return a / float(b)
+        elif op == '*':
+            return a * float(b)
+        elif op == '-':
+            return a - b
+        else:
+            raise Exception('Unknown operation')
+
+    def cf_parse_expression(self, expression, domain=None):
+
+        def _get_jsfuck_number(section):
+            digit_expressions = section.replace('!+[]', '1').replace('+!![]', '1').replace('+[]', '0').split('+')
+            return int(
+                # Form a number string, with each digit as the sum of the values inside each parenthesis block.
+                ''.join(
+                    str(sum(int(digit_char) for digit_char in digit_expression[1:-1]))  # Strip the parenthesis.
+                    for digit_expression in digit_expressions
+                )
+            )
+
+        if '/' in expression:
+            dividend, divisor = expression.split('/')
+            dividend = dividend[2:-1]  # Strip the leading '+' char and the enclosing parenthesis.
+
+            if domain:
+                # 2019-04-02: At this moment, this extra domain sampling function always appears on the
+                # divisor side, at the end.
+                divisor_a, divisor_b = divisor.split('))+(')
+                divisor_a = _get_jsfuck_number(divisor_a[5:])  # Left-strip the sequence of "(+(+(".
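+                # Worked example: _get_jsfuck_number('(!+[]+!![]+!![])') == 3 --
+                # each '!+[]' or '+!![]' term counts as 1, '+[]' as 0, and each
+                # parenthesis block sums to a single decimal digit.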
+                divisor_b = self.cf_sample_domain_function(divisor_b, domain)
+                return _get_jsfuck_number(dividend) / float(divisor_a + divisor_b)
+            else:
+                divisor = divisor[2:-1]
+                return _get_jsfuck_number(dividend) / float(_get_jsfuck_number(divisor))
+        else:
+            return _get_jsfuck_number(expression[2:-1])
+
+    @classmethod
+    def create_scraper(cls, sess=None, **kwargs):
+        """
+        Convenience function for creating a ready-to-go requests.Session (subclass) object.
+        """
+        scraper = cls()
+
+        if sess:
+            attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
+            for attr in attrs:
+                val = getattr(sess, attr, None)
+                if val:
+                    setattr(scraper, attr, val)
+
+        return scraper
+
+    ## Functions for integrating cloudflare-scrape with other applications and scripts
+
+    @classmethod
+    def get_tokens(cls, url, user_agent=None, **kwargs):
+        scraper = cls.create_scraper()
+        if user_agent:
+            scraper.headers["User-Agent"] = user_agent
+
+        try:
+            resp = scraper.get(url, **kwargs)
+            resp.raise_for_status()
+        except Exception as e:
+            logging.error("'%s' returned an error. Could not collect tokens." % url)
+            raise
+
+        domain = urlparse(resp.url).netloc
+        cookie_domain = None
+
+        for d in scraper.cookies.list_domains():
+            if d.startswith(".") and d in ("." + domain):
+                cookie_domain = d
+                break
+        else:
+            raise ValueError(
+                "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
+
+        return ({
+                    "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
+                    "cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain)
+                },
+                scraper.headers["User-Agent"]
+                )
+
+    def get_live_tokens(self, domain):
+        for d in self.cookies.list_domains():
+            if d.startswith(".") and d in ("." + domain):
+                cookie_domain = d
+                break
+        else:
+            raise ValueError(
+                "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
+
+        return ({
+                    "__cfduid": self.cookies.get("__cfduid", "", domain=cookie_domain),
+                    "cf_clearance": self.cookies.get("cf_clearance", "", domain=cookie_domain)
+                },
+                self.headers["User-Agent"]
+                )
+
+    @classmethod
+    def get_cookie_string(cls, url, user_agent=None, **kwargs):
+        """
+        Convenience function for building a Cookie HTTP header value.
+        """
+        tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
+        return "; ".join("=".join(pair) for pair in tokens.items()), user_agent
+
+
+create_scraper = CloudflareScraper.create_scraper
+get_tokens = CloudflareScraper.get_tokens
+get_cookie_string = CloudflareScraper.get_cookie_string
diff --git a/libs/deathbycaptcha.py b/libs/deathbycaptcha.py
new file mode 100644
index 000000000..3c2fafb77
--- /dev/null
+++ b/libs/deathbycaptcha.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+"""Death by Captcha HTTP and socket API clients.
+
+There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
+socket ones. Both offer the same functionality, with the socket API
+sporting faster responses and using way less connections.
+
+To access the socket API, use SocketClient class; for the HTTP API, use
+HttpClient class. Both are thread-safe. SocketClient keeps a persistent
+connection opened and serializes all API requests sent through it, thus
+it is advised to keep a pool of them if your script is heavily
+multithreaded.
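+
+If you need such a pool, a minimal sketch (Python 2, like this module; the
+credentials are placeholders) could look like:
+
+    import Queue
+    pool = Queue.Queue()
+    for _ in range(4):
+        pool.put(SocketClient('your_username', 'your_password'))
+
+Workers then pool.get() a client, use it, and pool.put() it back when done.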
+
+Both SocketClient and HttpClient give you the following methods:
+
+get_user()
+    Returns your DBC account details as a dict with the following keys:
+
+    "user": your account numeric ID; if login fails, it will be the only
+        item with the value of 0;
+    "rate": your CAPTCHA rate, i.e. how much you will be charged for one
+        solved CAPTCHA in US cents;
+    "balance": your DBC account balance in US cents;
+    "is_banned": flag indicating whether your account is suspended or not.
+
+get_balance()
+    Returns your DBC account balance in US cents.
+
+get_captcha(cid)
+    Returns an uploaded CAPTCHA details as a dict with the following keys:
+
+    "captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
+        be the only item with the value of 0;
+    "text": the CAPTCHA text, if solved, otherwise None;
+    "is_correct": flag indicating whether the CAPTCHA was solved correctly
+        (DBC can detect that in rare cases).
+
+    The only argument `cid` is the CAPTCHA numeric ID.
+
+get_text(cid)
+    Returns an uploaded CAPTCHA text (None if not solved). The only argument
+    `cid` is the CAPTCHA numeric ID.
+
+report(cid)
+    Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
+    CAPTCHA numeric ID. Returns True on success, False otherwise.
+
+upload(captcha)
+    Uploads a CAPTCHA. The only argument `captcha` can be either file-like
+    object (any object with `read` method defined, actually, so StringIO
+    will do), or CAPTCHA image file name. On successful upload you'll get
+    the CAPTCHA details dict (see get_captcha() method).
+
+    NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
+    to poll for its status periodically using get_captcha() or get_text()
+    method until the CAPTCHA is solved and you get the text.
+
+decode(captcha, timeout=DEFAULT_TIMEOUT)
+    A convenient method that uploads a CAPTCHA and polls for its status
+    periodically, but no longer than `timeout` (defaults to 60 seconds).
+    If solved, you'll get the CAPTCHA details dict (see get_captcha()
+    method for details). See upload() method for details on `captcha`
+    argument.
+
+Visit http://www.deathbycaptcha.com/user/api for updates.
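+
+Example (a sketch mirroring the __main__ block at the bottom of this module;
+credentials and the file name are placeholders):
+
+    client = SocketClient('your_username', 'your_password')
+    print 'Balance: %s US cents' % client.get_balance()
+    captcha = client.decode('captcha.jpg', DEFAULT_TIMEOUT)
+    if captcha:
+        print 'CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text'])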
+ +""" + +import base64 +import binascii +import errno +import imghdr +import random +import os +import select +import socket +import sys +import threading +import time +import urllib +import urllib2 +try: + from json import read as json_decode, write as json_encode +except ImportError: + try: + from json import loads as json_decode, dumps as json_encode + except ImportError: + from simplejson import loads as json_decode, dumps as json_encode + + +# API version and unique software ID +API_VERSION = 'DBC/Python v4.6' + +# Default CAPTCHA timeout and decode() polling interval +DEFAULT_TIMEOUT = 60 +DEFAULT_TOKEN_TIMEOUT = 120 +POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2] +DFLT_POLL_INTERVAL = 3 + +# Base HTTP API url +HTTP_BASE_URL = 'http://api.dbcapi.me/api' + +# Preferred HTTP API server's response content type, do not change +HTTP_RESPONSE_TYPE = 'application/json' + +# Socket API server's host & ports range +SOCKET_HOST = 'api.dbcapi.me' +SOCKET_PORTS = range(8123, 8131) + + +def _load_image(captcha): + if hasattr(captcha, 'read'): + img = captcha.read() + elif type(captcha) == bytearray: + img = captcha + else: + img = '' + try: + captcha_file = open(captcha, 'rb') + except Exception: + raise + else: + img = captcha_file.read() + captcha_file.close() + if not len(img): + raise ValueError('CAPTCHA image is empty') + elif imghdr.what(None, img) is None: + raise TypeError('Unknown CAPTCHA image type') + else: + return img + + +class AccessDeniedException(Exception): + pass + + +class Client(object): + + """Death by Captcha API Client.""" + + def __init__(self, username, password): + self.is_verbose = False + self.userpwd = {'username': username, 'password': password} + + def _log(self, cmd, msg=''): + if self.is_verbose: + print '%d %s %s' % (time.time(), cmd, msg.rstrip()) + return self + + def close(self): + pass + + def connect(self): + pass + + def get_user(self): + """Fetch user details -- ID, balance, rate and banned status.""" + raise NotImplementedError() + + def get_balance(self): + """Fetch user balance (in US cents).""" + return self.get_user().get('balance') + + def get_captcha(self, cid): + """Fetch a CAPTCHA details -- ID, text and correctness flag.""" + raise NotImplementedError() + + def get_text(self, cid): + """Fetch a CAPTCHA text.""" + return self.get_captcha(cid).get('text') or None + + def report(self, cid): + """Report a CAPTCHA as incorrectly solved.""" + raise NotImplementedError() + + def upload(self, captcha): + """Upload a CAPTCHA. + + Accepts file names and file-like objects. Returns CAPTCHA details + dict on success. + + """ + raise NotImplementedError() + + def decode(self, captcha=None, timeout=None, **kwargs): + """ + Try to solve a CAPTCHA. + + See Client.upload() for arguments details. + + Uploads a CAPTCHA, polls for its status periodically with arbitrary + timeout (in seconds), returns CAPTCHA details if (correctly) solved. 
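+
+        With no `captcha` argument (token mode), `timeout` defaults to
+        DEFAULT_TOKEN_TIMEOUT (120 seconds) rather than DEFAULT_TIMEOUT
+        (60 seconds), as the branch below shows.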
+ """ + if not timeout: + if not captcha: + timeout = DEFAULT_TOKEN_TIMEOUT + else: + timeout = DEFAULT_TIMEOUT + + deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT) + uploaded_captcha = self.upload(captcha, **kwargs) + if uploaded_captcha: + intvl_idx = 0 # POLL_INTERVAL index + while deadline > time.time() and not uploaded_captcha.get('text'): + intvl, intvl_idx = self._get_poll_interval(intvl_idx) + time.sleep(intvl) + pulled = self.get_captcha(uploaded_captcha['captcha']) + if pulled['captcha'] == uploaded_captcha['captcha']: + uploaded_captcha = pulled + if uploaded_captcha.get('text') and \ + uploaded_captcha.get('is_correct'): + return uploaded_captcha + + def _get_poll_interval(self, idx): + """Returns poll interval and next index depending on index provided""" + + if len(POLLS_INTERVAL) > idx: + intvl = POLLS_INTERVAL[idx] + else: + intvl = DFLT_POLL_INTERVAL + idx += 1 + + return intvl, idx + + +class HttpClient(Client): + + """Death by Captcha HTTP API client.""" + + def __init__(self, *args): + Client.__init__(self, *args) + self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler()) + + def _call(self, cmd, payload=None, headers=None): + if headers is None: + headers = {} + headers['Accept'] = HTTP_RESPONSE_TYPE + headers['User-Agent'] = API_VERSION + if hasattr(payload, 'items'): + payload = urllib.urlencode(payload) + self._log('SEND', '%s %d %s' % (cmd, len(payload), payload)) + else: + self._log('SEND', '%s' % cmd) + if payload is not None: + headers['Content-Length'] = len(payload) + try: + response = self.opener.open(urllib2.Request( + HTTP_BASE_URL + '/' + cmd.strip('/'), + data=payload, + headers=headers + )).read() + except urllib2.HTTPError, err: + if 403 == err.code: + raise AccessDeniedException('Access denied, please check' + ' your credentials and/or balance') + elif 400 == err.code or 413 == err.code: + raise ValueError("CAPTCHA was rejected by the service, check" + " if it's a valid image") + elif 503 == err.code: + raise OverflowError("CAPTCHA was rejected due to service" + " overload, try again later") + else: + raise err + else: + self._log('RECV', '%d %s' % (len(response), response)) + try: + return json_decode(response) + except Exception: + raise RuntimeError('Invalid API response') + return {} + + def get_user(self): + return self._call('user', self.userpwd.copy()) or {'user': 0} + + def get_captcha(self, cid): + return self._call('captcha/%d' % cid) or {'captcha': 0} + + def report(self, cid): + return not self._call('captcha/%d/report' % cid, + self.userpwd.copy()).get('is_correct') + + def upload(self, captcha=None, **kwargs): + boundary = binascii.hexlify(os.urandom(16)) + banner = kwargs.get('banner', '') + if banner: + kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner)) + body = '\r\n'.join(('\r\n'.join(( + '--%s' % boundary, + 'Content-Disposition: form-data; name="%s"' % k, + 'Content-Type: text/plain', + 'Content-Length: %d' % len(str(v)), + '', + str(v) + ))) for k, v in self.userpwd.items()) + + body += '\r\n'.join(('\r\n'.join(( + '--%s' % boundary, + 'Content-Disposition: form-data; name="%s"' % k, + 'Content-Type: text/plain', + 'Content-Length: %d' % len(str(v)), + '', + str(v) + ))) for k, v in kwargs.items()) + + if captcha: + img = _load_image(captcha) + body += '\r\n'.join(( + '', + '--%s' % boundary, + 'Content-Disposition: form-data; name="captchafile"; ' + 'filename="captcha"', + 'Content-Type: application/octet-stream', + 'Content-Length: %d' % len(img), + '', + img, + '--%s--' % boundary, + '' + 
)) + + response = self._call('captcha', body, { + 'Content-Type': 'multipart/form-data; boundary="%s"' % boundary + }) or {} + if response.get('captcha'): + return response + + +class SocketClient(Client): + + """Death by Captcha socket API client.""" + + TERMINATOR = '\r\n' + + def __init__(self, *args): + Client.__init__(self, *args) + self.socket_lock = threading.Lock() + self.socket = None + + def close(self): + if self.socket: + self._log('CLOSE') + try: + self.socket.shutdown(socket.SHUT_RDWR) + except socket.error: + pass + finally: + self.socket.close() + self.socket = None + + def connect(self): + if not self.socket: + self._log('CONN') + host = (socket.gethostbyname(SOCKET_HOST), + random.choice(SOCKET_PORTS)) + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.socket.settimeout(0) + try: + self.socket.connect(host) + except socket.error, err: + if (err.args[0] not in + (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): + self.close() + raise err + return self.socket + + def __del__(self): + self.close() + + def _sendrecv(self, sock, buf): + self._log('SEND', buf) + fds = [sock] + buf += self.TERMINATOR + response = '' + intvl_idx = 0 + while True: + intvl, intvl_idx = self._get_poll_interval(intvl_idx) + rds, wrs, exs = select.select((not buf and fds) or [], + (buf and fds) or [], + fds, + intvl) + if exs: + raise IOError('select() failed') + try: + if wrs: + while buf: + buf = buf[wrs[0].send(buf):] + elif rds: + while True: + s = rds[0].recv(256) + if not s: + raise IOError('recv(): connection lost') + else: + response += s + except socket.error, err: + if (err.args[0] not in + (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): + raise err + if response.endswith(self.TERMINATOR): + self._log('RECV', response) + return response.rstrip(self.TERMINATOR) + raise IOError('send/recv timed out') + + def _call(self, cmd, data=None): + if data is None: + data = {} + data['cmd'] = cmd + data['version'] = API_VERSION + request = json_encode(data) + + response = None + for _ in range(2): + if not self.socket and cmd != 'login': + self._call('login', self.userpwd.copy()) + self.socket_lock.acquire() + try: + sock = self.connect() + response = self._sendrecv(sock, request) + except IOError, err: + sys.stderr.write(str(err) + "\n") + self.close() + except socket.error, err: + sys.stderr.write(str(err) + "\n") + self.close() + raise IOError('Connection refused') + else: + break + finally: + self.socket_lock.release() + + if response is None: + raise IOError('Connection lost or timed out during API request') + + try: + response = json_decode(response) + except Exception: + raise RuntimeError('Invalid API response') + + if not response.get('error'): + return response + + error = response['error'] + if error in ('not-logged-in', 'invalid-credentials'): + raise AccessDeniedException('Access denied, check your credentials') + elif 'banned' == error: + raise AccessDeniedException('Access denied, account is suspended') + elif 'insufficient-funds' == error: + raise AccessDeniedException( + 'CAPTCHA was rejected due to low balance') + elif 'invalid-captcha' == error: + raise ValueError('CAPTCHA is not a valid image') + elif 'service-overload' == error: + raise OverflowError( + 'CAPTCHA was rejected due to service overload, try again later') + else: + self.socket_lock.acquire() + self.close() + self.socket_lock.release() + raise RuntimeError('API server error occured: %s' % error) + + def get_user(self): + return self._call('user') or {'user': 0} + + def get_captcha(self, 
cid): + return self._call('captcha', {'captcha': cid}) or {'captcha': 0} + + def upload(self, captcha=None, **kwargs): + data = {} + if captcha: + data['captcha'] = base64.b64encode(_load_image(captcha)) + if kwargs: + banner = kwargs.get('banner', '') + if banner: + kwargs['banner'] = base64.b64encode(_load_image(banner)) + data.update(kwargs) + response = self._call('upload', data) + if response.get('captcha'): + uploaded_captcha = dict( + (k, response.get(k)) + for k in ('captcha', 'text', 'is_correct') + ) + if not uploaded_captcha['text']: + uploaded_captcha['text'] = None + return uploaded_captcha + + def report(self, cid): + return not self._call('report', {'captcha': cid}).get('is_correct') + + +if '__main__' == __name__: + # Put your DBC username & password here: + # client = HttpClient(sys.argv[1], sys.argv[2]) + client = SocketClient(sys.argv[1], sys.argv[2]) + client.is_verbose = True + + print 'Your balance is %s US cents' % client.get_balance() + + for fn in sys.argv[3:]: + try: + # Put your CAPTCHA image file name or file-like object, and optional + # solving timeout (in seconds) here: + captcha = client.decode(fn, DEFAULT_TIMEOUT) + except Exception, e: + sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, )) + captcha = None + + if captcha: + print 'CAPTCHA %d solved: %s' % \ + (captcha['captcha'], captcha['text']) + + # Report as incorrectly solved if needed. Make sure the CAPTCHA was + # in fact incorrectly solved! + # try: + # client.report(captcha['captcha']) + # except Exception, e: + # sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, )) diff --git a/libs/python_anticaptcha/__init__.py b/libs/python_anticaptcha/__init__.py new file mode 100644 index 000000000..ac9f0550f --- /dev/null +++ b/libs/python_anticaptcha/__init__.py @@ -0,0 +1,7 @@ +from .base import AnticaptchaClient +from .tasks import NoCaptchaTask, NoCaptchaTaskProxylessTask, ImageToTextTask, FunCaptchaTask +from .proxy import Proxy +from .exceptions import AnticaptchaException +from .fields import SimpleText, Image, WebLink, TextInput, Textarea, Checkbox, Select, Radio, ImageUpload + +AnticatpchaException = AnticaptchaException \ No newline at end of file diff --git a/libs/python_anticaptcha/base.py b/libs/python_anticaptcha/base.py new file mode 100644 index 000000000..fca8cdf53 --- /dev/null +++ b/libs/python_anticaptcha/base.py @@ -0,0 +1,114 @@ +import requests +import time + +from six.moves.urllib_parse import urljoin +from .exceptions import AnticaptchaException + +SLEEP_EVERY_CHECK_FINISHED = 3 +MAXIMUM_JOIN_TIME = 60 * 5 + + +class Job(object): + client = None + task_id = None + _last_result = None + + def __init__(self, client, task_id): + self.client = client + self.task_id = task_id + + def _update(self): + self._last_result = self.client.getTaskResult(self.task_id) + + def check_is_ready(self): + self._update() + return self._last_result['status'] == 'ready' + + def get_solution_response(self): # Recaptcha + return self._last_result['solution']['gRecaptchaResponse'] + + def get_token_response(self): # Funcaptcha + return self._last_result['solution']['token'] + + def get_answers(self): + return self._last_result['solution']['answers'] + + def get_captcha_text(self): # Image + return self._last_result['solution']['text'] + + def report_incorrect(self): + return self.client.reportIncorrectImage(self.task_id) + + def join(self, maximum_time=None): + elapsed_time = 0 + maximum_time = maximum_time or MAXIMUM_JOIN_TIME + while not self.check_is_ready(): + 
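+            # polls getTaskResult() every SLEEP_EVERY_CHECK_FINISHED (3 s) until status == 'ready'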
+            time.sleep(SLEEP_EVERY_CHECK_FINISHED)
+            elapsed_time += SLEEP_EVERY_CHECK_FINISHED
+            if elapsed_time is not None and elapsed_time > maximum_time:
+                raise AnticaptchaException(None, 250,
+                                           "The execution time exceeded a maximum time of {} seconds. It took {} seconds.".format(
+                                               maximum_time, elapsed_time))
+
+
+class AnticaptchaClient(object):
+    client_key = None
+    CREATE_TASK_URL = "/createTask"
+    TASK_RESULT_URL = "/getTaskResult"
+    BALANCE_URL = "/getBalance"
+    REPORT_IMAGE_URL = "/reportIncorrectImageCaptcha"
+    SOFT_ID = 847
+    language_pool = "en"
+
+    def __init__(self, client_key, language_pool="en", host="api.anti-captcha.com", use_ssl=True):
+        self.client_key = client_key
+        self.language_pool = language_pool
+        self.base_url = "{proto}://{host}/".format(proto="https" if use_ssl else "http",
+                                                   host=host)
+        self.session = requests.Session()
+
+    @property
+    def client_ip(self):
+        if not hasattr(self, '_client_ip'):
+            self._client_ip = self.session.get('http://httpbin.org/ip').json()['origin']
+        return self._client_ip
+
+    def _check_response(self, response):
+        if response.get('errorId', False) == 11:
+            response['errorDescription'] = "{} Your missing IP address is {}.".format(response['errorDescription'],
+                                                                                      self.client_ip)
+        if response.get('errorId', False):
+            raise AnticaptchaException(response['errorId'],
+                                       response['errorCode'],
+                                       response['errorDescription'])
+
+    def createTask(self, task):
+        request = {"clientKey": self.client_key,
+                   "task": task.serialize(),
+                   "softId": self.SOFT_ID,
+                   "languagePool": self.language_pool,
+                   }
+        response = self.session.post(urljoin(self.base_url, self.CREATE_TASK_URL), json=request).json()
+        self._check_response(response)
+        return Job(self, response['taskId'])
+
+    def getTaskResult(self, task_id):
+        request = {"clientKey": self.client_key,
+                   "taskId": task_id}
+        response = self.session.post(urljoin(self.base_url, self.TASK_RESULT_URL), json=request).json()
+        self._check_response(response)
+        return response
+
+    def getBalance(self):
+        request = {"clientKey": self.client_key}
+        response = self.session.post(urljoin(self.base_url, self.BALANCE_URL), json=request).json()
+        self._check_response(response)
+        return response['balance']
+
+    def reportIncorrectImage(self, task_id):
+        request = {"clientKey": self.client_key,
+                   "taskId": task_id
+                   }
+        response = self.session.post(urljoin(self.base_url, self.REPORT_IMAGE_URL), json=request).json()
+        self._check_response(response)
+        return response.get('status', False) != False
diff --git a/libs/python_anticaptcha/exceptions.py b/libs/python_anticaptcha/exceptions.py
new file mode 100644
index 000000000..f37eb372c
--- /dev/null
+++ b/libs/python_anticaptcha/exceptions.py
@@ -0,0 +1,23 @@
+class AnticaptchaException(Exception):
+    def __init__(self, error_id, error_code, error_description, *args):
+        super(AnticaptchaException, self).__init__("[{}:{}]{}".format(error_code, error_id, error_description))
+        self.error_description = error_description
+        self.error_id = error_id
+        self.error_code = error_code
+
+
+AnticatpchaException = AnticaptchaException
+
+
+class InvalidWidthException(AnticaptchaException):
+    def __init__(self, width):
+        self.width = width
+        msg = 'Invalid width (%s). Can be one of these: 100, 50, 33, 25.' % (self.width,)
+        super(InvalidWidthException, self).__init__("AC-1", 1, msg)
+
+
+class MissingNameException(AnticaptchaException):
+    def __init__(self, cls):
+        self.cls = cls
+        msg = 'Missing name data in {0}. Provide {0}.__init__(name="X") or {0}.serialize(name="X")'.format(str(self.cls))
+        super(MissingNameException, self).__init__("AC-2", 2, msg)
diff --git a/libs/python_anticaptcha/fields.py b/libs/python_anticaptcha/fields.py
new file mode 100644
index 000000000..9e6245946
--- /dev/null
+++ b/libs/python_anticaptcha/fields.py
@@ -0,0 +1,199 @@
+import six
+from python_anticaptcha.exceptions import InvalidWidthException, MissingNameException
+
+
+class BaseField(object):
+    label = None
+    labelHint = None
+
+    def serialize(self, name=None):
+        data = {}
+        if self.label:
+            data['label'] = self.label or False
+        if self.labelHint:
+            data['labelHint'] = self.labelHint or False
+        return data
+
+
+class NameBaseField(BaseField):
+    name = None
+
+    def serialize(self, name=None):
+        data = super(NameBaseField, self).serialize(name)
+        if name:
+            data['name'] = name
+        elif self.name:
+            data['name'] = self.name
+        else:
+            raise MissingNameException(cls=self.__class__)
+        return data
+
+
+class SimpleText(BaseField):
+    contentType = 'text'
+
+    def __init__(self, content, label=None, labelHint=None, width=None):
+        self.label = label
+        self.labelHint = labelHint
+
+        self.content = content
+        self.width = width
+
+    def serialize(self, name=None):
+        data = super(SimpleText, self).serialize(name)
+        data['contentType'] = self.contentType
+        data['content'] = self.content
+
+        if self.width:
+            if self.width not in [100, 50, 33, 25]:
+                raise InvalidWidthException(self.width)
+            data['inputOptions'] = {}
+            data['width'] = self.width
+        return data
+
+
+class Image(BaseField):
+    contentType = 'image'
+
+    def __init__(self, imageUrl, label=None, labelHint=None):
+        self.label = label
+        self.labelHint = labelHint
+        self.imageUrl = imageUrl
+
+    def serialize(self, name=None):
+        data = super(Image, self).serialize(name)
+        data['contentType'] = self.contentType
+        data['content'] = self.imageUrl
+        return data
+
+
+class WebLink(BaseField):
+    contentType = 'link'
+
+    def __init__(self, linkText, linkUrl, label=None, labelHint=None, width=None):
+        self.label = label
+        self.labelHint = labelHint
+
+        self.linkText = linkText
+        self.linkUrl = linkUrl
+
+        self.width = width
+
+    def serialize(self, name=None):
+        data = super(WebLink, self).serialize(name)
+        data['contentType'] = self.contentType
+
+        if self.width:
+            if self.width not in [100, 50, 33, 25]:
+                raise InvalidWidthException(self.width)
+            data['inputOptions'] = {}
+            data['width'] = self.width
+
+        data.update({'content': {'url': self.linkUrl,
+                                 'text': self.linkText}})
+
+        return data
+
+
+class TextInput(NameBaseField):
+    def __init__(self, placeHolder=None, label=None, labelHint=None, width=None):
+        self.label = label
+        self.labelHint = labelHint
+
+        self.placeHolder = placeHolder
+
+        self.width = width
+
+    def serialize(self, name=None):
+        data = super(TextInput, self).serialize(name)
+        data['inputType'] = 'text'
+
+        data['inputOptions'] = {}
+
+        if self.width:
+            if self.width not in [100, 50, 33, 25]:
+                raise InvalidWidthException(self.width)
+
+            data['inputOptions']['width'] = str(self.width)
+
+        if self.placeHolder:
+            data['inputOptions']['placeHolder'] = self.placeHolder
+        return data
+
+
+class Textarea(NameBaseField):
+    def __init__(self, placeHolder=None, rows=None, label=None, width=None, labelHint=None):
+        self.label = label
+        self.labelHint = labelHint
+
+        self.placeHolder = placeHolder
+        self.rows = rows
+        self.width = width
+
+    def serialize(self, name=None):
+        data = super(Textarea, self).serialize(name)
+        data['inputType'] = 'textarea'
+        data['inputOptions'] = {}
+        if self.rows:
+            data['inputOptions']['rows'] = str(self.rows)
+        if self.placeHolder:
+            data['inputOptions']['placeHolder'] = self.placeHolder
+        if self.width:
+            data['inputOptions']['width'] = str(self.width)
+        return data
+
+
+class Checkbox(NameBaseField):
+    def __init__(self, text, label=None, labelHint=None):
+        self.label = label
+        self.labelHint = labelHint
+
+        self.text = text
+
+    def serialize(self, name=None):
+        data = super(Checkbox, self).serialize(name)
+        data['inputType'] = 'checkbox'
+        data['inputOptions'] = {'label': self.text}
+        return data
+
+
+class Select(NameBaseField):
+    type = 'select'
+
+    def __init__(self, label=None, choices=None, labelHint=None):
+        self.label = label
+        self.labelHint = labelHint
+        self.choices = choices or ()
+
+    def get_choices(self):
+        for choice in self.choices:
+            if isinstance(choice, six.text_type):
+                yield choice, choice
+            else:
+                yield choice
+
+    def serialize(self, name=None):
+        data = super(Select, self).serialize(name)
+        data['inputType'] = self.type
+
+        data['inputOptions'] = []
+        for value, caption in self.get_choices():
+            data['inputOptions'].append({"value": value,
+                                         "caption": caption})
+
+        return data
+
+
+class Radio(Select):
+    type = 'radio'
+
+
+class ImageUpload(NameBaseField):
+    def __init__(self, label=None, labelHint=None):
+        self.label = label
+        self.labelHint = labelHint
+
+    def serialize(self, name=None):
+        data = super(ImageUpload, self).serialize(name)
+        data['inputType'] = 'imageUpload'
+        return data
diff --git a/libs/python_anticaptcha/proxy.py b/libs/python_anticaptcha/proxy.py
new file mode 100644
index 000000000..907232f7e
--- /dev/null
+++ b/libs/python_anticaptcha/proxy.py
@@ -0,0 +1,28 @@
+from six.moves.urllib_parse import urlparse
+
+
+class Proxy(object):
+    def __init__(self, proxy_type, proxy_address, proxy_port, proxy_login, proxy_password):
+        self.proxyType = proxy_type
+        self.proxyAddress = proxy_address
+        self.proxyPort = proxy_port
+        self.proxyLogin = proxy_login
+        self.proxyPassword = proxy_password
+
+    def serialize(self):
+        result = {'proxyType': self.proxyType,
+                  'proxyAddress': self.proxyAddress,
+                  'proxyPort': self.proxyPort}
+        if self.proxyLogin or self.proxyPassword:
+            result['proxyLogin'] = self.proxyLogin
+            result['proxyPassword'] = self.proxyPassword
+        return result
+
+    @classmethod
+    def parse_url(cls, url):
+        parsed = urlparse(url)
+        return cls(proxy_type=parsed.scheme,
+                   proxy_address=parsed.hostname,
+                   proxy_port=parsed.port,
+                   proxy_login=parsed.username,
+                   proxy_password=parsed.password)
diff --git a/libs/python_anticaptcha/tasks.py b/libs/python_anticaptcha/tasks.py
new file mode 100644
index 000000000..57462763f
--- /dev/null
+++ b/libs/python_anticaptcha/tasks.py
@@ -0,0 +1,128 @@
+import base64
+from .fields import BaseField
+
+
+class BaseTask(object):
+    def serialize(self, **result):
+        return result
+
+
+class ProxyMixin(BaseTask):
+    def __init__(self, *args, **kwargs):
+        self.proxy = kwargs.pop('proxy')
+        self.userAgent = kwargs.pop('user_agent')
+        self.cookies = kwargs.pop('cookies', '')
+        super(ProxyMixin, self).__init__(*args, **kwargs)
+
+    def serialize(self, **result):
+        result = super(ProxyMixin, self).serialize(**result)
+        result.update(self.proxy.serialize())
+        result['userAgent'] = self.userAgent
+        if self.cookies:
+            result['cookies'] = self.cookies
+        return result
+
+
+class NoCaptchaTaskProxylessTask(BaseTask):
+    type = "NoCaptchaTaskProxyless"
+    websiteURL = None
+    websiteKey = None
+    websiteSToken = None
+
+    def __init__(self, website_url, website_key,
website_s_token=None, is_invisible=None): + self.websiteURL = website_url + self.websiteKey = website_key + self.websiteSToken = website_s_token + self.isInvisible = is_invisible + + def serialize(self): + data = {'type': self.type, + 'websiteURL': self.websiteURL, + 'websiteKey': self.websiteKey} + if self.websiteSToken is not None: + data['websiteSToken'] = self.websiteSToken + if self.isInvisible is not None: + data['isInvisible'] = self.isInvisible + return data + + +class FunCaptchaTask(ProxyMixin): + type = "FunCaptchaTask" + websiteURL = None + websiteKey = None + + def __init__(self, website_url, website_key, *args, **kwargs): + self.websiteURL = website_url + self.websiteKey = website_key + super(FunCaptchaTask, self).__init__(*args, **kwargs) + + def serialize(self, **result): + result = super(FunCaptchaTask, self).serialize(**result) + result.update({'type': self.type, + 'websiteURL': self.websiteURL, + 'websitePublicKey': self.websiteKey}) + return result + + +class NoCaptchaTask(ProxyMixin, NoCaptchaTaskProxylessTask): + type = "NoCaptchaTask" + + +class ImageToTextTask(object): + type = "ImageToTextTask" + fp = None + phrase = None + case = None + numeric = None + math = None + minLength = None + maxLength = None + + def __init__(self, fp, phrase=None, case=None, numeric=None, math=None, min_length=None, max_length=None): + self.fp = fp + self.phrase = phrase + self.case = case + self.numeric = numeric + self.math = math + self.minLength = min_length + self.maxLength = max_length + + def serialize(self): + return {'type': self.type, + 'body': base64.b64encode(self.fp.read()).decode('utf-8'), + 'phrase': self.phrase, + 'case': self.case, + 'numeric': self.numeric, + 'math': self.math, + 'minLength': self.minLength, + 'maxLength': self.maxLength} + + +class CustomCaptchaTask(BaseTask): + type = 'CustomCaptchaTask' + imageUrl = None + assignment = None + form = None + + def __init__(self, imageUrl, form=None, assignment=None): + self.imageUrl = imageUrl + self.form = form or {} + self.assignment = assignment + + def serialize(self): + data = super(CustomCaptchaTask, self).serialize() + data.update({'type': self.type, + 'imageUrl': self.imageUrl}) + if self.form: + forms = [] + for name, field in self.form.items(): + if isinstance(field, BaseField): + forms.append(field.serialize(name)) + else: + field = field.copy() + field['name'] = name + forms.append(field) + data['forms'] = forms + if self.assignment: + data['assignment'] = self.assignment + return data diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 5dda9fb3c..df38b4e09 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -518,10 +518,20 @@ def scan_video(path, dont_use_actual_file=False, hints=None, providers=None, ski hints["expected_title"] = [hints["title"]] guessed_result = guessit(guess_from, options=hints) + logger.debug('GuessIt found: %s', json.dumps(guessed_result, cls=GuessitEncoder, indent=4, ensure_ascii=False)) video = Video.fromguess(path, guessed_result) video.hints = hints + # get possibly alternative title from the filename itself + alt_guess = guessit(filename, options=hints) + if "title" in alt_guess and alt_guess["title"] != guessed_result["title"]: + if video_type == "episode": + video.alternative_series.append(alt_guess["title"]) + else: + video.alternative_titles.append(alt_guess["title"]) + logger.debug("Adding alternative title: %s", alt_guess["title"]) + if dont_use_actual_file: return video diff --git a/libs/subliminal_patch/http.py 
b/libs/subliminal_patch/http.py index d6fddb358..c813f5585 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -8,10 +8,18 @@ import requests import xmlrpclib import dns.resolver -from requests import Session, exceptions +from requests import exceptions from urllib3.util import connection from retry.api import retry_call from exceptions import APIThrottled +from dogpile.cache.api import NO_VALUE +from subliminal.cache import region +from cfscrape import CloudflareScraper + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse from subzero.lib.io import get_viable_encoding @@ -30,24 +38,58 @@ custom_resolver = dns.resolver.Resolver(configure=False) custom_resolver.nameservers = ['8.8.8.8', '1.1.1.1'] -class CertifiSession(Session): +class CertifiSession(CloudflareScraper): timeout = 10 def __init__(self): super(CertifiSession, self).__init__() self.verify = pem_file - - def request(self, *args, **kwargs): + self.headers.update({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Cache-Control': 'no-cache', + 'Pragma': 'no-cache', + 'DNT': '1' + }) + + def request(self, method, url, *args, **kwargs): if kwargs.get('timeout') is None: kwargs['timeout'] = self.timeout - return super(CertifiSession, self).request(*args, **kwargs) + + parsed_url = urlparse(url) + domain = parsed_url.netloc + + cache_key = "cf_data_%s" % domain + + if not self.cookies.get("__cfduid", "", domain=domain): + cf_data = region.get(cache_key) + if cf_data is not NO_VALUE: + cf_cookies, user_agent = cf_data + logger.debug("Trying to use old cf data for %s: %s", domain, cf_data) + for cookie, value in cf_cookies.iteritems(): + self.cookies.set(cookie, value, domain=domain) + + self.headers['User-Agent'] = user_agent + + ret = super(CertifiSession, self).request(method, url, *args, **kwargs) + try: + cf_data = self.get_live_tokens(domain) + except: + pass + else: + if cf_data != region.get(cache_key) and self.cookies.get("__cfduid", "", domain=domain)\ + and self.cookies.get("cf_clearance", "", domain=domain): + logger.debug("Storing cf data for %s: %s", domain, cf_data) + region.set(cache_key, cf_data) + + return ret class RetryingSession(CertifiSession): proxied_functions = ("get", "post") def __init__(self): - super(CertifiSession, self).__init__() + super(RetryingSession, self).__init__() self.verify = pem_file proxy = os.environ.get('SZ_HTTP_PROXY') @@ -62,7 +104,7 @@ class RetryingSession(CertifiSession): # fixme: may be a little loud logger.debug("Using proxy %s for: %s", self.proxies["http"], args[0]) - return retry_call(getattr(super(CertifiSession, self), method), fargs=args, fkwargs=kwargs, tries=3, delay=5, + return retry_call(getattr(super(RetryingSession, self), method), fargs=args, fkwargs=kwargs, tries=3, delay=5, exceptions=(exceptions.ConnectionError, exceptions.ProxyError, exceptions.SSLError, diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py new file mode 100644 index 000000000..b2cef63b3 --- /dev/null +++ b/libs/subliminal_patch/pitcher.py @@ -0,0 +1,257 @@ +# coding=utf-8 + +import os +import time +import logging +import json +from subliminal.cache import region +from dogpile.cache.api import NO_VALUE +from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\ + Proxy +from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT + + +logger = 
logging.getLogger(__name__) + + +class PitcherRegistry(object): + pitchers = [] + pitchers_by_key = {} + + def register(self, cls): + idx = len(self.pitchers) + self.pitchers.append(cls) + key = "%s_%s" % (cls.name, cls.needs_proxy) + key_by_source = "%s_%s" % (cls.source, cls.needs_proxy) + self.pitchers_by_key[key] = idx + self.pitchers_by_key[key_by_source] = idx + return cls + + def get_pitcher(self, name_or_site=None, with_proxy=False): + name_or_site = name_or_site or os.environ.get("ANTICAPTCHA_CLASS") + if not name_or_site: + raise Exception("AntiCaptcha class not given, exiting") + + key = "%s_%s" % (name_or_site, with_proxy) + + if key not in self.pitchers_by_key: + raise Exception("Pitcher %s not found (proxy: %s)" % (name_or_site, with_proxy)) + + return self.pitchers[self.pitchers_by_key.get(key)] + + +registry = pitchers = PitcherRegistry() + + +class Pitcher(object): + name = None + source = None + needs_proxy = False + tries = 3 + job = None + client = None + client_key = None + website_url = None + website_key = None + website_name = None + solve_time = None + success = False + + def __init__(self, website_name, website_url, website_key, tries=3, client_key=None, *args, **kwargs): + self.tries = tries + self.client_key = client_key or os.environ.get("ANTICAPTCHA_ACCOUNT_KEY") + if not self.client_key: + raise Exception("AntiCaptcha key not given, exiting") + + self.website_name = website_name + self.website_key = website_key + self.website_url = website_url + self.success = False + self.solve_time = None + + def get_client(self): + raise NotImplementedError + + def get_job(self): + raise NotImplementedError + + def _throw(self): + self.client = self.get_client() + self.job = self.get_job() + + def throw(self): + t = time.time() + data = self._throw() + if self.success: + self.solve_time = time.time() - t + logger.info("%s: Solving took %ss", self.website_name, int(self.solve_time)) + return data + + +@registry.register +class AntiCaptchaProxyLessPitcher(Pitcher): + name = "AntiCaptchaProxyLess" + source = "anti-captcha.com" + host = "api.anti-captcha.com" + language_pool = "en" + tries = 5 + use_ssl = True + is_invisible = False + + def __init__(self, website_name, website_url, website_key, tries=3, host=None, language_pool=None, + use_ssl=True, is_invisible=False, *args, **kwargs): + super(AntiCaptchaProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries, *args, + **kwargs) + self.host = host or self.host + self.language_pool = language_pool or self.language_pool + self.use_ssl = use_ssl + self.is_invisible = is_invisible + + def get_client(self): + return AnticaptchaClient(self.client_key, self.language_pool, self.host, self.use_ssl) + + def get_job(self): + task = NoCaptchaTaskProxylessTask(website_url=self.website_url, website_key=self.website_key, + is_invisible=self.is_invisible) + return self.client.createTask(task) + + def _throw(self): + for i in range(self.tries): + try: + super(AntiCaptchaProxyLessPitcher, self)._throw() + self.job.join() + ret = self.job.get_solution_response() + if ret: + self.success = True + return ret + except AnticaptchaException as e: + if i >= self.tries - 1: + logger.error("%s: Captcha solving finally failed. Exiting", self.website_name) + return + + if e.error_code == 'ERROR_ZERO_BALANCE': + logger.error("%s: No balance left on captcha solving service. 
Exiting", self.website_name) + return + + elif e.error_code == 'ERROR_NO_SLOT_AVAILABLE': + logger.info("%s: No captcha solving slot available, retrying", self.website_name) + time.sleep(5.0) + continue + + elif e.error_code == 'ERROR_KEY_DOES_NOT_EXIST': + logger.error("%s: Bad AntiCaptcha API key", self.website_name) + return + + elif e.error_id is None and e.error_code == 250: + # timeout + if i < self.tries: + logger.info("%s: Captcha solving timed out, retrying", self.website_name) + time.sleep(1.0) + continue + else: + logger.error("%s: Captcha solving timed out three times; bailing out", self.website_name) + return + raise + + +@registry.register +class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher): + name = "AntiCaptcha" + proxy = None + needs_proxy = True + user_agent = None + cookies = None + + def __init__(self, *args, **kwargs): + self.proxy = Proxy.parse_url(kwargs.pop("proxy")) + self.user_agent = kwargs.pop("user_agent") + cookies = kwargs.pop("cookies", {}) + if isinstance(cookies, dict): + self.cookies = ";".join(["%s=%s" % (k, v) for k, v in cookies.iteritems()]) + + super(AntiCaptchaPitcher, self).__init__(*args, **kwargs) + + def get_job(self): + task = NoCaptchaTask(website_url=self.website_url, website_key=self.website_key, proxy=self.proxy, + user_agent=self.user_agent, cookies=self.cookies, is_invisible=self.is_invisible) + return self.client.createTask(task) + + +@registry.register +class DBCProxyLessPitcher(Pitcher): + name = "DeathByCaptchaProxyLess" + source = "deathbycaptcha.com" + username = None + password = None + + def __init__(self, website_name, website_url, website_key, + timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs): + super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries) + + self.username, self.password = self.client_key.split(":", 1) + self.timeout = timeout + + def get_client(self): + return DBCClient(self.username, self.password) + + def get_job(self): + pass + + @property + def payload_dict(self): + return { + "googlekey": self.website_key, + "pageurl": self.website_url + } + + def _throw(self): + super(DBCProxyLessPitcher, self)._throw() + payload = json.dumps(self.payload_dict) + for i in range(self.tries): + try: + #balance = self.client.get_balance() + data = self.client.decode(timeout=self.timeout, type=4, token_params=payload) + if data and data["is_correct"] and data["text"]: + self.success = True + return data["text"] + except: + raise + + +@registry.register +class DBCPitcher(DBCProxyLessPitcher): + name = "DeathByCaptcha" + proxy = None + needs_proxy = True + proxy_type = "HTTP" + + def __init__(self, *args, **kwargs): + self.proxy = kwargs.pop("proxy") + super(DBCPitcher, self).__init__(*args, **kwargs) + + @property + def payload_dict(self): + payload = super(DBCPitcher, self).payload_dict + payload.update({ + "proxytype": self.proxy_type, + "proxy": self.proxy + }) + return payload + + +def load_verification(site_name, session, callback=lambda x: None): + ccks = region.get("%s_data" % site_name, expiration_time=15552000) # 6m + if ccks != NO_VALUE: + cookies, user_agent = ccks + logger.debug("%s: Re-using previous user agent: %s", site_name.capitalize(), user_agent) + session.headers["User-Agent"] = user_agent + try: + session.cookies._cookies.update(cookies) + return callback(region) + except: + return False + return False + + +def store_verification(site_name, session): + region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"])) diff --git 
a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index 51913d887..2d556d877 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -4,18 +4,17 @@ import re import datetime import subliminal import time + from random import randint -from dogpile.cache.api import NO_VALUE from requests import Session - -from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError +from subliminal.cache import region +from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError from subliminal.providers.addic7ed import Addic7edProvider as _Addic7edProvider, \ Addic7edSubtitle as _Addic7edSubtitle, ParserBeautifulSoup, show_cells_re -from subliminal.cache import region from subliminal.subtitle import fix_line_ending from subliminal_patch.utils import sanitize from subliminal_patch.exceptions import TooManyRequests - +from subliminal_patch.pitcher import pitchers, load_verification, store_verification from subzero.language import Language logger = logging.getLogger(__name__) @@ -64,6 +63,7 @@ class Addic7edProvider(_Addic7edProvider): USE_ADDICTED_RANDOM_AGENTS = False hearing_impaired_verifiable = True subtitle_class = Addic7edSubtitle + server_url = 'https://www.addic7ed.com/' sanitize_characters = {'-', ':', '(', ')', '.', '/'} @@ -75,45 +75,76 @@ class Addic7edProvider(_Addic7edProvider): self.session = Session() self.session.headers['User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__ - if self.USE_ADDICTED_RANDOM_AGENTS: - from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST - logger.debug("Addic7ed: using random user agents") - self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] - self.session.headers['Referer'] = self.server_url + from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST + logger.debug("Addic7ed: using random user agents") + self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers['Referer'] = self.server_url # login if self.username and self.password: - ccks = region.get("addic7ed_cookies", expiration_time=86400) - if ccks != NO_VALUE: - try: - self.session.cookies._cookies.update(ccks) - r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10) - if r.status_code == 302: - logger.info('Addic7ed: Login expired') - region.delete("addic7ed_cookies") - else: - logger.info('Addic7ed: Reusing old login') - self.logged_in = True - return - except: - pass + def check_verification(cache_region): + rr = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10, + headers={"Referer": self.server_url}) + if rr.status_code == 302: + logger.info('Addic7ed: Login expired') + cache_region.delete("addic7ed_data") + else: + logger.info('Addic7ed: Re-using old login') + self.logged_in = True + return True + + if load_verification("addic7ed", self.session, callback=check_verification): + return logger.info('Addic7ed: Logging in') - data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} - r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, - headers={"Referer": self.server_url + "login.php"}) + data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 'url': '', + 'remember': 'true'} + + tries = 0 + while tries < 3: + r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": 
self.server_url}) + if "grecaptcha" in r.content: + logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only ' + 'happen once every so often') + + site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1) + if not site_key: + logger.error("Addic7ed: Captcha site-key not found!") + return - if "relax, slow down" in r.content: - raise TooManyRequests(self.username) + pitcher = pitchers.get_pitcher()("Addic7ed", self.server_url + 'login.php', site_key, + user_agent=self.session.headers["User-Agent"], + cookies=self.session.cookies.get_dict(), + is_invisible=True) - if r.status_code != 302: - raise AuthenticationError(self.username) + result = pitcher.throw() + if not result: + raise Exception("Addic7ed: Couldn't solve captcha!") + + data["recaptcha_response"] = result + + r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, + headers={"Referer": self.server_url + "login.php"}) + + if "relax, slow down" in r.content: + raise TooManyRequests(self.username) + + if r.status_code != 302: + if "User doesn't exist" in r.content and tries <= 2: + logger.info("Addic7ed: Error, trying again. (%s/%s)", tries+1, 3) + tries += 1 + continue + + raise AuthenticationError(self.username) + break - region.set("addic7ed_cookies", self.session.cookies._cookies) + store_verification("addic7ed", self.session) logger.debug('Addic7ed: Logged in') self.logged_in = True + def terminate(self): + self.session.close() @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) def _get_show_ids(self): @@ -140,7 +171,7 @@ class Addic7edProvider(_Addic7edProvider): # populate the show ids show_ids = {} - for show in soup.select('td.version > h3 > a[href^="/show/"]'): + for show in soup.select('td > h3 > a[href^="/show/"]'): show_clean = sanitize(show.text, default_characters=self.sanitize_characters) try: show_id = int(show['href'][6:]) diff --git a/libs/subliminal_patch/providers/opensubtitles.py b/libs/subliminal_patch/providers/opensubtitles.py index 032b89058..4ce3aacea 100644 --- a/libs/subliminal_patch/providers/opensubtitles.py +++ b/libs/subliminal_patch/providers/opensubtitles.py @@ -11,8 +11,8 @@ from babelfish import language_converters from dogpile.cache.api import NO_VALUE from subliminal.exceptions import ConfigurationError, ServiceUnavailable from subliminal.providers.opensubtitles import OpenSubtitlesProvider as _OpenSubtitlesProvider,\ - OpenSubtitlesSubtitle as _OpenSubtitlesSubtitle, Episode, ServerProxy, Unauthorized, NoSession, \ - DownloadLimitReached, InvalidImdbid, UnknownUserAgent, DisabledUserAgent, OpenSubtitlesError + OpenSubtitlesSubtitle as _OpenSubtitlesSubtitle, Episode, Movie, ServerProxy, Unauthorized, NoSession, \ + DownloadLimitReached, InvalidImdbid, UnknownUserAgent, DisabledUserAgent, OpenSubtitlesError, sanitize from mixins import ProviderRetryMixin from subliminal.subtitle import fix_line_ending from subliminal_patch.http import SubZeroRequestsTransport @@ -45,6 +45,19 @@ class OpenSubtitlesSubtitle(_OpenSubtitlesSubtitle): def get_matches(self, video, hearing_impaired=False): matches = super(OpenSubtitlesSubtitle, self).get_matches(video) + # episode + if isinstance(video, Episode) and self.movie_kind == 'episode': + # series + if video.series and (sanitize(self.series_name) in ( + sanitize(name) for name in [video.series] + video.alternative_series)): + matches.add('series') + # movie + elif isinstance(video, Movie) and self.movie_kind == 'movie': + # title + if video.title and 
(sanitize(self.movie_name) in ( + sanitize(name) for name in [video.title] + video.alternative_titles)): + matches.add('title') + sub_fps = None try: sub_fps = float(self.fps) @@ -205,19 +218,19 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): season = episode = None if isinstance(video, Episode): - query = video.series + query = [video.series] + video.alternative_series season = video.season episode = episode = min(video.episode) if isinstance(video.episode, list) else video.episode if video.is_special: season = None episode = None - query = u"%s %s" % (video.series, video.title) + query = [u"%s %s" % (series, video.title) for series in [video.series] + video.alternative_series] logger.info("%s: Searching for special: %r", self.__class__, query) # elif ('opensubtitles' not in video.hashes or not video.size) and not video.imdb_id: # query = video.name.split(os.sep)[-1] else: - query = video.title + query = [video.title] + video.alternative_titles return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id, query=query, season=season, episode=episode, tag=video.original_name, @@ -238,9 +251,11 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): else: criteria.append({'imdbid': imdb_id[2:]}) if query and season and episode: - criteria.append({'query': query.replace('\'', ''), 'season': season, 'episode': episode}) + for q in query: + criteria.append({'query': q.replace('\'', ''), 'season': season, 'episode': episode}) elif query: - criteria.append({'query': query.replace('\'', '')}) + for q in query: + criteria.append({'query': q.replace('\'', '')}) if not criteria: raise ValueError('Not enough information') diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 38a97c579..d6a294cdb 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -5,6 +5,7 @@ import logging import os import time import inflect +import cfscrape from random import randint from zipfile import ZipFile @@ -12,7 +13,9 @@ from zipfile import ZipFile from babelfish import language_converters from guessit import guessit from requests import Session +from dogpile.cache.api import NO_VALUE from subliminal import Episode, ProviderError +from subliminal.cache import region from subliminal.utils import sanitize_release_group from subliminal_patch.providers import Provider from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin @@ -125,6 +128,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): self.session = Session() from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] + self.session.headers['Referer'] = "https://subscene.com" def terminate(self): logger.info("Closing session") @@ -198,43 +202,48 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): subtitles = [] logger.debug(u"Searching for: %s", vfn) film = search(vfn, session=self.session) + if film and film.subtitles: logger.debug('Release results found: %s', len(film.subtitles)) subtitles = self.parse_results(video, film) else: logger.debug('No release results found') + time.sleep(self.search_throttle) + # re-search for episodes without explicit release name if isinstance(video, Episode): #term = u"%s S%02iE%02i" % (video.series, video.season, video.episode) - term = u"%s - %s Season" % (video.series, p.number_to_words("%sth" % 
video.season).capitalize()) - time.sleep(self.search_throttle) - logger.debug('Searching for alternative results: %s', term) - film = search(term, session=self.session, release=False) - if film and film.subtitles: - logger.debug('Alternative results found: %s', len(film.subtitles)) - subtitles += self.parse_results(video, film) - else: - logger.debug('No alternative results found') - - # packs - if video.season_fully_aired: - term = u"%s S%02i" % (video.series, video.season) - logger.debug('Searching for packs: %s', term) + for series in [video.series] + video.alternative_series: + term = u"%s - %s Season" % (series, p.number_to_words("%sth" % video.season).capitalize()) time.sleep(self.search_throttle) - film = search(term, session=self.session) + logger.debug('Searching for alternative results: %s', term) + film = search(term, session=self.session, release=False) if film and film.subtitles: - logger.debug('Pack results found: %s', len(film.subtitles)) + logger.debug('Alternative results found: %s', len(film.subtitles)) subtitles += self.parse_results(video, film) else: - logger.debug('No pack results found') - else: - logger.debug("Not searching for packs, because the season hasn't fully aired") + logger.debug('No alternative results found') + + # packs + if video.season_fully_aired: + term = u"%s S%02i" % (series, video.season) + logger.debug('Searching for packs: %s', term) + time.sleep(self.search_throttle) + film = search(term, session=self.session) + if film and film.subtitles: + logger.debug('Pack results found: %s', len(film.subtitles)) + subtitles += self.parse_results(video, film) + else: + logger.debug('No pack results found') + else: + logger.debug("Not searching for packs, because the season hasn't fully aired") else: - logger.debug('Searching for movie results: %s', video.title) - film = search(video.title, year=video.year, session=self.session, limit_to=None, release=False) - if film and film.subtitles: - subtitles += self.parse_results(video, film) + for title in [video.title] + video.alternative_titles: + logger.debug('Searching for movie results: %s', title) + film = search(title, year=video.year, session=self.session, limit_to=None, release=False) + if film and film.subtitles: + subtitles += self.parse_results(video, film) logger.info("%s subtitles found" % len(subtitles)) return subtitles diff --git a/libs/subliminal_patch/providers/subssabbz.py b/libs/subliminal_patch/providers/subssabbz.py index 17df5b975..ddcd47a7b 100644 --- a/libs/subliminal_patch/providers/subssabbz.py +++ b/libs/subliminal_patch/providers/subssabbz.py @@ -26,16 +26,22 @@ class SubsSabBzSubtitle(Subtitle): """SubsSabBz Subtitle.""" provider_name = 'subssabbz' - def __init__(self, langauge, filename, type): + def __init__(self, langauge, filename, type, video, link): super(SubsSabBzSubtitle, self).__init__(langauge) self.langauge = langauge self.filename = filename + self.page_link = link self.type = type + self.video = video @property def id(self): return self.filename + def make_picklable(self): + self.content = None + return self + def get_matches(self, video): matches = set() @@ -118,7 +124,7 @@ class SubsSabBzProvider(Provider): for row in rows[:10]: a_element_wrapper = row.find('td', { 'class': 'c2field' }) if a_element_wrapper: - element = row.find('a') + element = a_element_wrapper.find('a') if element: link = element.get('href') logger.info('Found subtitle link %r', link) @@ -130,15 +136,22 @@ class SubsSabBzProvider(Provider): return [s for l in languages for s in self.query(l, video)] 
def download_subtitle(self, subtitle): - pass + if subtitle.content: + pass + else: + seeking_subtitle_file = subtitle.filename + arch = self.download_archive_and_add_subtitle_files(subtitle.page_link, subtitle.language, subtitle.video) + for s in arch: + if s.filename == seeking_subtitle_file: + subtitle.content = s.content - def process_archive_subtitle_files(self, archiveStream, language, video): + def process_archive_subtitle_files(self, archiveStream, language, video, link): subtitles = [] type = 'episode' if isinstance(video, Episode) else 'movie' for file_name in archiveStream.namelist(): if file_name.lower().endswith(('.srt', '.sub')): logger.info('Found subtitle file %r', file_name) - subtitle = SubsSabBzSubtitle(language, file_name, type) + subtitle = SubsSabBzSubtitle(language, file_name, type, video, link) subtitle.content = archiveStream.read(file_name) subtitles.append(subtitle) return subtitles @@ -152,8 +165,8 @@ class SubsSabBzProvider(Provider): archive_stream = io.BytesIO(request.content) if is_rarfile(archive_stream): - return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) + return self.process_archive_subtitle_files( RarFile(archive_stream), language, video, link ) elif is_zipfile(archive_stream): - return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) + return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video, link ) else: raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/subsunacs.py b/libs/subliminal_patch/providers/subsunacs.py index bbc41f520..d616901eb 100644 --- a/libs/subliminal_patch/providers/subsunacs.py +++ b/libs/subliminal_patch/providers/subsunacs.py @@ -26,19 +26,25 @@ class SubsUnacsSubtitle(Subtitle): """SubsUnacs Subtitle.""" provider_name = 'subsunacs' - def __init__(self, langauge, filename, type): + def __init__(self, langauge, filename, type, video, link): super(SubsUnacsSubtitle, self).__init__(langauge) self.langauge = langauge self.filename = filename + self.page_link = link self.type = type + self.video = video @property def id(self): return self.filename + def make_picklable(self): + self.content = None + return self + def get_matches(self, video): matches = set() - + video_filename = video.name video_filename = os.path.basename(video_filename) video_filename, _ = os.path.splitext(video_filename) @@ -77,11 +83,11 @@ class SubsUnacsProvider(Provider): def terminate(self): self.session.close() - + def query(self, language, video): subtitles = [] isEpisode = isinstance(video, Episode) - + params = { 'm': '', 'l': 0, @@ -117,7 +123,7 @@ class SubsUnacsProvider(Provider): soup = BeautifulSoup(response.content, 'html.parser') rows = soup.findAll('td', {'class': 'tdMovie'}) - + # Search on first 10 rows only for row in rows[:10]: element = row.find('a', {'class': 'tooltip'}) @@ -125,37 +131,44 @@ class SubsUnacsProvider(Provider): link = element.get('href') logger.info('Found subtitle link %r', link) subtitles = subtitles + self.download_archive_and_add_subtitle_files('https://subsunacs.net' + link, language, video) - + return subtitles def list_subtitles(self, video, languages): return [s for l in languages for s in self.query(l, video)] def download_subtitle(self, subtitle): - pass - - def process_archive_subtitle_files(self, archiveStream, language, video): + if subtitle.content: + pass + else: + seeking_subtitle_file = subtitle.filename + arch = self.download_archive_and_add_subtitle_files(subtitle.page_link, 
subtitle.language, subtitle.video) + for s in arch: + if s.filename == seeking_subtitle_file: + subtitle.content = s.content + + def process_archive_subtitle_files(self, archiveStream, language, video, link): subtitles = [] type = 'episode' if isinstance(video, Episode) else 'movie' for file_name in archiveStream.namelist(): if file_name.lower().endswith(('.srt', '.sub')): logger.info('Found subtitle file %r', file_name) - subtitle = SubsUnacsSubtitle(language, file_name, type) + subtitle = SubsUnacsSubtitle(language, file_name, type, video, link) subtitle.content = archiveStream.read(file_name) subtitles.append(subtitle) return subtitles - + def download_archive_and_add_subtitle_files(self, link, language, video ): logger.info('Downloading subtitle %r', link) request = self.session.get(link, headers={ - 'Referer': 'https://subsunacs.net/search.php' + 'Referer': 'https://subsunacs.net/search.php' }) request.raise_for_status() archive_stream = io.BytesIO(request.content) if is_rarfile(archive_stream): - return self.process_archive_subtitle_files( RarFile(archive_stream), language, video ) + return self.process_archive_subtitle_files( RarFile(archive_stream), language, video, link ) elif is_zipfile(archive_stream): - return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video ) + return self.process_archive_subtitle_files( ZipFile(archive_stream), language, video, link ) else: raise ValueError('Not a valid archive') diff --git a/libs/subliminal_patch/providers/titlovi.py b/libs/subliminal_patch/providers/titlovi.py index ec339fef8..860932ca5 100644 --- a/libs/subliminal_patch/providers/titlovi.py +++ b/libs/subliminal_patch/providers/titlovi.py @@ -4,6 +4,7 @@ import io import logging import math import re +import time import rarfile @@ -23,6 +24,7 @@ from subliminal.utils import sanitize_release_group from subliminal.subtitle import guess_matches from subliminal.video import Episode, Movie from subliminal.subtitle import fix_line_ending +from subliminal_patch.pitcher import pitchers, load_verification, store_verification from subzero.language import Language from random import randint @@ -142,6 +144,7 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): logger.debug('User-Agent set to %s', self.session.headers['User-Agent']) self.session.headers['Referer'] = self.server_url logger.debug('Referer set to %s', self.session.headers['Referer']) + load_verification("titlovi", self.session) def terminate(self): self.session.close() @@ -182,110 +185,144 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): r = self.session.get(self.search_url, params=params, timeout=10) r.raise_for_status() except RequestException as e: - logger.exception('RequestException %s', e) - break - - try: - soup = BeautifulSoup(r.content, 'lxml') - - # number of results - result_count = int(soup.select_one('.results_count b').string) - except: - result_count = None - - # exit if no results - if not result_count: - if not subtitles: - logger.debug('No subtitles found') - else: - logger.debug("No more subtitles found") - break - - # number of pages with results - pages = int(math.ceil(result_count / float(items_per_page))) - - # get current page - if 'pg' in params: - current_page = int(params['pg']) - - try: - sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit') - for sub in sublist: - # subtitle id - sid = sub.find(attrs={'data-id': True}).attrs['data-id'] - # get download link - download_link = self.download_url + sid - # title and alternate 
title - match = title_re.search(sub.a.string) - if match: - _title = match.group('title') - alt_title = match.group('altitle') + captcha_passed = False + if e.response.status_code == 403 and "data-sitekey" in e.response.content: + logger.info('titlovi: Solving captcha. This might take a couple of minutes, but should only ' + 'happen once every so often') + + site_key = re.search(r'data-sitekey="(.+?)"', e.response.content).group(1) + challenge_s = re.search(r'type="hidden" name="s" value="(.+?)"', e.response.content).group(1) + challenge_ray = re.search(r'data-ray="(.+?)"', e.response.content).group(1) + if not all([site_key, challenge_s, challenge_ray]): + raise Exception("titlovi: Captcha site-key not found!") + + pitcher = pitchers.get_pitcher()("titlovi", e.request.url, site_key, + user_agent=self.session.headers["User-Agent"], + cookies=self.session.cookies.get_dict(), + is_invisible=True) + + result = pitcher.throw() + if not result: + raise Exception("titlovi: Couldn't solve captcha!") + + s_params = { + "s": challenge_s, + "id": challenge_ray, + "g-recaptcha-response": result, + } + r = self.session.get(self.server_url + "/cdn-cgi/l/chk_captcha", params=s_params, timeout=10, + allow_redirects=False) + r.raise_for_status() + r = self.session.get(self.search_url, params=params, timeout=10) + r.raise_for_status() + store_verification("titlovi", self.session) + captcha_passed = True + + if not captcha_passed: + logger.exception('RequestException %s', e) + break + else: + try: + soup = BeautifulSoup(r.content, 'lxml') + + # number of results + result_count = int(soup.select_one('.results_count b').string) + except: + result_count = None + + # exit if no results + if not result_count: + if not subtitles: + logger.debug('No subtitles found') else: - continue - - # page link - page_link = self.server_url + sub.a.attrs['href'] - # subtitle language - match = lang_re.search(sub.select_one('.lang').attrs['src']) - if match: - try: - # decode language - lang = Language.fromtitlovi(match.group('lang')+match.group('script')) - except ValueError: + logger.debug("No more subtitles found") + break + + # number of pages with results + pages = int(math.ceil(result_count / float(items_per_page))) + + # get current page + if 'pg' in params: + current_page = int(params['pg']) + + try: + sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit') + for sub in sublist: + # subtitle id + sid = sub.find(attrs={'data-id': True}).attrs['data-id'] + # get download link + download_link = self.download_url + sid + # title and alternate title + match = title_re.search(sub.a.string) + if match: + _title = match.group('title') + alt_title = match.group('altitle') + else: continue - # relase year or series start year - match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string) - if match: - r_year = int(match.group('year')) - # fps - match = fps_re.search(sub.select_one('.fps').string) - if match: - fps = match.group('fps') - # releases - releases = str(sub.select_one('.fps').parent.contents[0].string) - - # handle movies and series separately - if is_episode: - # season and episode info - sxe = sub.select_one('.s0xe0y').string - r_season = None - r_episode = None - if sxe: - match = season_re.search(sxe) - if match: - r_season = int(match.group('season')) - match = episode_re.search(sxe) - if match: - r_episode = int(match.group('episode')) - - subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, - alt_title=alt_title, season=r_season, 
episode=r_episode, - year=r_year, fps=fps, - asked_for_release_group=video.release_group, - asked_for_episode=episode) - else: - subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, - alt_title=alt_title, year=r_year, fps=fps, - asked_for_release_group=video.release_group) - logger.debug('Found subtitle %r', subtitle) - - # prime our matches so we can use the values later - subtitle.get_matches(video) - - # add found subtitles - subtitles.append(subtitle) - - finally: - soup.decompose() - - # stop on last page - if current_page >= pages: - break - - # increment current page - params['pg'] = current_page + 1 - logger.debug('Getting page %d', params['pg']) + # page link + page_link = self.server_url + sub.a.attrs['href'] + # subtitle language + match = lang_re.search(sub.select_one('.lang').attrs['src']) + if match: + try: + # decode language + lang = Language.fromtitlovi(match.group('lang')+match.group('script')) + except ValueError: + continue + + # relase year or series start year + match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string) + if match: + r_year = int(match.group('year')) + # fps + match = fps_re.search(sub.select_one('.fps').string) + if match: + fps = match.group('fps') + # releases + releases = str(sub.select_one('.fps').parent.contents[0].string) + + # handle movies and series separately + if is_episode: + # season and episode info + sxe = sub.select_one('.s0xe0y').string + r_season = None + r_episode = None + if sxe: + match = season_re.search(sxe) + if match: + r_season = int(match.group('season')) + match = episode_re.search(sxe) + if match: + r_episode = int(match.group('episode')) + + subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, + alt_title=alt_title, season=r_season, episode=r_episode, + year=r_year, fps=fps, + asked_for_release_group=video.release_group, + asked_for_episode=episode) + else: + subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title, + alt_title=alt_title, year=r_year, fps=fps, + asked_for_release_group=video.release_group) + logger.debug('Found subtitle %r', subtitle) + + # prime our matches so we can use the values later + subtitle.get_matches(video) + + # add found subtitles + subtitles.append(subtitle) + + finally: + soup.decompose() + + # stop on last page + if current_page >= pages: + break + + # increment current page + params['pg'] = current_page + 1 + logger.debug('Getting page %d', params['pg']) return subtitles diff --git a/libs/subliminal_patch/providers/zimuku.py b/libs/subliminal_patch/providers/zimuku.py new file mode 100644 index 000000000..5090816f8 --- /dev/null +++ b/libs/subliminal_patch/providers/zimuku.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- +import io +import logging +import os +import zipfile + +import rarfile +from subzero.language import Language +from guessit import guessit +from requests import Session +from six import text_type + +from subliminal import __short_version__ +from subliminal.providers import ParserBeautifulSoup, Provider +from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches +from subliminal.video import Episode, Movie + +logger = logging.getLogger(__name__) + + +class ZimukuSubtitle(Subtitle): + """Zimuku Subtitle.""" + provider_name = 'zimuku' + + def __init__(self, language, page_link, version, download_link): + super(ZimukuSubtitle, self).__init__(language, page_link=page_link) + self.version = version + self.download_link = download_link + 
self.hearing_impaired = None + self.encoding = 'utf-8' + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + # episode + if isinstance(video, Episode): + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True) + # movie + elif isinstance(video, Movie): + # other properties + matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True) + + return matches + + +class ZimukuProvider(Provider): + """Zimuku Provider.""" + languages = {Language(l) for l in ['zho', 'eng']} + + server_url = 'http://www.zimuku.la' + search_url = '/search?q={}' + download_url = 'http://www.zimuku.la/' + + UserAgent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)' + + subtitle_class = ZimukuSubtitle + + def __init__(self): + self.session = None + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + def terminate(self): + self.session.close() + + def query(self, keyword, season=None, episode=None, year=None): + params = keyword + if season and episode: + params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode) + elif year: + params += ' {:4d}'.format(year) + + logger.debug('Searching subtitles %r', params) + subtitles = [] + search_link = self.server_url + text_type(self.search_url).format(params) + + r = self.session.get(search_link, timeout=30) + r.raise_for_status() + + if not r.content: + logger.debug('No data returned from provider') + return [] + + soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) + + for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'): + moviename = entity.text + entity_url = self.server_url + entity['href'] + logger.debug(entity_url) + r = self.session.get(entity_url, timeout=30) + r.raise_for_status() + logger.debug('looking into ' + entity_url) + + soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("div", class_="subs box clearfix") + # loop over subtitles cells + + subs = soup.tbody.find_all("tr") + for sub in subs: + page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8')) + version = sub.a.text.encode('utf-8') or None + if version is None: + version = "" + try: + td = sub.find("td", class_="tac lang") + r2 = td.find_all("img") + langs = [x.get('title').encode('utf-8') for x in r2] + except: + langs = '未知' + name = '%s (%s)' % (version, ",".join(langs)) + + if ('English' in langs) and not(('简体中文' in langs) or ('繁體中文' in langs)): + language = Language('eng') + else: + language = Language('zho') + # read the item + subtitle = self.subtitle_class(language, page_link, version, page_link.replace("detail","dld")) + + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return subtitles + + def list_subtitles(self, video, languages): + if isinstance(video, Episode): + titles = [video.series] + video.alternative_series + elif isinstance(video, Movie): + titles = [video.title] + video.alternative_titles + else: + titles = [] + + subtitles = [] + # query for subtitles with the show_id + for title in titles: + if isinstance(video, Episode): + subtitles += [s for s in self.query(title, season=video.season, episode=video.episode, + year=video.year) + if s.language in languages] + elif isinstance(video, Movie): + subtitles += [s for s in self.query(title, year=video.year) + if s.language in languages] + 
+ return subtitles + + def download_subtitle(self, subtitle): + if isinstance(subtitle, ZimukuSubtitle): + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link}, + timeout=30) + r.raise_for_status() + + if not r.content: + logger.debug('Unable to download subtitle. No data returned from provider') + return + + soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']) + links = soup.find("div", {"class":"clearfix"}).find_all('a') + # TODO: add settings for choice + + for down_link in links: + url = down_link.get('href').encode('utf-8') + url = self.server_url + url + r = self.session.get(url, headers={'Referer': subtitle.download_link}, + timeout=30) + r.raise_for_status() + + if len(r.content) > 1024: + break + + archive_stream = io.BytesIO(r.content) + archive = None + if rarfile.is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive = rarfile.RarFile(archive_stream) + subtitle_content = _get_subtitle_from_archive(archive) + elif zipfile.is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive = zipfile.ZipFile(archive_stream) + subtitle_content = _get_subtitle_from_archive(archive) + else: + subtitle_content = r.content + + if subtitle_content: + subtitle.content = fix_line_ending(subtitle_content) + else: + logger.debug('Could not extract subtitle from %r', archive) + + +def _get_subtitle_from_archive(archive): + for name in archive.namelist(): + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + return archive.read(name) + + return None diff --git a/libs/subliminal_patch/refiners/omdb.py b/libs/subliminal_patch/refiners/omdb.py index 9ecb5155b..bef212f75 100644 --- a/libs/subliminal_patch/refiners/omdb.py +++ b/libs/subliminal_patch/refiners/omdb.py @@ -4,7 +4,7 @@ import subliminal import base64 import zlib from subliminal import __short_version__ -from subliminal.refiners.omdb import OMDBClient, refine +from subliminal.refiners.omdb import OMDBClient, refine as refine_orig, Episode, Movie class SZOMDBClient(OMDBClient): @@ -63,5 +63,13 @@ class SZOMDBClient(OMDBClient): return j +def refine(video, **kwargs): + refine_orig(video, **kwargs) + if isinstance(video, Episode) and video.series_imdb_id: + video.series_imdb_id = video.series_imdb_id.strip() + elif isinstance(video, Movie) and video.imdb_id: + video.imdb_id = video.imdb_id.strip() + + omdb_client = SZOMDBClient(headers={'User-Agent': 'Subliminal/%s' % __short_version__}) subliminal.refiners.omdb.omdb_client = omdb_client diff --git a/libs/subliminal_patch/subtitle.py b/libs/subliminal_patch/subtitle.py index 9a165fe4b..69a3c1e5b 100644 --- a/libs/subliminal_patch/subtitle.py +++ b/libs/subliminal_patch/subtitle.py @@ -38,6 +38,8 @@ class Subtitle(Subtitle_): plex_media_fps = None skip_wrong_fps = False wrong_fps = False + wrong_series = False + wrong_season_ep = False is_pack = False asked_for_release_group = None asked_for_episode = None @@ -356,7 +358,8 @@ def guess_matches(video, guess, partial=False): matches = set() if isinstance(video, Episode): # series - if video.series and 'title' in guess and sanitize(guess['title']) == sanitize(video.series): + if video.series and 'title' in guess and sanitize(guess['title']) in ( + sanitize(name) for name in [video.series] + video.alternative_series): matches.add('series') 
# title if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title): @@ -384,7 +387,8 @@ def guess_matches(video, guess, partial=False): if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') # title - if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title): + if video.title and 'title' in guess and sanitize(guess['title']) in ( + sanitize(name) for name in [video.title] + video.alternative_titles): matches.add('title') # release_group diff --git a/views/settings.tpl b/views/settings.tpl index 29072aeff..430cfc253 100644 --- a/views/settings.tpl +++ b/views/settings.tpl @@ -1228,12 +1228,104 @@
+ [settings.tpl hunk body: the HTML markup was lost in extraction; only bare "+" markers and stray text nodes survive. Recoverable content: this hunk inserts an "Anti-captcha options" block into the settings form. Matching the fields read by save_settings() in bazarr/main.py, it holds a provider dropdown posted as settings_anti_captcha_provider (None / Anti-Captcha / DeathByCaptcha), a text input posted as settings_anti_captcha_key for the Anti-Captcha account key, and username/password inputs posted as settings_death_by_captcha_username and settings_death_by_captcha_password, followed by the existing "Subtitles providers" heading, where one line is also changed.]
@@ -1703,7 +1795,7 @@
[hunk body lost in extraction: one line is replaced under the "Subtitles providers" section; the exact markup is not recoverable.]
@@ -1772,6 +1864,28 @@
[hunk body lost in extraction: 22 lines are added, presumably provider-specific settings markup for the providers touched in this changeset; the exact markup is not recoverable.]
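
The functional core of this changeset is the pitcher registry in libs/subliminal_patch/pitcher.py: providers no longer hard-code a captcha service, they ask the registry for whichever class the ANTICAPTCHA_CLASS environment variable names (exported from the new settings by bazarr/init.py and by save_settings()) and call throw() on an instance of it. Below is a minimal consumer sketch, not part of the diff: solve_recaptcha, "ExampleSite" and the page URL/site key arguments are illustrative placeholders, but the call pattern is the one used by the addic7ed.py and titlovi.py hunks above.

    # Hypothetical helper mirroring the usage in addic7ed.py / titlovi.py.
    # `session` is the provider's requests.Session; its user agent and
    # cookies are forwarded so the solving service can replay them.
    from subliminal_patch.pitcher import pitchers

    def solve_recaptcha(session, page_url, site_key):
        # get_pitcher() resolves the class registered under the name (or
        # source site) held in ANTICAPTCHA_CLASS, e.g. AntiCaptchaProxyLess,
        # and raises if the variable is empty or names no registered pitcher.
        pitcher_cls = pitchers.get_pitcher()
        pitcher = pitcher_cls("ExampleSite", page_url, site_key,
                              user_agent=session.headers["User-Agent"],
                              cookies=session.cookies.get_dict(),
                              is_invisible=True)
        # throw() submits the task, waits for the service to solve it, and
        # returns the g-recaptcha response token, or None once the pitcher's
        # retries are exhausted.
        return pitcher.throw()

Once a login or challenge succeeds, load_verification()/store_verification() persist the session's cookies and user agent in the dogpile cache region for 15552000 seconds (roughly six months), so a given site should only need its captcha solved once every so often rather than on every provider initialization.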