commit
d49a04ce15
@ -0,0 +1,279 @@
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
import re
|
||||
|
||||
# based off of https://gist.github.com/doko-desuka/58d9212461f62583f8df9bc6387fade2
|
||||
# and https://github.com/Anorov/cloudflare-scrape
|
||||
# and https://github.com/VeNoMouS/cloudflare-scrape-js2py
|
||||
|
||||
'''''''''
|
||||
Disables InsecureRequestWarning: Unverified HTTPS request is being made warnings.
|
||||
'''''''''
|
||||
import requests
|
||||
from requests.packages.urllib3.exceptions import InsecureRequestWarning
|
||||
|
||||
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
|
||||
''''''
|
||||
from requests.sessions import Session
|
||||
from copy import deepcopy
|
||||
|
||||
try:
|
||||
from urlparse import urlparse
|
||||
except ImportError:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
DEFAULT_USER_AGENTS = [
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36",
|
||||
"Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36",
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
|
||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0",
|
||||
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
|
||||
]
|
||||
|
||||
DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
|
||||
|
||||
BUG_REPORT = (
|
||||
"Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a "
|
||||
"bug report at https://github.com/Anorov/cloudflare-scrape/issues.")
|
||||
|
||||
|
||||
class CloudflareScraper(Session):
    """requests.Session subclass that transparently detects and solves
    Cloudflare's legacy IUAM ("I'm Under Attack Mode") JavaScript challenge
    pages, computing the arithmetic answer without a JavaScript engine.
    """

    def __init__(self, *args, **kwargs):
        super(CloudflareScraper, self).__init__(*args, **kwargs)

        if "requests" in self.headers["User-Agent"]:
            # Spoof Firefox on Linux if no custom User-Agent has been set
            self.headers["User-Agent"] = random.choice(DEFAULT_USER_AGENTS)

    def request(self, method, url, *args, **kwargs):
        """Issue the request normally; if the response looks like a
        Cloudflare challenge page, solve it and return the real page."""
        resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

        # Check if Cloudflare anti-bot is on
        if (resp.status_code in (503, 429)
            and resp.headers.get("Server", "").startswith("cloudflare")
            and b"jschl_vc" in resp.content
            and b"jschl_answer" in resp.content
        ):
            return self.solve_cf_challenge(resp, **kwargs)

        # Otherwise, no Cloudflare anti-bot detected
        return resp

    def solve_cf_challenge(self, resp, **original_kwargs):
        """Parse the challenge page in `resp`, evaluate the obfuscated
        JavaScript arithmetic in pure Python, submit the answer to
        /cdn-cgi/l/chk_jschl, and return the originally requested page.

        Raises (re-raises) any parsing exception after logging it; parse
        failures usually mean Cloudflare changed the challenge format.
        """
        body = resp.text
        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc
        submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)

        # Re-use the caller's kwargs so cookies/proxies/etc. carry over.
        cloudflare_kwargs = deepcopy(original_kwargs)
        params = cloudflare_kwargs.setdefault("params", {})
        headers = cloudflare_kwargs.setdefault("headers", {})
        headers["Referer"] = resp.url

        try:
            # Delay (in ms) that the page's setTimeout waits before submitting.
            cf_delay = float(re.search('submit.*?(\d+)', body, re.DOTALL).group(1)) / 1000.0

            form_index = body.find('id="challenge-form"')
            if form_index == -1:
                raise Exception('CF form not found')
            # Restrict further searches to the form itself.
            sub_body = body[form_index:]

            s_match = re.search('name="s" value="(.+?)"', sub_body)
            if s_match:
                params["s"] = s_match.group(1)  # On older variants this parameter is absent.
            params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', sub_body).group(1)
            params["pass"] = re.search(r'name="pass" value="(.+?)"', sub_body).group(1)

            # Some variants hide an extra operand expression in a
            # <div id="cf-dn-..."> element outside the script.
            if body.find('id="cf-dn-', form_index) != -1:
                extra_div_expression = re.search('id="cf-dn-.*?>(.+?)<', sub_body).group(1)

            # Initial value.
            js_answer = self.cf_parse_expression(
                re.search('setTimeout\(function\(.*?:(.*?)}', body, re.DOTALL).group(1)
            )
            # Extract the arithmetic operations.
            builder = re.search("challenge-form'\);\s*;(.*);a.value", body, re.DOTALL).group(1)
            # Remove a function semicolon before splitting on semicolons, else it messes the order.
            lines = builder.replace(' return +(p)}();', '', 1).split(';')

            for line in lines:
                if len(line) and '=' in line:
                    # heading ends with the compound-assignment operator,
                    # e.g. the '+' of "a+=...".
                    heading, expression = line.split('=', 1)
                    if 'eval(eval(atob' in expression:
                        # Uses the expression in an external <div>.
                        expression_value = self.cf_parse_expression(extra_div_expression)
                    elif '(function(p' in expression:
                        # Expression + domain sampling function.
                        expression_value = self.cf_parse_expression(expression, domain)
                    else:
                        expression_value = self.cf_parse_expression(expression)
                    js_answer = self.cf_arithmetic_op(heading[-1], js_answer, expression_value)

            if '+ t.length' in body:
                js_answer += len(domain)  # Only older variants add the domain length.

            # Cloudflare compares the answer with 10 decimal places.
            params["jschl_answer"] = '%.10f' % js_answer

        except Exception as e:
            # Something is wrong with the page.
            # This may indicate Cloudflare has changed their anti-bot
            # technique. If you see this and are running the latest version,
            # please open a GitHub issue so I can update the code accordingly.
            logging.error("[!] %s Unable to parse Cloudflare anti-bots page. "
                          "Try upgrading cloudflare-scrape, or submit a bug report "
                          "if you are running the latest version. Please read "
                          "https://github.com/Anorov/cloudflare-scrape#updates "
                          "before submitting a bug report." % e)
            raise

        # Cloudflare requires a delay before solving the challenge.
        # Always wait the full delay + 1s because of 'time.sleep()' imprecision.
        time.sleep(cf_delay + 1.0)

        # Requests transforms any request into a GET after a redirect,
        # so the redirect has to be handled manually here to allow for
        # performing other types of requests even as the first request.
        method = resp.request.method
        cloudflare_kwargs["allow_redirects"] = False

        redirect = self.request(method, submit_url, **cloudflare_kwargs)

        if 'Location' in redirect.headers:
            redirect_location = urlparse(redirect.headers["Location"])
            # Relative Location header: rebuild an absolute URL.
            if not redirect_location.netloc:
                redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path)
                return self.request(method, redirect_url, **original_kwargs)
            return self.request(method, redirect.headers["Location"], **original_kwargs)
        else:
            return redirect

    def cf_sample_domain_function(self, func_expression, domain):
        """Evaluate the challenge's domain-sampling function: compute the
        index expression, then return the char code of `domain` at it."""
        parameter_start_index = func_expression.find('}(') + 2
        # Send the expression with the "+" char and enclosing parenthesis included, as they are
        # stripped inside ".cf_parse_expression()'.
        sample_index = self.cf_parse_expression(
            func_expression[parameter_start_index: func_expression.rfind(')))')]
        )
        return ord(domain[int(sample_index)])

    def cf_arithmetic_op(self, op, a, b):
        """Apply the challenge's compound-assignment operator `op` to the
        running answer `a` and the operand `b` (JS-style float division)."""
        if op == '+':
            return a + b
        elif op == '/':
            return a / float(b)
        elif op == '*':
            return a * float(b)
        elif op == '-':
            return a - b
        else:
            raise Exception('Unknown operation')

    def cf_parse_expression(self, expression, domain=None):
        """Evaluate one JSFuck-obfuscated numeric expression.

        When `domain` is given, the expression's divisor ends with a
        domain-sampling function handled by cf_sample_domain_function().
        """

        def _get_jsfuck_number(section):
            # Each '!+[]'/'+!![]' token is a 1, each '+[]' a 0; the '+'
            # separated groups are the decimal digits of the number.
            digit_expressions = section.replace('!+[]', '1').replace('+!![]', '1').replace('+[]', '0').split('+')
            return int(
                # Form a number string, with each digit as the sum of the values inside each parenthesis block.
                ''.join(
                    str(sum(int(digit_char) for digit_char in digit_expression[1:-1]))  # Strip the parenthesis.
                    for digit_expression in digit_expressions
                )
            )

        if '/' in expression:
            dividend, divisor = expression.split('/')
            dividend = dividend[2:-1]  # Strip the leading '+' char and the enclosing parenthesis.

            if domain:
                # 2019-04-02: At this moment, this extra domain sampling function always appears on the
                # divisor side, at the end.
                divisor_a, divisor_b = divisor.split('))+(')
                divisor_a = _get_jsfuck_number(divisor_a[5:])  # Left-strip the sequence of "(+(+(".
                divisor_b = self.cf_sample_domain_function(divisor_b, domain)
                return _get_jsfuck_number(dividend) / float(divisor_a + divisor_b)
            else:
                divisor = divisor[2:-1]
                return _get_jsfuck_number(dividend) / float(_get_jsfuck_number(divisor))
        else:
            return _get_jsfuck_number(expression[2:-1])

    @classmethod
    def create_scraper(cls, sess=None, **kwargs):
        """
        Convenience function for creating a ready-to-go requests.Session (subclass) object.
        """
        scraper = cls()

        if sess:
            # Copy relevant state from an existing session, if provided.
            attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
            for attr in attrs:
                val = getattr(sess, attr, None)
                if val:
                    setattr(scraper, attr, val)

        return scraper

    ## Functions for integrating cloudflare-scrape with other applications and scripts

    @classmethod
    def get_tokens(cls, url, user_agent=None, **kwargs):
        """Fetch `url` with a fresh scraper and return
        (clearance-cookie dict, User-Agent string) for reuse elsewhere."""
        scraper = cls.create_scraper()
        if user_agent:
            scraper.headers["User-Agent"] = user_agent

        try:
            resp = scraper.get(url, **kwargs)
            resp.raise_for_status()
        except Exception as e:
            logging.error("'%s' returned an error. Could not collect tokens." % url)
            raise

        domain = urlparse(resp.url).netloc
        cookie_domain = None

        for d in scraper.cookies.list_domains():
            # NOTE(review): `d in ("." + domain)` is a substring test, not an
            # equality test — presumably intended to match parent domains;
            # confirm before changing.
            if d.startswith(".") and d in ("." + domain):
                cookie_domain = d
                break
        else:
            raise ValueError(
                "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")

        return ({
            "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
            "cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain)
        },
            scraper.headers["User-Agent"]
        )

    def get_live_tokens(self, domain):
        """Like get_tokens(), but reads the clearance cookies already held by
        THIS session for `domain` instead of performing a new request."""
        for d in self.cookies.list_domains():
            # NOTE(review): substring test, same caveat as in get_tokens().
            if d.startswith(".") and d in ("." + domain):
                cookie_domain = d
                break
        else:
            raise ValueError(
                "Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")

        return ({
            "__cfduid": self.cookies.get("__cfduid", "", domain=cookie_domain),
            "cf_clearance": self.cookies.get("cf_clearance", "", domain=cookie_domain)
        },
            self.headers["User-Agent"]
        )

    @classmethod
    def get_cookie_string(cls, url, user_agent=None, **kwargs):
        """
        Convenience function for building a Cookie HTTP header value.
        """
        tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
        return "; ".join("=".join(pair) for pair in tokens.items()), user_agent
|
||||
|
||||
|
||||
# Module-level convenience aliases so callers can use these entry points
# without referencing the CloudflareScraper class directly.
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string
|
@ -0,0 +1,516 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Death by Captcha HTTP and socket API clients.
|
||||
|
||||
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
|
||||
socket ones. Both offer the same functionality, with the socket API
|
||||
sporting faster responses and using way less connections.
|
||||
|
||||
To access the socket API, use SocketClient class; for the HTTP API, use
|
||||
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
|
||||
connection opened and serializes all API requests sent through it, thus
|
||||
it is advised to keep a pool of them if your script is heavily
|
||||
multithreaded.
|
||||
|
||||
Both SocketClient and HttpClient give you the following methods:
|
||||
|
||||
get_user()
|
||||
Returns your DBC account details as a dict with the following keys:
|
||||
|
||||
"user": your account numeric ID; if login fails, it will be the only
|
||||
item with the value of 0;
|
||||
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
|
||||
solved CAPTCHA in US cents;
|
||||
"balance": your DBC account balance in US cents;
|
||||
"is_banned": flag indicating whether your account is suspended or not.
|
||||
|
||||
get_balance()
|
||||
Returns your DBC account balance in US cents.
|
||||
|
||||
get_captcha(cid)
|
||||
Returns an uploaded CAPTCHA details as a dict with the following keys:
|
||||
|
||||
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
|
||||
be the only item with the value of 0;
|
||||
"text": the CAPTCHA text, if solved, otherwise None;
|
||||
"is_correct": flag indicating whether the CAPTCHA was solved correctly
|
||||
(DBC can detect that in rare cases).
|
||||
|
||||
The only argument `cid` is the CAPTCHA numeric ID.
|
||||
|
||||
get_text(cid)
|
||||
Returns an uploaded CAPTCHA text (None if not solved). The only argument
|
||||
`cid` is the CAPTCHA numeric ID.
|
||||
|
||||
report(cid)
|
||||
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
|
||||
CAPTCHA numeric ID. Returns True on success, False otherwise.
|
||||
|
||||
upload(captcha)
|
||||
Uploads a CAPTCHA. The only argument `captcha` can be either file-like
|
||||
object (any object with `read` method defined, actually, so StringIO
|
||||
will do), or CAPTCHA image file name. On successful upload you'll get
|
||||
the CAPTCHA details dict (see get_captcha() method).
|
||||
|
||||
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
|
||||
to poll for its status periodically using get_captcha() or get_text()
|
||||
method until the CAPTCHA is solved and you get the text.
|
||||
|
||||
decode(captcha, timeout=DEFAULT_TIMEOUT)
|
||||
A convenient method that uploads a CAPTCHA and polls for its status
|
||||
periodically, but no longer than `timeout` (defaults to 60 seconds).
|
||||
If solved, you'll get the CAPTCHA details dict (see get_captcha()
|
||||
method for details). See upload() method for details on `captcha`
|
||||
argument.
|
||||
|
||||
Visit http://www.deathbycaptcha.com/user/api for updates.
|
||||
|
||||
"""
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import errno
|
||||
import imghdr
|
||||
import random
|
||||
import os
|
||||
import select
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib
|
||||
import urllib2
|
||||
try:
|
||||
from json import read as json_decode, write as json_encode
|
||||
except ImportError:
|
||||
try:
|
||||
from json import loads as json_decode, dumps as json_encode
|
||||
except ImportError:
|
||||
from simplejson import loads as json_decode, dumps as json_encode
|
||||
|
||||
|
||||
# API version and unique software ID
|
||||
API_VERSION = 'DBC/Python v4.6'
|
||||
|
||||
# Default CAPTCHA timeout and decode() polling interval
|
||||
DEFAULT_TIMEOUT = 60
|
||||
DEFAULT_TOKEN_TIMEOUT = 120
|
||||
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
|
||||
DFLT_POLL_INTERVAL = 3
|
||||
|
||||
# Base HTTP API url
|
||||
HTTP_BASE_URL = 'http://api.dbcapi.me/api'
|
||||
|
||||
# Preferred HTTP API server's response content type, do not change
|
||||
HTTP_RESPONSE_TYPE = 'application/json'
|
||||
|
||||
# Socket API server's host & ports range
|
||||
SOCKET_HOST = 'api.dbcapi.me'
|
||||
SOCKET_PORTS = range(8123, 8131)
|
||||
|
||||
|
||||
def _load_image(captcha):
|
||||
if hasattr(captcha, 'read'):
|
||||
img = captcha.read()
|
||||
elif type(captcha) == bytearray:
|
||||
img = captcha
|
||||
else:
|
||||
img = ''
|
||||
try:
|
||||
captcha_file = open(captcha, 'rb')
|
||||
except Exception:
|
||||
raise
|
||||
else:
|
||||
img = captcha_file.read()
|
||||
captcha_file.close()
|
||||
if not len(img):
|
||||
raise ValueError('CAPTCHA image is empty')
|
||||
elif imghdr.what(None, img) is None:
|
||||
raise TypeError('Unknown CAPTCHA image type')
|
||||
else:
|
||||
return img
|
||||
|
||||
|
||||
class AccessDeniedException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Client(object):

    """Death by Captcha API Client."""

    def __init__(self, username, password):
        # When True, _log() prints a timestamped trace of API traffic.
        self.is_verbose = False
        # Credentials dict sent with every authenticated API call.
        self.userpwd = {'username': username, 'password': password}

    def _log(self, cmd, msg=''):
        # Debug trace helper; returns self so calls can be chained.
        if self.is_verbose:
            print '%d %s %s' % (time.time(), cmd, msg.rstrip())
        return self

    def close(self):
        # Base client holds no persistent resources; subclasses override.
        pass

    def connect(self):
        # Base client is connectionless; subclasses override.
        pass

    def get_user(self):
        """Fetch user details -- ID, balance, rate and banned status."""
        raise NotImplementedError()

    def get_balance(self):
        """Fetch user balance (in US cents)."""
        return self.get_user().get('balance')

    def get_captcha(self, cid):
        """Fetch a CAPTCHA details -- ID, text and correctness flag."""
        raise NotImplementedError()

    def get_text(self, cid):
        """Fetch a CAPTCHA text."""
        return self.get_captcha(cid).get('text') or None

    def report(self, cid):
        """Report a CAPTCHA as incorrectly solved."""
        raise NotImplementedError()

    def upload(self, captcha):
        """Upload a CAPTCHA.

        Accepts file names and file-like objects. Returns CAPTCHA details
        dict on success.

        """
        raise NotImplementedError()

    def decode(self, captcha=None, timeout=None, **kwargs):
        """
        Try to solve a CAPTCHA.

        See Client.upload() for arguments details.

        Uploads a CAPTCHA, polls for its status periodically with arbitrary
        timeout (in seconds), returns CAPTCHA details if (correctly) solved.
        """
        if not timeout:
            # Token (no-image) tasks get the longer default timeout.
            if not captcha:
                timeout = DEFAULT_TOKEN_TIMEOUT
            else:
                timeout = DEFAULT_TIMEOUT

        deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
        uploaded_captcha = self.upload(captcha, **kwargs)
        if uploaded_captcha:
            intvl_idx = 0  # POLL_INTERVAL index
            while deadline > time.time() and not uploaded_captcha.get('text'):
                intvl, intvl_idx = self._get_poll_interval(intvl_idx)
                time.sleep(intvl)
                pulled = self.get_captcha(uploaded_captcha['captcha'])
                # Only accept the poll result if it is for the same CAPTCHA.
                if pulled['captcha'] == uploaded_captcha['captcha']:
                    uploaded_captcha = pulled
            # Implicitly returns None on timeout or incorrect solution.
            if uploaded_captcha.get('text') and \
                    uploaded_captcha.get('is_correct'):
                return uploaded_captcha

    def _get_poll_interval(self, idx):
        """Returns poll interval and next index depending on index provided"""

        # First few polls follow POLLS_INTERVAL, then a flat default.
        if len(POLLS_INTERVAL) > idx:
            intvl = POLLS_INTERVAL[idx]
        else:
            intvl = DFLT_POLL_INTERVAL
        idx += 1

        return intvl, idx
|
||||
|
||||
|
||||
class HttpClient(Client):

    """Death by Captcha HTTP API client."""

    def __init__(self, *args):
        Client.__init__(self, *args)
        self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())

    def _call(self, cmd, payload=None, headers=None):
        # Perform one HTTP API call; returns the decoded JSON response.
        # Maps HTTP error codes to the client's exception types.
        if headers is None:
            headers = {}
        headers['Accept'] = HTTP_RESPONSE_TYPE
        headers['User-Agent'] = API_VERSION
        if hasattr(payload, 'items'):
            # Dict payloads are form-encoded; raw strings pass through.
            payload = urllib.urlencode(payload)
            self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
        else:
            self._log('SEND', '%s' % cmd)
        if payload is not None:
            headers['Content-Length'] = len(payload)
        try:
            response = self.opener.open(urllib2.Request(
                HTTP_BASE_URL + '/' + cmd.strip('/'),
                data=payload,
                headers=headers
            )).read()
        except urllib2.HTTPError, err:
            # 403: auth/balance problem; 400/413: bad CAPTCHA payload;
            # 503: service overloaded; anything else propagates as-is.
            if 403 == err.code:
                raise AccessDeniedException('Access denied, please check'
                                            ' your credentials and/or balance')
            elif 400 == err.code or 413 == err.code:
                raise ValueError("CAPTCHA was rejected by the service, check"
                                 " if it's a valid image")
            elif 503 == err.code:
                raise OverflowError("CAPTCHA was rejected due to service"
                                    " overload, try again later")
            else:
                raise err
        else:
            self._log('RECV', '%d %s' % (len(response), response))
            try:
                return json_decode(response)
            except Exception:
                raise RuntimeError('Invalid API response')
        return {}

    def get_user(self):
        # Falls back to {'user': 0} when the call returns nothing (login failed).
        return self._call('user', self.userpwd.copy()) or {'user': 0}

    def get_captcha(self, cid):
        return self._call('captcha/%d' % cid) or {'captcha': 0}

    def report(self, cid):
        return not self._call('captcha/%d/report' % cid,
                              self.userpwd.copy()).get('is_correct')

    def upload(self, captcha=None, **kwargs):
        """Upload a CAPTCHA as a hand-built multipart/form-data request.

        Returns the CAPTCHA details dict on success, None otherwise.
        """
        boundary = binascii.hexlify(os.urandom(16))
        banner = kwargs.get('banner', '')
        if banner:
            # Banner images travel base64-encoded inside a text field.
            kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
        # Credentials as plain-text form fields.
        body = '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in self.userpwd.items())

        # Any extra options as additional form fields.
        body += '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in kwargs.items())

        if captcha:
            # The image itself goes as a binary file part.
            img = _load_image(captcha)
            body += '\r\n'.join((
                '',
                '--%s' % boundary,
                'Content-Disposition: form-data; name="captchafile"; '
                'filename="captcha"',
                'Content-Type: application/octet-stream',
                'Content-Length: %d' % len(img),
                '',
                img,
                '--%s--' % boundary,
                ''
            ))

        response = self._call('captcha', body, {
            'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
        }) or {}
        if response.get('captcha'):
            return response
|
||||
|
||||
|
||||
class SocketClient(Client):

    """Death by Captcha socket API client."""

    # Every request/response on the wire ends with this terminator.
    TERMINATOR = '\r\n'

    def __init__(self, *args):
        Client.__init__(self, *args)
        # Serializes all API traffic over the single persistent socket.
        self.socket_lock = threading.Lock()
        self.socket = None

    def close(self):
        # Shut down and forget the persistent socket, if any.
        if self.socket:
            self._log('CLOSE')
            try:
                self.socket.shutdown(socket.SHUT_RDWR)
            except socket.error:
                pass
            finally:
                self.socket.close()
                self.socket = None

    def connect(self):
        """Open (if needed) and return the persistent non-blocking socket."""
        if not self.socket:
            self._log('CONN')
            # Pick a random API port to spread load across servers.
            host = (socket.gethostbyname(SOCKET_HOST),
                    random.choice(SOCKET_PORTS))
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # Non-blocking mode; connect() may legitimately be "in progress".
            self.socket.settimeout(0)
            try:
                self.socket.connect(host)
            except socket.error, err:
                if (err.args[0] not in
                        (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
                    self.close()
                    raise err
        return self.socket

    def __del__(self):
        self.close()

    def _sendrecv(self, sock, buf):
        """Send `buf` and read a full terminator-delimited response using
        select() on the non-blocking socket."""
        self._log('SEND', buf)
        fds = [sock]
        buf += self.TERMINATOR
        response = ''
        intvl_idx = 0
        while True:
            intvl, intvl_idx = self._get_poll_interval(intvl_idx)
            # Wait for writability while there is data to send, then for
            # readability while collecting the response.
            rds, wrs, exs = select.select((not buf and fds) or [],
                                          (buf and fds) or [],
                                          fds,
                                          intvl)
            if exs:
                raise IOError('select() failed')
            try:
                if wrs:
                    while buf:
                        buf = buf[wrs[0].send(buf):]
                elif rds:
                    while True:
                        s = rds[0].recv(256)
                        if not s:
                            raise IOError('recv(): connection lost')
                        else:
                            response += s
            except socket.error, err:
                # EAGAIN-style errors just mean "try again on next select()".
                if (err.args[0] not in
                        (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
                    raise err
            if response.endswith(self.TERMINATOR):
                self._log('RECV', response)
                return response.rstrip(self.TERMINATOR)
        # NOTE(review): unreachable with the `while True` loop above; kept
        # from an earlier deadline-based implementation.
        raise IOError('send/recv timed out')

    def _call(self, cmd, data=None):
        """Send one API command, retrying once after a connection failure,
        decode the JSON response and translate API errors to exceptions."""
        if data is None:
            data = {}
        data['cmd'] = cmd
        data['version'] = API_VERSION
        request = json_encode(data)

        response = None
        for _ in range(2):
            # Any command but 'login' requires a logged-in connection.
            if not self.socket and cmd != 'login':
                self._call('login', self.userpwd.copy())
            self.socket_lock.acquire()
            try:
                sock = self.connect()
                response = self._sendrecv(sock, request)
            except IOError, err:
                # Drop the connection and retry once.
                sys.stderr.write(str(err) + "\n")
                self.close()
            except socket.error, err:
                sys.stderr.write(str(err) + "\n")
                self.close()
                raise IOError('Connection refused')
            else:
                break
            finally:
                self.socket_lock.release()

        if response is None:
            raise IOError('Connection lost or timed out during API request')

        try:
            response = json_decode(response)
        except Exception:
            raise RuntimeError('Invalid API response')

        if not response.get('error'):
            return response

        # Map API error strings onto the client's exception types.
        error = response['error']
        if error in ('not-logged-in', 'invalid-credentials'):
            raise AccessDeniedException('Access denied, check your credentials')
        elif 'banned' == error:
            raise AccessDeniedException('Access denied, account is suspended')
        elif 'insufficient-funds' == error:
            raise AccessDeniedException(
                'CAPTCHA was rejected due to low balance')
        elif 'invalid-captcha' == error:
            raise ValueError('CAPTCHA is not a valid image')
        elif 'service-overload' == error:
            raise OverflowError(
                'CAPTCHA was rejected due to service overload, try again later')
        else:
            # Unknown error: reset the connection before raising.
            self.socket_lock.acquire()
            self.close()
            self.socket_lock.release()
            raise RuntimeError('API server error occured: %s' % error)

    def get_user(self):
        return self._call('user') or {'user': 0}

    def get_captcha(self, cid):
        return self._call('captcha', {'captcha': cid}) or {'captcha': 0}

    def upload(self, captcha=None, **kwargs):
        """Upload a CAPTCHA (image and/or extra fields) over the socket API.

        Returns a normalized details dict on success, None otherwise.
        """
        data = {}
        if captcha:
            data['captcha'] = base64.b64encode(_load_image(captcha))
        if kwargs:
            banner = kwargs.get('banner', '')
            if banner:
                kwargs['banner'] = base64.b64encode(_load_image(banner))
            data.update(kwargs)
        response = self._call('upload', data)
        if response.get('captcha'):
            # Keep only the standard keys; normalize empty text to None.
            uploaded_captcha = dict(
                (k, response.get(k))
                for k in ('captcha', 'text', 'is_correct')
            )
            if not uploaded_captcha['text']:
                uploaded_captcha['text'] = None
            return uploaded_captcha

    def report(self, cid):
        return not self._call('report', {'captcha': cid}).get('is_correct')
|
||||
|
||||
|
||||
if '__main__' == __name__:
    # CLI driver: deathbycaptcha.py <username> <password> <captcha files...>
    # Put your DBC username & password here:
    # client = HttpClient(sys.argv[1], sys.argv[2])
    client = SocketClient(sys.argv[1], sys.argv[2])
    client.is_verbose = True

    print 'Your balance is %s US cents' % client.get_balance()

    for fn in sys.argv[3:]:
        try:
            # Put your CAPTCHA image file name or file-like object, and optional
            # solving timeout (in seconds) here:
            captcha = client.decode(fn, DEFAULT_TIMEOUT)
        except Exception, e:
            sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
            captcha = None

        if captcha:
            print 'CAPTCHA %d solved: %s' % \
                (captcha['captcha'], captcha['text'])

            # Report as incorrectly solved if needed. Make sure the CAPTCHA was
            # in fact incorrectly solved!
            # try:
            #     client.report(captcha['captcha'])
            # except Exception, e:
            #     sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
|
@ -0,0 +1,7 @@
|
||||
from .base import AnticaptchaClient
|
||||
from .tasks import NoCaptchaTask, NoCaptchaTaskProxylessTask, ImageToTextTask, FunCaptchaTask
|
||||
from .proxy import Proxy
|
||||
from .exceptions import AnticaptchaException
|
||||
from .fields import SimpleText, Image, WebLink, TextInput, Textarea, Checkbox, Select, Radio, ImageUpload
|
||||
|
||||
# Backwards-compatible alias preserving a historical misspelling
# ("Anticatpcha") so existing imports of the old name keep working.
AnticatpchaException = AnticaptchaException
|
@ -0,0 +1,114 @@
|
||||
import requests
|
||||
import time
|
||||
|
||||
from six.moves.urllib_parse import urljoin
|
||||
from .exceptions import AnticaptchaException
|
||||
|
||||
SLEEP_EVERY_CHECK_FINISHED = 3
|
||||
MAXIMUM_JOIN_TIME = 60 * 5
|
||||
|
||||
|
||||
class Job(object):
    """Handle for a task submitted to the anti-captcha API.

    Pairs a task id with the client that created it, and offers polling
    helpers plus accessors for the various solution payload shapes.
    """

    client = None        # AnticaptchaClient that created this job
    task_id = None       # numeric task id assigned by the API
    _last_result = None  # most recent getTaskResult() response dict

    def __init__(self, client, task_id):
        self.client = client
        self.task_id = task_id

    def _update(self):
        # Refresh the cached task result from the API.
        self._last_result = self.client.getTaskResult(self.task_id)

    def check_is_ready(self):
        """Poll the API once; return True when the task has been solved."""
        self._update()
        return self._last_result['status'] == 'ready'

    def get_solution_response(self):  # Recaptcha
        """Return the solved g-recaptcha response token."""
        return self._last_result['solution']['gRecaptchaResponse']

    def get_token_response(self):  # Funcaptcha
        """Return the solved FunCaptcha token."""
        return self._last_result['solution']['token']

    def get_answers(self):
        """Return the list of answers from the last result."""
        return self._last_result['solution']['answers']

    def get_captcha_text(self):  # Image
        """Return the recognized text of an image CAPTCHA."""
        return self._last_result['solution']['text']

    def report_incorrect(self):
        """Report this task's image solution as incorrect to the API."""
        return self.client.reportIncorrectImage(self.task_id)

    def join(self, maximum_time=None):
        """Block until the task is ready, polling every
        SLEEP_EVERY_CHECK_FINISHED seconds.

        Raises AnticaptchaException (code 250) when the task is not ready
        within `maximum_time` seconds (defaults to MAXIMUM_JOIN_TIME).
        """
        elapsed_time = 0
        maximum_time = maximum_time or MAXIMUM_JOIN_TIME
        while not self.check_is_ready():
            time.sleep(SLEEP_EVERY_CHECK_FINISHED)
            elapsed_time += SLEEP_EVERY_CHECK_FINISHED
            # Fix: the original also tested `elapsed_time is not None`, which
            # is always true for an int accumulator — only the timeout
            # comparison matters.
            if elapsed_time > maximum_time:
                raise AnticaptchaException(
                    None, 250,
                    "The execution time exceeded a maximum time of {} seconds. It takes {} seconds.".format(
                        maximum_time, elapsed_time))
|
||||
|
||||
|
||||
class AnticaptchaClient(object):
    """Thin HTTP client for the anti-captcha.com JSON API."""

    client_key = None
    CREATE_TASK_URL = "/createTask"
    TASK_RESULT_URL = "/getTaskResult"
    BALANCE_URL = "/getBalance"
    REPORT_IMAGE_URL = "/reportIncorrectImageCaptcha"
    SOFT_ID = 847
    language_pool = "en"

    def __init__(self, client_key, language_pool="en", host="api.anti-captcha.com", use_ssl=True):
        scheme = "https" if use_ssl else "http"
        self.client_key = client_key
        self.language_pool = language_pool
        self.base_url = "{proto}://{host}/".format(proto=scheme, host=host)
        self.session = requests.Session()

    @property
    def client_ip(self):
        # Resolve the caller's public IP once and cache it on the instance.
        if not hasattr(self, '_client_ip'):
            self._client_ip = self.session.get('http://httpbin.org/ip').json()['origin']
        return self._client_ip

    def _check_response(self, response):
        """Raise AnticaptchaException when the API response carries an error."""
        error_id = response.get('errorId', False)
        # Error 11 = unrecognized IP; append the caller's IP to make the
        # message actionable.
        if error_id == 11:
            response['errorDescription'] = "{} Your missing IP address is {}.".format(
                response['errorDescription'], self.client_ip)
        if error_id:
            raise AnticaptchaException(response['errorId'],
                                       response['errorCode'],
                                       response['errorDescription'])

    def _post_json(self, path, payload):
        # Send one API request, validate the decoded response, return it.
        raw = self.session.post(urljoin(self.base_url, path), json=payload)
        decoded = raw.json()
        self._check_response(decoded)
        return decoded

    def createTask(self, task):
        """Submit `task` to the API and return a Job handle for it."""
        payload = {
            "clientKey": self.client_key,
            "task": task.serialize(),
            "softId": self.SOFT_ID,
            "languagePool": self.language_pool,
        }
        result = self._post_json(self.CREATE_TASK_URL, payload)
        return Job(self, result['taskId'])

    def getTaskResult(self, task_id):
        """Fetch the current result dict for `task_id`."""
        payload = {"clientKey": self.client_key, "taskId": task_id}
        return self._post_json(self.TASK_RESULT_URL, payload)

    def getBalance(self):
        """Return the account balance reported by the API."""
        payload = {"clientKey": self.client_key}
        return self._post_json(self.BALANCE_URL, payload)['balance']

    def reportIncorrectImage(self, task_id):
        """Report an incorrectly solved image task; True when acknowledged."""
        payload = {"clientKey": self.client_key, "taskId": task_id}
        result = self._post_json(self.REPORT_IMAGE_URL, payload)
        return result.get('status', False) != False
|
@ -0,0 +1,23 @@
|
||||
class AnticaptchaException(Exception):
    """Error reported by the anti-captcha service or by this client."""

    def __init__(self, error_id, error_code, error_description, *args):
        self.error_description = error_description
        self.error_id = error_id
        self.error_code = error_code
        message = "[{}:{}]{}".format(error_code, error_id, error_description)
        super(AnticaptchaException, self).__init__(message)


# Historical misspelling kept as an alias so existing imports keep working.
AnticatpchaException = AnticaptchaException
|
||||
|
||||
|
||||
class InvalidWidthException(AnticaptchaException):
    """Raised when a field width is not one of the allowed values."""

    def __init__(self, width):
        self.width = width
        msg = 'Invalid width (%s). Can be one of these: 100, 50, 33, 25.' % (self.width,)
        super(InvalidWidthException, self).__init__("AC-1", 1, msg)
|
||||
|
||||
|
||||
class MissingNameException(AnticaptchaException):
    """Raised when a named form field is serialized without a name."""

    def __init__(self, cls):
        self.cls = cls
        msg = 'Missing name data in {0}. Provide {0}.__init__(name="X") or {0}.serialize(name="X")'.format(str(self.cls))
        super(MissingNameException, self).__init__("AC-2", 2, msg)
|
@ -0,0 +1,199 @@
|
||||
import six
|
||||
from python_anticaptcha.exceptions import InvalidWidthException, MissingNameException
|
||||
|
||||
|
||||
class BaseField(object):
    """Base for all CustomCaptchaTask form fields.

    Subclasses extend serialize() with their own contentType/inputType data.
    """
    label = None
    labelHint = None

    def serialize(self, name=None):
        """Return the field's JSON-serializable dict; unset keys are omitted."""
        data = {}
        # Only truthy values are emitted. The original's `self.label or False`
        # was dead code: inside `if self.label:` the value is always truthy.
        if self.label:
            data['label'] = self.label
        if self.labelHint:
            data['labelHint'] = self.labelHint
        return data
|
||||
|
||||
|
||||
class NameBaseField(BaseField):
    """Field that must carry a `name` key when serialized."""
    name = None

    def serialize(self, name=None):
        data = super(NameBaseField, self).serialize(name)
        # The explicit argument wins over the instance attribute.
        resolved = name or self.name
        if not resolved:
            raise MissingNameException(cls=self.__class__)
        data['name'] = resolved
        return data
|
||||
|
||||
|
||||
class SimpleText(BaseField):
    """Static text shown to the worker (display only, not an input)."""
    contentType = 'text'

    def __init__(self, content, label=None, labelHint=None, width=None):
        self.label = label
        self.labelHint = labelHint
        self.content = content
        self.width = width

    def serialize(self, name=None):
        data = super(SimpleText, self).serialize(name)
        data['contentType'] = self.contentType
        data['content'] = self.content
        if self.width:
            if self.width not in [100, 50, 33, 25]:
                raise InvalidWidthException(self.width)
            # NOTE(review): width is written beside an empty inputOptions
            # dict rather than inside it (unlike TextInput); preserved as-is,
            # confirm against the API schema.
            data['inputOptions'] = {}
            data['width'] = self.width
        return data
|
||||
|
||||
|
||||
class Image(BaseField):
    """Static image shown to the worker, fetched from `imageUrl`."""
    contentType = 'image'

    def __init__(self, imageUrl, label=None, labelHint=None):
        self.label = label
        self.labelHint = labelHint
        self.imageUrl = imageUrl

    def serialize(self, name=None):
        data = super(Image, self).serialize(name)
        data['contentType'] = self.contentType
        # The image URL travels in the generic 'content' slot.
        data['content'] = self.imageUrl
        return data
|
||||
|
||||
|
||||
class WebLink(BaseField):
    """Clickable hyperlink shown to the worker."""
    contentType = 'link'

    def __init__(self, linkText, linkUrl, label=None, labelHint=None, width=None):
        self.label = label
        self.labelHint = labelHint
        self.linkText = linkText
        self.linkUrl = linkUrl
        self.width = width

    def serialize(self, name=None):
        data = super(WebLink, self).serialize(name)
        data['contentType'] = self.contentType
        if self.width:
            if self.width not in [100, 50, 33, 25]:
                raise InvalidWidthException(self.width)
            # Same quirk as SimpleText: width sits beside, not inside,
            # the empty inputOptions dict; preserved as-is.
            data['inputOptions'] = {}
            data['width'] = self.width
        data['content'] = {'url': self.linkUrl, 'text': self.linkText}
        return data
|
||||
|
||||
|
||||
class TextInput(NameBaseField):
    """Single-line text input the worker fills in."""

    def __init__(self, placeHolder=None, label=None, labelHint=None, width=None):
        self.label = label
        self.labelHint = labelHint
        self.placeHolder = placeHolder
        self.width = width

    def serialize(self, name=None):
        data = super(TextInput, self).serialize(name)
        data['inputType'] = 'text'
        options = {}
        if self.width:
            if self.width not in [100, 50, 33, 25]:
                raise InvalidWidthException(self.width)
            # The API expects the width as a string here.
            options['width'] = str(self.width)
        if self.placeHolder:
            options['placeHolder'] = self.placeHolder
        data['inputOptions'] = options
        return data
|
||||
|
||||
|
||||
class Textarea(NameBaseField):
    """Multi-line text input the worker fills in."""

    def __init__(self, placeHolder=None, rows=None, label=None, width=None, labelHint=None):
        self.label = label
        self.labelHint = labelHint
        self.placeHolder = placeHolder
        self.rows = rows
        self.width = width

    def serialize(self, name=None):
        data = super(Textarea, self).serialize(name)
        data['inputType'] = 'textarea'
        options = {}
        if self.rows:
            options['rows'] = str(self.rows)
        if self.placeHolder:
            options['placeHolder'] = self.placeHolder
        # NOTE(review): unlike TextInput, width is not validated against the
        # allowed set {100, 50, 33, 25} here; behavior preserved as-is.
        if self.width:
            options['width'] = str(self.width)
        data['inputOptions'] = options
        return data
|
||||
|
||||
|
||||
class Checkbox(NameBaseField):
    """Single checkbox; `text` is the caption shown beside it."""

    def __init__(self, text, label=None, labelHint=None):
        self.label = label
        self.labelHint = labelHint
        self.text = text

    def serialize(self, name=None):
        data = super(Checkbox, self).serialize(name)
        data['inputType'] = 'checkbox'
        # The checkbox caption travels as inputOptions.label.
        data['inputOptions'] = {'label': self.text}
        return data
|
||||
|
||||
|
||||
class Select(NameBaseField):
    """Drop-down list; `choices` holds strings or (value, caption) pairs."""
    type = 'select'

    def __init__(self, label=None, choices=None, labelHint=None):
        self.label = label
        self.labelHint = labelHint
        self.choices = choices or ()

    def get_choices(self):
        """Yield (value, caption) pairs; bare strings caption themselves."""
        for choice in self.choices:
            if isinstance(choice, six.text_type):
                yield choice, choice
            else:
                yield choice

    def serialize(self, name=None):
        data = super(Select, self).serialize(name)
        data['inputType'] = self.type
        data['inputOptions'] = [
            {"value": value, "caption": caption}
            for value, caption in self.get_choices()
        ]
        return data
|
||||
|
||||
|
||||
class Radio(Select):
    # Identical payload to Select except the inputType is 'radio'.
    type = 'radio'
|
||||
|
||||
|
||||
class ImageUpload(NameBaseField):
    """File-upload input that lets the worker attach an image."""

    def __init__(self, label=None, labelHint=None):
        self.label = label
        self.labelHint = labelHint

    def serialize(self, name=None):
        data = super(ImageUpload, self).serialize(name)
        data['inputType'] = 'imageUpload'
        return data
|
@ -0,0 +1,28 @@
|
||||
from six.moves.urllib_parse import urlparse
|
||||
|
||||
|
||||
class Proxy(object):
    """Proxy connection settings in the shape the anti-captcha API expects."""

    def __init__(self, proxy_type, proxy_address, proxy_port, proxy_login, proxy_password):
        self.proxyType = proxy_type
        self.proxyAddress = proxy_address
        self.proxyPort = proxy_port
        self.proxyLogin = proxy_login
        self.proxyPassword = proxy_password

    def serialize(self):
        """Return the dict merged into a task payload; credentials only if set."""
        result = {
            'proxyType': self.proxyType,
            'proxyAddress': self.proxyAddress,
            'proxyPort': self.proxyPort,
        }
        if self.proxyLogin or self.proxyPassword:
            result['proxyLogin'] = self.proxyLogin
            result['proxyPassword'] = self.proxyPassword
        return result

    @classmethod
    def parse_url(cls, url):
        """Build a Proxy from a URL like scheme://user:pass@host:port."""
        parts = urlparse(url)
        return cls(proxy_type=parts.scheme,
                   proxy_address=parts.hostname,
                   proxy_port=parts.port,
                   proxy_login=parts.username,
                   proxy_password=parts.password)
|
@ -0,0 +1,128 @@
|
||||
import base64
|
||||
from .fields import BaseField
|
||||
|
||||
|
||||
class BaseTask(object):
    """Root of the task hierarchy; serialize() simply echoes keyword data."""

    def serialize(self, **result):
        return result
|
||||
|
||||
|
||||
class ProxyMixin(BaseTask):
    """Adds proxy, user agent and optional cookie data to a task payload."""

    def __init__(self, *args, **kwargs):
        # 'proxy' and 'user_agent' are required kwargs; 'cookies' is optional.
        self.proxy = kwargs.pop('proxy')
        self.userAgent = kwargs.pop('user_agent')
        self.cookies = kwargs.pop('cookies', '')
        super(ProxyMixin, self).__init__(*args, **kwargs)

    def serialize(self, **result):
        payload = super(ProxyMixin, self).serialize(**result)
        payload.update(self.proxy.serialize())
        payload['userAgent'] = self.userAgent
        if self.cookies:
            payload['cookies'] = self.cookies
        return payload
|
||||
|
||||
|
||||
class NoCaptchaTaskProxylessTask(BaseTask):
    """Google reCAPTCHA v2 task solved from the service's own IPs."""
    type = "NoCaptchaTaskProxyless"
    websiteURL = None
    websiteKey = None
    websiteSToken = None

    def __init__(self, website_url, website_key, website_s_token=None, is_invisible=None):
        self.websiteURL = website_url
        self.websiteKey = website_key
        self.websiteSToken = website_s_token
        self.isInvisible = is_invisible

    def serialize(self):
        data = {
            'type': self.type,
            'websiteURL': self.websiteURL,
            'websiteKey': self.websiteKey,
        }
        # Optional fields are included only when explicitly provided.
        if self.websiteSToken is not None:
            data['websiteSToken'] = self.websiteSToken
        if self.isInvisible is not None:
            data['isInvisible'] = self.isInvisible
        return data
|
||||
|
||||
|
||||
class FunCaptchaTask(ProxyMixin):
    """FunCaptcha (Arkose Labs) task; requires the proxy data from ProxyMixin."""
    type = "FunCaptchaTask"
    websiteURL = None
    websiteKey = None

    def __init__(self, website_url, website_key, *args, **kwargs):
        self.websiteURL = website_url
        self.websiteKey = website_key
        super(FunCaptchaTask, self).__init__(*args, **kwargs)

    def serialize(self, **result):
        result = super(FunCaptchaTask, self).serialize(**result)
        # Note the API field name: websitePublicKey, not websiteKey.
        result['type'] = self.type
        result['websiteURL'] = self.websiteURL
        result['websitePublicKey'] = self.websiteKey
        return result
|
||||
|
||||
|
||||
class NoCaptchaTask(ProxyMixin, NoCaptchaTaskProxylessTask):
    # Proxied variant of the reCAPTCHA task: same payload plus proxy fields.
    type = "NoCaptchaTask"
|
||||
|
||||
|
||||
class ImageToTextTask(object):
    """Classic image captcha: upload an image, get its text back.

    `fp` is a binary file-like object whose bytes are base64-encoded into
    the request body; the remaining flags mirror the API options and are
    sent even when None.
    """
    type = "ImageToTextTask"
    fp = None
    phrase = None
    case = None
    numeric = None
    math = None
    minLength = None
    maxLength = None

    def __init__(self, fp, phrase=None, case=None, numeric=None, math=None, min_length=None, max_length=None):
        self.fp = fp
        self.phrase = phrase
        self.case = case
        self.numeric = numeric
        self.math = math
        self.minLength = min_length
        self.maxLength = max_length

    def serialize(self):
        body = base64.b64encode(self.fp.read()).decode('utf-8')
        return {
            'type': self.type,
            'body': body,
            'phrase': self.phrase,
            'case': self.case,
            'numeric': self.numeric,
            'math': self.math,
            'minLength': self.minLength,
            'maxLength': self.maxLength,
        }
|
||||
|
||||
|
||||
class CustomCaptchaTask(BaseTask):
    """Free-form task: workers answer `assignment` about `imageUrl`,
    optionally filling in the fields described by `form`.
    """
    type = 'CustomCaptchaTask'
    imageUrl = None
    assignment = None
    form = None

    def __init__(self, imageUrl, form=None, assignment=None):
        self.imageUrl = imageUrl
        self.form = form or {}
        self.assignment = assignment

    def serialize(self):
        data = super(CustomCaptchaTask, self).serialize()
        data['type'] = self.type
        data['imageUrl'] = self.imageUrl
        if self.form:
            forms = []
            for name, field in self.form.items():
                if isinstance(field, BaseField):
                    # Field objects know how to serialize themselves.
                    forms.append(field.serialize(name))
                else:
                    # Plain dicts are copied so the caller's data is untouched.
                    entry = field.copy()
                    entry['name'] = name
                    forms.append(entry)
            data['forms'] = forms
        if self.assignment:
            data['assignment'] = self.assignment
        return data
|
@ -0,0 +1,257 @@
|
||||
# coding=utf-8
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
from subliminal.cache import region
|
||||
from dogpile.cache.api import NO_VALUE
|
||||
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
|
||||
Proxy
|
||||
from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PitcherRegistry(object):
    """Lookup table of Pitcher classes, addressable by name or source site.

    NOTE: the containers are class attributes, so all instances share one
    registry; the module uses a single instance, preserved as-is.
    """

    pitchers = []
    pitchers_by_key = {}

    def register(self, cls):
        """Class decorator: index `cls` under both its name and its site."""
        idx = len(self.pitchers)
        self.pitchers.append(cls)
        for alias in (cls.name, cls.source):
            self.pitchers_by_key["%s_%s" % (alias, cls.needs_proxy)] = idx
        return cls

    def get_pitcher(self, name_or_site=None, with_proxy=False):
        """Resolve a pitcher class; falls back to $ANTICAPTCHA_CLASS."""
        name_or_site = name_or_site or os.environ.get("ANTICAPTCHA_CLASS")
        if not name_or_site:
            raise Exception("AntiCaptcha class not given, exiting")

        key = "%s_%s" % (name_or_site, with_proxy)
        if key not in self.pitchers_by_key:
            raise Exception("Pitcher %s not found (proxy: %s)" % (name_or_site, with_proxy))

        return self.pitchers[self.pitchers_by_key.get(key)]
|
||||
|
||||
|
||||
# Module-level singleton registry; `pitchers` is kept as an alias so both
# import styles keep working.
registry = pitchers = PitcherRegistry()
|
||||
|
||||
|
||||
class Pitcher(object):
    """Base class for captcha "pitchers" — wrappers around one solving service.

    Subclasses implement get_client()/get_job(); throw() runs the solve and
    records timing on success.
    """
    name = None
    source = None
    needs_proxy = False
    tries = 3
    job = None
    client = None
    client_key = None
    website_url = None
    website_key = None
    website_name = None
    solve_time = None
    success = False

    def __init__(self, website_name, website_url, website_key, tries=3, client_key=None, *args, **kwargs):
        self.tries = tries
        # An explicit key wins; otherwise fall back to the environment.
        self.client_key = client_key or os.environ.get("ANTICAPTCHA_ACCOUNT_KEY")
        if not self.client_key:
            raise Exception("AntiCaptcha key not given, exiting")

        self.website_name = website_name
        self.website_key = website_key
        self.website_url = website_url
        self.success = False
        self.solve_time = None

    def get_client(self):
        raise NotImplementedError

    def get_job(self):
        raise NotImplementedError

    def _throw(self):
        # Prepare the client and submit the job; subclasses extend this.
        self.client = self.get_client()
        self.job = self.get_job()

    def throw(self):
        """Run the solve; when it succeeds, record how long it took."""
        started = time.time()
        data = self._throw()
        if self.success:
            self.solve_time = time.time() - started
            logger.info("%s: Solving took %ss", self.website_name, int(self.solve_time))
        return data
|
||||
|
||||
|
||||
@registry.register
class AntiCaptchaProxyLessPitcher(Pitcher):
    """Solves reCAPTCHA via anti-captcha.com without supplying a proxy."""
    name = "AntiCaptchaProxyLess"
    source = "anti-captcha.com"
    host = "api.anti-captcha.com"
    language_pool = "en"
    tries = 5
    use_ssl = True
    is_invisible = False

    def __init__(self, website_name, website_url, website_key, tries=3, host=None, language_pool=None,
                 use_ssl=True, is_invisible=False, *args, **kwargs):
        super(AntiCaptchaProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries, *args,
                                                          **kwargs)
        # Keyword overrides fall back to the class-level defaults.
        self.host = host or self.host
        self.language_pool = language_pool or self.language_pool
        self.use_ssl = use_ssl
        self.is_invisible = is_invisible

    def get_client(self):
        return AnticaptchaClient(self.client_key, self.language_pool, self.host, self.use_ssl)

    def get_job(self):
        task = NoCaptchaTaskProxylessTask(website_url=self.website_url, website_key=self.website_key,
                                          is_invisible=self.is_invisible)
        return self.client.createTask(task)

    def _throw(self):
        # Retry loop: each iteration submits a fresh task and waits for it.
        for i in range(self.tries):
            try:
                super(AntiCaptchaProxyLessPitcher, self)._throw()
                self.job.join()
                ret = self.job.get_solution_response()
                if ret:
                    self.success = True
                    return ret
            except AnticaptchaException as e:
                # NOTE(review): this last-attempt check runs BEFORE the
                # error-specific branches, so on the final attempt the more
                # specific messages below are never logged.
                if i >= self.tries - 1:
                    logger.error("%s: Captcha solving finally failed. Exiting", self.website_name)
                    return

                if e.error_code == 'ERROR_ZERO_BALANCE':
                    logger.error("%s: No balance left on captcha solving service. Exiting", self.website_name)
                    return

                elif e.error_code == 'ERROR_NO_SLOT_AVAILABLE':
                    logger.info("%s: No captcha solving slot available, retrying", self.website_name)
                    time.sleep(5.0)
                    continue

                elif e.error_code == 'ERROR_KEY_DOES_NOT_EXIST':
                    logger.error("%s: Bad AntiCaptcha API key", self.website_name)
                    return

                elif e.error_id is None and e.error_code == 250:
                    # timeout
                    # NOTE(review): `i < self.tries` is always true inside
                    # range(self.tries), so the else branch below is
                    # unreachable (the last attempt already returned above).
                    if i < self.tries:
                        logger.info("%s: Captcha solving timed out, retrying", self.website_name)
                        time.sleep(1.0)
                        continue
                    else:
                        logger.error("%s: Captcha solving timed out three times; bailing out", self.website_name)
                        return
                # Unknown error on a non-final attempt: propagate it.
                raise
|
||||
|
||||
|
||||
@registry.register
class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher):
    """Proxied variant: routes the solver through the caller's proxy."""
    name = "AntiCaptcha"
    proxy = None
    needs_proxy = True
    user_agent = None
    cookies = None

    def __init__(self, *args, **kwargs):
        self.proxy = Proxy.parse_url(kwargs.pop("proxy"))
        self.user_agent = kwargs.pop("user_agent")
        cookies = kwargs.pop("cookies", {})
        if isinstance(cookies, dict):
            # BUGFIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; items() works on both.
            self.cookies = ";".join(["%s=%s" % (k, v) for k, v in cookies.items()])

        super(AntiCaptchaPitcher, self).__init__(*args, **kwargs)

    def get_job(self):
        task = NoCaptchaTask(website_url=self.website_url, website_key=self.website_key, proxy=self.proxy,
                             user_agent=self.user_agent, cookies=self.cookies, is_invisible=self.is_invisible)
        return self.client.createTask(task)
|
||||
|
||||
|
||||
@registry.register
class DBCProxyLessPitcher(Pitcher):
    """DeathByCaptcha token solver (no proxy).

    The client_key is expected in 'username:password' form.
    """
    name = "DeathByCaptchaProxyLess"
    source = "deathbycaptcha.com"
    username = None
    password = None

    def __init__(self, website_name, website_url, website_key,
                 timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs):
        # BUGFIX: forward *args/**kwargs (notably client_key=...) to the base
        # class; the original silently dropped them, forcing env-var config.
        super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries,
                                                  *args, **kwargs)

        self.username, self.password = self.client_key.split(":", 1)
        self.timeout = timeout

    def get_client(self):
        return DBCClient(self.username, self.password)

    def get_job(self):
        # DBC submits via decode() in _throw(); there is no job object.
        pass

    @property
    def payload_dict(self):
        return {
            "googlekey": self.website_key,
            "pageurl": self.website_url
        }

    def _throw(self):
        super(DBCProxyLessPitcher, self)._throw()
        payload = json.dumps(self.payload_dict)
        for i in range(self.tries):
            # The original wrapped this in `try/except: raise`, a no-op;
            # exceptions propagate exactly as before.
            data = self.client.decode(timeout=self.timeout, type=4, token_params=payload)
            if data and data["is_correct"] and data["text"]:
                self.success = True
                return data["text"]
|
||||
|
||||
|
||||
@registry.register
class DBCPitcher(DBCProxyLessPitcher):
    """DeathByCaptcha variant that forwards the caller's proxy."""
    name = "DeathByCaptcha"
    proxy = None
    needs_proxy = True
    proxy_type = "HTTP"

    def __init__(self, *args, **kwargs):
        self.proxy = kwargs.pop("proxy")
        super(DBCPitcher, self).__init__(*args, **kwargs)

    @property
    def payload_dict(self):
        payload = super(DBCPitcher, self).payload_dict
        payload["proxytype"] = self.proxy_type
        payload["proxy"] = self.proxy
        return payload
|
||||
|
||||
|
||||
def load_verification(site_name, session, callback=lambda x: None):
    """Restore cached cookies and user agent for `site_name` into `session`.

    Returns the callback's result when cached data was applied, else False.
    """
    ccks = region.get("%s_data" % site_name, expiration_time=15552000)  # 6m
    if ccks != NO_VALUE:
        cookies, user_agent = ccks
        logger.debug("%s: Re-using previous user agent: %s", site_name.capitalize(), user_agent)
        session.headers["User-Agent"] = user_agent
        try:
            # Writes into requests' private cookie-jar structure.
            session.cookies._cookies.update(cookies)
            return callback(region)
        except Exception:
            # BUGFIX: narrowed from a bare `except:` so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            return False
    return False
|
||||
|
||||
|
||||
def store_verification(site_name, session):
    # Cache the session's cookie jar (requests-private `_cookies` dict) and
    # user agent so load_verification() can restore them later.
    region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"]))
|
@ -0,0 +1,208 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import rarfile
|
||||
from subzero.language import Language
|
||||
from guessit import guessit
|
||||
from requests import Session
|
||||
from six import text_type
|
||||
|
||||
from subliminal import __short_version__
|
||||
from subliminal.providers import ParserBeautifulSoup, Provider
|
||||
from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches
|
||||
from subliminal.video import Episode, Movie
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ZimukuSubtitle(Subtitle):
    """Zimuku Subtitle."""
    provider_name = 'zimuku'

    def __init__(self, language, page_link, version, download_link):
        super(ZimukuSubtitle, self).__init__(language, page_link=page_link)
        self.version = version
        self.download_link = download_link
        self.hearing_impaired = None
        self.encoding = 'utf-8'

    @property
    def id(self):
        # The download URL is unique per subtitle and doubles as the id.
        return self.download_link

    def get_matches(self, video):
        """Return guessit-based matches from the release name, typed by video kind."""
        if isinstance(video, Episode):
            kind = 'episode'
        elif isinstance(video, Movie):
            kind = 'movie'
        else:
            return set()
        return guess_matches(video, guessit(self.version, {'type': kind}), partial=True)
|
||||
|
||||
|
||||
class ZimukuProvider(Provider):
    """Zimuku Provider: scrapes subtitles from zimuku.la search pages."""
    # Supported subtitle languages (ISO 639-3 codes).
    languages = {Language(l) for l in ['zho', 'eng']}

    server_url = 'http://www.zimuku.la'
    search_url = '/search?q={}'
    download_url = 'http://www.zimuku.la/'

    UserAgent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'

    subtitle_class = ZimukuSubtitle

    def __init__(self):
        # The HTTP session is created lazily in initialize().
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)

    def terminate(self):
        self.session.close()

    def query(self, keyword, season=None, episode=None, year=None):
        """Search zimuku.la for `keyword` and scrape subtitle entries.

        Appends SxxEyy for episodes or the year for movies to the query.
        """
        params = keyword
        if season and episode:
            params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
        elif year:
            params += ' {:4d}'.format(year)

        logger.debug('Searching subtitles %r', params)
        subtitles = []
        search_link = self.server_url + text_type(self.search_url).format(params)

        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

        # Each search hit's second anchor links to the detail page.
        for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'):
            moviename = entity.text
            entity_url = self.server_url + entity['href']
            logger.debug(entity_url)
            r = self.session.get(entity_url, timeout=30)
            r.raise_for_status()
            logger.debug('looking into ' + entity_url)

            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("div", class_="subs box clearfix")
            # loop over subtitles cells

            subs = soup.tbody.find_all("tr")
            for sub in subs:
                # NOTE(review): .encode('utf-8') returns bytes on Python 3,
                # so these %-formats would embed b'...' reprs in the URL —
                # this looks Python-2 oriented; verify before running on 3.
                page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8'))
                version = sub.a.text.encode('utf-8') or None
                if version is None:
                    version = ""
                try:
                    td = sub.find("td", class_="tac lang")
                    r2 = td.find_all("img")
                    # Language flags are <img> tags whose title names the language.
                    langs = [x.get('title').encode('utf-8') for x in r2]
                except:
                    # NOTE(review): fallback is a str, so ",".join(langs) below
                    # joins its individual characters — presumably unintended.
                    langs = '未知'
                name = '%s (%s)' % (version, ",".join(langs))

                # English only when no Simplified/Traditional Chinese flag is present.
                if ('English' in langs) and not(('简体中文' in langs) or ('繁體中文' in langs)):
                    language = Language('eng')
                else:
                    language = Language('zho')
                # read the item; the download page is the detail page with
                # "detail" swapped for "dld".
                subtitle = self.subtitle_class(language, page_link, version, page_link.replace("detail","dld"))

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """Query once per candidate title and keep only requested languages."""
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
        elif isinstance(video, Movie):
            titles = [video.title] + video.alternative_titles
        else:
            titles = []

        subtitles = []
        # query for subtitles with the show_id
        for title in titles:
            if isinstance(video, Episode):
                subtitles += [s for s in self.query(title, season=video.season, episode=video.episode,
                                                    year=video.year)
                              if s.language in languages]
            elif isinstance(video, Movie):
                subtitles += [s for s in self.query(title, year=video.year)
                              if s.language in languages]

        return subtitles

    def download_subtitle(self, subtitle):
        """Fetch the download page, follow the first sizeable link, unpack archives."""
        if isinstance(subtitle, ZimukuSubtitle):
            # download the subtitle
            logger.info('Downloading subtitle %r', subtitle)
            r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link},
                                 timeout=30)
            r.raise_for_status()

            if not r.content:
                logger.debug('Unable to download subtitle. No data returned from provider')
                return

            soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
            links = soup.find("div", {"class":"clearfix"}).find_all('a')
            # TODO: add settings for choice

            for down_link in links:
                # NOTE(review): str + bytes concatenation below would raise
                # TypeError on Python 3 — same Python-2 orientation as query().
                url = down_link.get('href').encode('utf-8')
                url = self.server_url + url
                r = self.session.get(url, headers={'Referer': subtitle.download_link},
                                     timeout=30)
                r.raise_for_status()

                # Heuristic: anything over 1 KiB is assumed to be the real
                # payload rather than an error page.
                if len(r.content) > 1024:
                    break

            archive_stream = io.BytesIO(r.content)
            archive = None
            if rarfile.is_rarfile(archive_stream):
                logger.debug('Identified rar archive')
                archive = rarfile.RarFile(archive_stream)
                subtitle_content = _get_subtitle_from_archive(archive)
            elif zipfile.is_zipfile(archive_stream):
                logger.debug('Identified zip archive')
                archive = zipfile.ZipFile(archive_stream)
                subtitle_content = _get_subtitle_from_archive(archive)
            else:
                # Not an archive: treat the payload as the subtitle itself.
                subtitle_content = r.content

            if subtitle_content:
                subtitle.content = fix_line_ending(subtitle_content)
            else:
                logger.debug('Could not extract subtitle from %r', archive)
|
||||
|
||||
|
||||
def _get_subtitle_from_archive(archive):
    """Return the bytes of the first usable subtitle file in `archive`, or None."""
    for member in archive.namelist():
        basename = os.path.split(member)[-1]
        # discard hidden files
        if basename.startswith('.'):
            continue
        # discard non-subtitle files
        if not member.lower().endswith(SUBTITLE_EXTENSIONS):
            continue
        return archive.read(member)
    return None
|
Loading…
Reference in new issue