pull/684/head
Louis Vézina 5 years ago
parent 4e7e3a39d2
commit 645952c61a

@ -1,5 +1,7 @@
# coding=utf-8
from __future__ import absolute_import
from __future__ import print_function
import subprocess as sp
import time
import os
@ -12,14 +14,16 @@ from bazarr.get_args import args
def check_python_version():
python_version = platform.python_version_tuple()
minimum_python_version_tuple = (2, 7, 13)
minimum_python3_version_tuple = (3, 6, 0)
minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
if int(python_version[0]) > minimum_python_version_tuple[0]:
print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
if int(python_version[0]) == minimum_python3_version_tuple[0] and int(python_version[1]) < minimum_python3_version_tuple[1]:
print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
os._exit(0)
elif int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]:
print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
elif int(python_version[0]) == minimum_python_version_tuple[0] and (int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]):
print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
os._exit(0)
@ -32,10 +36,10 @@ def start_bazarr():
script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
print "Bazarr starting..."
print("Bazarr starting...")
try:
for line in iter(ep.stdout.readline, ''):
sys.stdout.write(line)
sys.stdout.buffer.write(line)
except KeyboardInterrupt:
pass
@ -60,16 +64,16 @@ if __name__ == '__main__':
try:
os.remove(stopfile)
except:
print 'Unable to delete stop file.'
print('Unable to delete stop file.')
else:
print 'Bazarr exited.'
print('Bazarr exited.')
os._exit(0)
if os.path.exists(restartfile):
try:
os.remove(restartfile)
except:
print 'Unable to delete restart file.'
print('Unable to delete restart file.')
else:
start_bazarr()

@ -1,6 +1,7 @@
# coding=utf-8
import cPickle as pickle
from __future__ import absolute_import
import six.moves.cPickle as pickle
import base64
import random
import platform
@ -30,7 +31,7 @@ def track_event(category=None, action=None, label=None):
visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
except:
visitor = Visitor()
unique_id = long(random.getrandbits(32))
unique_id = int(random.getrandbits(32))
visitor.unique_id = unique_id
session = Session()

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import os
import logging
import json

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import os
from simpleconfigparser import simpleconfigparser

@ -1,3 +1,4 @@
from __future__ import absolute_import
import os
import atexit

@ -1,3 +1,4 @@
from __future__ import absolute_import
import enzyme
import logging
import os

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import os
import argparse

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import os
import requests
import logging

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import os
import pycountry

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import os
import requests
import logging
@ -13,6 +14,7 @@ from list_subtitles import store_subtitles_movie, list_missing_subtitles_movies,
from get_subtitle import movies_download_subtitles
from database import TableMovies, wal_cleaning
import six
def update_all_movies():
@ -82,7 +84,7 @@ def update_movies():
if movie["path"] != None and movie['movieFile']['relativePath'] != None:
try:
overview = unicode(movie['overview'])
overview = six.text_type(movie['overview'])
except:
overview = ""
try:
@ -136,27 +138,27 @@ def update_movies():
audioCodec = None
# Add movies in radarr to current movies list
current_movies_radarr.append(unicode(movie['tmdbId']))
current_movies_radarr.append(six.text_type(movie['tmdbId']))
if unicode(movie['tmdbId']) in current_movies_db_list:
if six.text_type(movie['tmdbId']) in current_movies_db_list:
movies_to_update.append({'radarr_id': movie["id"],
'title': unicode(movie["title"]),
'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
'tmdb_id': unicode(movie["tmdbId"]),
'poster': unicode(poster),
'fanart': unicode(fanart),
'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
'title': six.text_type(movie["title"]),
'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
'tmdb_id': six.text_type(movie["tmdbId"]),
'poster': six.text_type(poster),
'fanart': six.text_type(fanart),
'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
'scene_name': sceneName,
'monitored': unicode(bool(movie['monitored'])),
'year': unicode(movie['year']),
'sort_title': unicode(movie['sortTitle']),
'alternative_titles': unicode(alternativeTitles),
'format': unicode(format),
'resolution': unicode(resolution),
'video_codec': unicode(videoCodec),
'audio_codec': unicode(audioCodec),
'overview': unicode(overview),
'imdb_id': unicode(imdbId)})
'monitored': six.text_type(bool(movie['monitored'])),
'year': six.text_type(movie['year']),
'sort_title': six.text_type(movie['sortTitle']),
'alternative_titles': six.text_type(alternativeTitles),
'format': six.text_type(format),
'resolution': six.text_type(resolution),
'video_codec': six.text_type(videoCodec),
'audio_codec': six.text_type(audioCodec),
'overview': six.text_type(overview),
'imdb_id': six.text_type(imdbId)})
else:
if movie_default_enabled is True:
movies_to_add.append({'radarr_id': movie["id"],
@ -171,7 +173,7 @@ def update_movies():
'fanart': fanart,
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
'scene_name': sceneName,
'monitored': unicode(bool(movie['monitored'])),
'monitored': six.text_type(bool(movie['monitored'])),
'sort_title': movie['sortTitle'],
'year': movie['year'],
'alternative_titles': alternativeTitles,
@ -191,7 +193,7 @@ def update_movies():
'fanart': fanart,
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
'scene_name': sceneName,
'monitored': unicode(bool(movie['monitored'])),
'monitored': six.text_type(bool(movie['monitored'])),
'sort_title': movie['sortTitle'],
'year': movie['year'],
'alternative_titles': alternativeTitles,

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import os
import datetime
import logging
@ -159,8 +160,8 @@ def provider_throttle(name, exception):
def throttled_count(name):
global throttle_count
if name in throttle_count.keys():
if 'count' in throttle_count[name].keys():
if name in list(throttle_count.keys()):
if 'count' in list(throttle_count[name].keys()):
for key, value in throttle_count[name].items():
if key == 'count':
value += 1

@ -1,5 +1,7 @@
# coding=utf-8
from __future__ import absolute_import
from __future__ import print_function
import os
import requests
import logging
@ -12,6 +14,7 @@ from config import settings, url_sonarr
from list_subtitles import list_missing_subtitles
from database import TableShows
from utils import get_sonarr_version
import six
def update_series():
@ -60,7 +63,7 @@ def update_series():
for i, show in enumerate(r.json(), 1):
notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
try:
overview = unicode(show['overview'])
overview = six.text_type(show['overview'])
except:
overview = ""
try:
@ -82,17 +85,17 @@ def update_series():
current_shows_sonarr.append(show['tvdbId'])
if show['tvdbId'] in current_shows_db_list:
series_to_update.append({'title': unicode(show["title"]),
'path': unicode(show["path"]),
series_to_update.append({'title': six.text_type(show["title"]),
'path': six.text_type(show["path"]),
'tvdb_id': int(show["tvdbId"]),
'sonarr_series_id': int(show["id"]),
'overview': unicode(overview),
'poster': unicode(poster),
'fanart': unicode(fanart),
'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
'sort_title': unicode(show['sortTitle']),
'year': unicode(show['year']),
'alternate_titles': unicode(alternateTitles)})
'overview': six.text_type(overview),
'poster': six.text_type(poster),
'fanart': six.text_type(fanart),
'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
'sort_title': six.text_type(show['sortTitle']),
'year': six.text_type(show['year']),
'alternate_titles': six.text_type(alternateTitles)})
else:
if serie_default_enabled is True:
series_to_add.append({'title': show["title"],
@ -161,9 +164,9 @@ def update_series():
removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))
for series in removed_series:
print TableShows.delete().where(
print(TableShows.delete().where(
TableShows.tvdb_id == series
).execute()
).execute())
logging.debug('BAZARR All series synced from Sonarr into database.')

@ -1,12 +1,13 @@
# coding=utf-8
from __future__ import absolute_import
import os
import sys
import ast
import logging
import subprocess
import time
import cPickle as pickle
import six.moves.cPickle as pickle
import codecs
import types
import re
@ -37,6 +38,9 @@ from database import TableShows, TableEpisodes, TableMovies, TableHistory, Table
from peewee import fn, JOIN
from analytics import track_event
import six
from six.moves import range
from functools import reduce
def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
@ -91,11 +95,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
"""
max_score = 120.0
min_score = max_score * min_score_movie_perc / 100.0
scores = subliminal_scores.movie_scores.keys()
scores = list(subliminal_scores.movie_scores.keys())
if media_type == "series":
max_score = 360.0
min_score = max_score * min_score_series_perc / 100.0
scores = subliminal_scores.episode_scores.keys()
scores = list(subliminal_scores.episode_scores.keys())
if video.is_special:
min_score = max_score * min_score_special_ep / 100.0
@ -119,7 +123,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
hi = "force non-HI"
language_set = set()
if not isinstance(language, types.ListType):
if not isinstance(language, list):
language = [language]
if forced == "True":
@ -185,7 +189,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
saved_any = False
if downloaded_subtitles:
for video, subtitles in downloaded_subtitles.iteritems():
for video, subtitles in six.iteritems(downloaded_subtitles):
if not subtitles:
continue
@ -221,10 +225,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
else:
action = "downloaded"
if video.used_scene_name:
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
else:
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
if use_postprocessing is True:
@ -444,7 +448,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
downloaded_path = saved_subtitle.storage_path
logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
is_forced_string = " forced" if subtitle.language.forced else ""
message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
score) + "% using manual search."
if use_postprocessing is True:
@ -749,7 +753,7 @@ def wanted_download_subtitles(path, l, count_episodes):
for episode in episodes_details:
attempt = episode.failed_attempts
if type(attempt) == unicode:
if type(attempt) == six.text_type:
attempt = ast.literal_eval(attempt)
for language in ast.literal_eval(episode.missing_subtitles):
if attempt is None:
@ -762,7 +766,7 @@ def wanted_download_subtitles(path, l, count_episodes):
TableEpisodes.update(
{
TableEpisodes.failed_attempts: unicode(attempt)
TableEpisodes.failed_attempts: six.text_type(attempt)
}
).where(
TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id
@ -818,7 +822,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
for movie in movies_details:
attempt = movie.failed_attempts
if type(attempt) == unicode:
if type(attempt) == six.text_type:
attempt = ast.literal_eval(attempt)
for language in ast.literal_eval(movie.missing_subtitles):
if attempt is None:
@ -831,7 +835,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
TableMovies.update(
{
TableMovies.failed_attempts: unicode(attempt)
TableMovies.failed_attempts: six.text_type(attempt)
}
).where(
TableMovies.radarr_id == movie.radarr_id
@ -991,7 +995,7 @@ def refine_from_db(path, video):
TableMovies.audio_codec,
TableMovies.imdb_id
).where(
TableMovies.path == unicode(path_replace_reverse_movie(path))
TableMovies.path == six.text_type(path_replace_reverse_movie(path))
).first()
if data:

@ -1,4 +1,5 @@
# coding=utf-8
from __future__ import absolute_import
import ast
import os
import re
@ -126,7 +127,7 @@ def force_unicode(s):
:param s: string
:return: unicode string
"""
if not isinstance(s, types.UnicodeType):
if not isinstance(s, str):
try:
s = s.decode("utf-8")
except UnicodeDecodeError:

@ -1,12 +1,13 @@
# coding=utf-8
from __future__ import absolute_import
import os
import logging
import time
import rarfile
from cork import Cork
from ConfigParser2 import ConfigParser
from backports import configparser2
from config import settings
from check_update import check_releases
from get_args import args
@ -66,7 +67,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')):
config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini'))
cfg = ConfigParser()
cfg = configparser2.ConfigParser()
def init_binaries():

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import os
import sys

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import gc
import os
import babelfish
@ -24,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
from queueconfig import notifications
from embedded_subs_reader import embedded_subs_reader
import six
gc.enable()
@ -63,7 +65,7 @@ def store_subtitles(file):
logging.exception("BAZARR unable to index external subtitles.")
pass
else:
for subtitle, language in subtitles.iteritems():
for subtitle, language in six.iteritems(subtitles):
subtitle_path = get_external_subtitles_path(file, subtitle)
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
logging.debug("BAZARR external subtitles detected: " + "pb")
@ -155,7 +157,7 @@ def store_subtitles_movie(file):
logging.exception("BAZARR unable to index external subtitles.")
pass
else:
for subtitle, language in subtitles.iteritems():
for subtitle, language in six.iteritems(subtitles):
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
logging.debug("BAZARR external subtitles detected: " + "pb")
actual_subtitles.append(

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import os
import logging
import re
@ -9,6 +10,7 @@ import platform
from logging.handlers import TimedRotatingFileHandler
from get_args import args
from config import settings
import six
logger = logging.getLogger()
@ -107,10 +109,10 @@ class MyFilter(logging.Filter):
class ArgsFilteringFilter(logging.Filter):
def filter_args(self, record, func):
if isinstance(record.args, (types.ListType, types.TupleType)):
if isinstance(record.args, (list, tuple)):
final_args = []
for arg in record.args:
if not isinstance(arg, basestring):
if not isinstance(arg, six.string_types):
final_args.append(arg)
continue
@ -118,7 +120,7 @@ class ArgsFilteringFilter(logging.Filter):
record.args = type(record.args)(final_args)
elif isinstance(record.args, dict):
for key, arg in record.args.items():
if not isinstance(arg, basestring):
if not isinstance(arg, six.string_types):
continue
record.args[key] = func(arg)

@ -1,5 +1,8 @@
# coding=utf-8
import six
from six.moves import zip
from functools import reduce
bazarr_version = '0.8.2'
import gc
@ -12,7 +15,7 @@ import pretty
import math
import ast
import hashlib
import urllib
import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
import warnings
import queueconfig
import platform
@ -1575,12 +1578,12 @@ def save_settings():
settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
unicode(settings.general.getboolean('use_radarr')), unicode(settings.general.path_mappings_movie))
after = (unicode(settings_general_ip), int(settings_general_port), unicode(settings_general_baseurl),
unicode(settings_general_pathmapping), unicode(settings_general_use_sonarr),
unicode(settings_general_use_radarr), unicode(settings_general_pathmapping_movie))
before = (six.text_type(settings.general.ip), int(settings.general.port), six.text_type(settings.general.base_url),
six.text_type(settings.general.path_mappings), six.text_type(settings.general.getboolean('use_sonarr')),
six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))
settings.general.ip = text_type(settings_general_ip)
settings.general.port = text_type(settings_general_port)
@ -1645,7 +1648,7 @@ def save_settings():
settings_proxy_password = request.forms.get('settings_proxy_password')
settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
if before_proxy_password[0] != settings_proxy_type:
configured()
if before_proxy_password[1] == settings_proxy_password:
@ -2029,7 +2032,7 @@ def remove_subtitles():
history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
except OSError as e:
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
store_subtitles(unicode(episodePath))
store_subtitles(six.text_type(episodePath))
list_missing_subtitles(sonarrSeriesId)
@ -2048,7 +2051,7 @@ def remove_subtitles_movie():
history_log_movie(0, radarrId, result)
except OSError as e:
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
store_subtitles_movie(unicode(moviePath))
store_subtitles_movie(six.text_type(moviePath))
list_missing_subtitles_movies(radarrId)
@ -2082,7 +2085,7 @@ def get_subtitle():
score = result[4]
history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
store_subtitles(unicode(episodePath))
store_subtitles(six.text_type(episodePath))
list_missing_subtitles(sonarrSeriesId)
redirect(ref)
except OSError:
@ -2140,7 +2143,7 @@ def manual_get_subtitle():
score = result[4]
history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
store_subtitles(unicode(episodePath))
store_subtitles(six.text_type(episodePath))
list_missing_subtitles(sonarrSeriesId)
redirect(ref)
except OSError:
@ -2184,7 +2187,7 @@ def perform_manual_upload_subtitle():
score = 360
history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
store_subtitles(unicode(episodePath))
store_subtitles(six.text_type(episodePath))
list_missing_subtitles(sonarrSeriesId)
redirect(ref)
@ -2221,7 +2224,7 @@ def get_subtitle_movie():
score = result[4]
history_log_movie(1, radarrId, message, path, language_code, provider, score)
send_notifications_movie(radarrId, message)
store_subtitles_movie(unicode(moviePath))
store_subtitles_movie(six.text_type(moviePath))
list_missing_subtitles_movies(radarrId)
redirect(ref)
except OSError:
@ -2277,7 +2280,7 @@ def manual_get_subtitle_movie():
score = result[4]
history_log_movie(2, radarrId, message, path, language_code, provider, score)
send_notifications_movie(radarrId, message)
store_subtitles_movie(unicode(moviePath))
store_subtitles_movie(six.text_type(moviePath))
list_missing_subtitles_movies(radarrId)
redirect(ref)
except OSError:
@ -2320,7 +2323,7 @@ def perform_manual_upload_subtitle_movie():
score = 120
history_log_movie(4, radarrId, message, path, language_code, provider, score)
send_notifications_movie(radarrId, message)
store_subtitles_movie(unicode(moviePath))
store_subtitles_movie(six.text_type(moviePath))
list_missing_subtitles_movies(radarrId)
redirect(ref)
@ -2421,7 +2424,7 @@ def api_history():
@route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
@custom_auth_basic(check_credentials)
def test_url(protocol, url):
url = urllib.unquote(url)
url = six.moves.urllib.parse.unquote(url)
try:
result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
except:
@ -2433,7 +2436,7 @@ def test_url(protocol, url):
@route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
@custom_auth_basic(check_credentials)
def test_notification(protocol, provider):
provider = urllib.unquote(provider)
provider = six.moves.urllib.parse.unquote(provider)
apobj = apprise.Apprise()
apobj.add(protocol + "://" + provider)

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import apprise
import os
import logging

@ -1,3 +1,4 @@
from __future__ import absolute_import
from collections import deque
import json

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
from get_episodes import sync_episodes, update_all_episodes
from get_movies import update_movies, update_all_movies
from get_series import update_series

@ -1,5 +1,6 @@
# coding=utf-8
from __future__ import absolute_import
import os
import time
import platform

@ -1,797 +0,0 @@
"""Configuration file parser.
A setup file consists of sections, lead by a "[section]" header,
and followed by "name: value" entries, with continuations and such in
the style of RFC 822.
The option values can contain format strings which refer to other values in
the same section, or values in a special [DEFAULT] section.
For example:
something: %(dir)s/whatever
would resolve the "%(dir)s" to the value of dir. All reference
expansions are done late, on demand.
Intrinsic defaults can be specified by passing them into the
ConfigParser constructor as a dictionary.
class:
ConfigParser -- responsible for parsing a list of
configuration files, and managing the parsed database.
methods:
__init__(defaults=None)
create the parser and specify a dictionary of intrinsic defaults. The
keys must be strings, the values must be appropriate for %()s string
interpolation. Note that `__name__' is always an intrinsic default;
its value is the section's name.
sections()
return all the configuration section names, sans DEFAULT
has_section(section)
return whether the given section exists
has_option(section, option)
return whether the given option exists in the given section
options(section)
return list of configuration options for the named section
read(filenames)
read and parse the list of named configuration files, given by
name. A single filename is also allowed. Non-existing files
are ignored. Return list of successfully read files.
readfp(fp, filename=None)
read and parse one configuration file, given as a file object.
The filename defaults to fp.name; it is only used in error
messages (if fp has no `name' attribute, the string `<???>' is used).
get(section, option, raw=False, vars=None)
return a string value for the named option. All % interpolations are
expanded in the return values, based on the defaults passed into the
constructor and the DEFAULT section. Additional substitutions may be
provided using the `vars' argument, which must be a dictionary whose
contents override any pre-existing defaults.
getint(section, options)
like get(), but convert value to an integer
getfloat(section, options)
like get(), but convert value to a float
getboolean(section, options)
like get(), but convert value to a boolean (currently case
insensitively defined as 0, false, no, off for False, and 1, true,
yes, on for True). Returns False or True.
items(section, raw=False, vars=None)
return a list of tuples with (name, value) for each option
in the section.
remove_section(section)
remove the given file section and all its options
remove_option(section, option)
remove the given option from the given section
set(section, option, value)
set the given option
write(fp)
write the configuration state in .ini format
"""
try:
from collections import OrderedDict as _default_dict
except ImportError:
# fallback for setup.py which hasn't yet built _collections
_default_dict = dict
import re
__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError",
"InterpolationError", "InterpolationDepthError",
"InterpolationSyntaxError", "ParsingError",
"MissingSectionHeaderError",
"ConfigParser", "SafeConfigParser", "RawConfigParser",
"DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"]
DEFAULTSECT = "DEFAULT"
MAX_INTERPOLATION_DEPTH = 10
# exception classes
class Error(Exception):
"""Base class for ConfigParser exceptions."""
def _get_message(self):
"""Getter for 'message'; needed only to override deprecation in
BaseException."""
return self.__message
def _set_message(self, value):
"""Setter for 'message'; needed only to override deprecation in
BaseException."""
self.__message = value
# BaseException.message has been deprecated since Python 2.6. To prevent
# DeprecationWarning from popping up over this pre-existing attribute, use
# a new property that takes lookup precedence.
message = property(_get_message, _set_message)
def __init__(self, msg=''):
self.message = msg
Exception.__init__(self, msg)
def __repr__(self):
return self.message
__str__ = __repr__
class NoSectionError(Error):
"""Raised when no section matches a requested option."""
def __init__(self, section):
Error.__init__(self, 'No section: %r' % (section,))
self.section = section
self.args = (section, )
class DuplicateSectionError(Error):
"""Raised when a section is multiply-created."""
def __init__(self, section):
Error.__init__(self, "Section %r already exists" % section)
self.section = section
self.args = (section, )
class NoOptionError(Error):
"""A requested option was not found."""
def __init__(self, option, section):
Error.__init__(self, "No option %r in section: %r" %
(option, section))
self.option = option
self.section = section
self.args = (option, section)
class InterpolationError(Error):
"""Base class for interpolation-related exceptions."""
def __init__(self, option, section, msg):
Error.__init__(self, msg)
self.option = option
self.section = section
self.args = (option, section, msg)
class InterpolationMissingOptionError(InterpolationError):
"""A string substitution required a setting which was not available."""
def __init__(self, option, section, rawval, reference):
msg = ("Bad value substitution:\n"
"\tsection: [%s]\n"
"\toption : %s\n"
"\tkey : %s\n"
"\trawval : %s\n"
% (section, option, reference, rawval))
InterpolationError.__init__(self, option, section, msg)
self.reference = reference
self.args = (option, section, rawval, reference)
class InterpolationSyntaxError(InterpolationError):
"""Raised when the source text into which substitutions are made
does not conform to the required syntax."""
class InterpolationDepthError(InterpolationError):
"""Raised when substitutions are nested too deeply."""
def __init__(self, option, section, rawval):
msg = ("Value interpolation too deeply recursive:\n"
"\tsection: [%s]\n"
"\toption : %s\n"
"\trawval : %s\n"
% (section, option, rawval))
InterpolationError.__init__(self, option, section, msg)
self.args = (option, section, rawval)
class ParsingError(Error):
"""Raised when a configuration file does not follow legal syntax."""
def __init__(self, filename):
Error.__init__(self, 'File contains parsing errors: %s' % filename)
self.filename = filename
self.errors = []
self.args = (filename, )
def append(self, lineno, line):
self.errors.append((lineno, line))
self.message += '\n\t[line %2d]: %s' % (lineno, line)
class MissingSectionHeaderError(ParsingError):
"""Raised when a key-value pair is found before any section header."""
def __init__(self, filename, lineno, line):
Error.__init__(
self,
'File contains no section headers.\nfile: %s, line: %d\n%r' %
(filename, lineno, line))
self.filename = filename
self.lineno = lineno
self.line = line
self.args = (filename, lineno, line)
class RawConfigParser:
def __init__(self, defaults=None, dict_type=_default_dict,
allow_no_value=False):
self._dict = dict_type
self._sections = self._dict()
self._defaults = self._dict()
if allow_no_value:
self._optcre = self.OPTCRE_NV
else:
self._optcre = self.OPTCRE
if defaults:
for key, value in defaults.items():
self._defaults[self.optionxform(key)] = value
self.comment_store = None ## used for storing comments in ini
def defaults(self):
return self._defaults
def sections(self):
"""Return a list of section names, excluding [DEFAULT]"""
# self._sections will never have [DEFAULT] in it
return self._sections.keys()
def add_section(self, section):
"""Create a new section in the configuration.
Raise DuplicateSectionError if a section by the specified name
already exists. Raise ValueError if name is DEFAULT or any of it's
case-insensitive variants.
"""
if section.lower() == "default":
raise ValueError, 'Invalid section name: %s' % section
if section in self._sections:
raise DuplicateSectionError(section)
self._sections[section] = self._dict()
def has_section(self, section):
"""Indicate whether the named section is present in the configuration.
The DEFAULT section is not acknowledged.
"""
return section in self._sections
def options(self, section):
"""Return a list of option names for the given section name."""
try:
opts = self._sections[section].copy()
except KeyError:
raise NoSectionError(section)
opts.update(self._defaults)
if '__name__' in opts:
del opts['__name__']
return opts.keys()
def read(self, filenames):
"""Read and parse a filename or a list of filenames.
Files that cannot be opened are silently ignored; this is
designed so that you can specify a list of potential
configuration file locations (e.g. current directory, user's
home directory, systemwide directory), and all existing
configuration files in the list will be read. A single
filename may also be given.
Return list of successfully read files.
"""
if isinstance(filenames, basestring):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
fp = open(filename)
except IOError:
continue
self._read(fp, filename)
fp.close()
read_ok.append(filename)
return read_ok
def readfp(self, fp, filename=None):
"""Like read() but the argument must be a file-like object.
The `fp' argument must have a `readline' method. Optional
second argument is the `filename', which if not given, is
taken from fp.name. If fp has no `name' attribute, `<???>' is
used.
"""
if filename is None:
try:
filename = fp.name
except AttributeError:
filename = '<???>'
self._read(fp, filename)
def get(self, section, option):
opt = self.optionxform(option)
if section not in self._sections:
if section != DEFAULTSECT:
raise NoSectionError(section)
if opt in self._defaults:
return self._defaults[opt]
else:
raise NoOptionError(option, section)
elif opt in self._sections[section]:
return self._sections[section][opt]
elif opt in self._defaults:
return self._defaults[opt]
else:
raise NoOptionError(option, section)
def items(self, section):
try:
d2 = self._sections[section]
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
d2 = self._dict()
d = self._defaults.copy()
d.update(d2)
if "__name__" in d:
del d["__name__"]
return d.items()
def _get(self, section, conv, option):
return conv(self.get(section, option))
def getint(self, section, option):
return self._get(section, int, option)
def getfloat(self, section, option):
return self._get(section, float, option)
_boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
'0': False, 'no': False, 'false': False, 'off': False}
def getboolean(self, section, option):
v = self.get(section, option)
if v.lower() not in self._boolean_states:
raise ValueError, 'Not a boolean: %s' % v
return self._boolean_states[v.lower()]
def optionxform(self, optionstr):
return optionstr.lower()
def has_option(self, section, option):
"""Check for the existence of a given option in a given section."""
if not section or section == DEFAULTSECT:
option = self.optionxform(option)
return option in self._defaults
elif section not in self._sections:
return False
else:
option = self.optionxform(option)
return (option in self._sections[section]
or option in self._defaults)
def set(self, section, option, value=None):
"""Set an option."""
if not section or section == DEFAULTSECT:
sectdict = self._defaults
else:
try:
sectdict = self._sections[section]
except KeyError:
raise NoSectionError(section)
sectdict[self.optionxform(option)] = value
def write(self, fp):
"""Write an .ini-format representation of the configuration state."""
if self._defaults:
fp.write("[%s]\n" % DEFAULTSECT)
for (key, value) in self._defaults.items():
fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
fp.write("\n")
for section in self._sections:
fp.write("[%s]\n" % section)
for (key, value) in self._sections[section].items():
if key == "__name__":
continue
if (value is not None) or (self._optcre == self.OPTCRE):
key = " = ".join((key, str(value).replace('\n', '\n\t')))
fp.write("%s\n" % (key))
fp.write("\n")
def remove_option(self, section, option):
"""Remove an option."""
if not section or section == DEFAULTSECT:
sectdict = self._defaults
else:
try:
sectdict = self._sections[section]
except KeyError:
raise NoSectionError(section)
option = self.optionxform(option)
existed = option in sectdict
if existed:
del sectdict[option]
return existed
def remove_section(self, section):
"""Remove a file section."""
existed = section in self._sections
if existed:
del self._sections[section]
return existed
#
# Regular expressions for parsing section headers and options.
#
SECTCRE = re.compile(
r'\[' # [
r'(?P<header>[^]]+)' # very permissive!
r'\]' # ]
)
OPTCRE = re.compile(
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
r'\s*(?P<vi>[:=])\s*' # any number of space/tab,
# followed by separator
# (either : or =), followed
# by any # space/tab
r'(?P<value>.*)$' # everything up to eol
)
OPTCRE_NV = re.compile(
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
r'\s*(?:' # any number of space/tab,
r'(?P<vi>[:=])\s*' # optionally followed by
# separator (either : or
# =), followed by any #
# space/tab
r'(?P<value>.*))?$' # everything up to eol
)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
comment_store = {}
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' :
continue
### store comments for doc purposes
### Deal with cases of sections and options being there or not
if line[0] in '#;' and cursect is not None:
if optname is None:
comment_store.setdefault(cursect['__name__'] +
"::" + "global",[]).append(line)
else:
comment_store.setdefault(cursect['__name__'] +
"::" + optname,[]).append(line)
continue
elif line[0] in '#;' and cursect is None:
comment_store.setdefault("global" +
"::" + optname,[]).append(line)
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname].append(value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
elif sectname == DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
raise MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self._optcre.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
optname = self.optionxform(optname.rstrip())
# This check is fine because the OPTCRE cannot
# match if it would set optval to None
if optval is not None:
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos-1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
cursect[optname] = [optval]
else:
# valueless option handling
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
e = ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
# join the multi-line values collected while reading
all_sections = [self._defaults]
all_sections.extend(self._sections.values())
for options in all_sections:
for name, val in options.items():
if isinstance(val, list):
options[name] = '\n'.join(val)
self.comment_store = comment_store
def ini_as_rst(self):
"""trivial helper function to putput comment_stroe as rest
.. todo:: write actual doctests with string input
>> p = ConfigParser2.SafeConfigParser()
>> p.read(f)
['/usr/home/pbrian/src/public/configparser2/example.ini']
>> open("/tmp/foo.rst", "w").write(p.ini_as_rst())
"""
outstr = ".. rst version of ini file\n\n"
_cursectname = None
for item in sorted(self.comment_store.keys()):
_sect, _opt = item.split("::")
if _sect != _cursectname:
outstr += "\n%s\n%s\n" % (_sect, "-"* len(_sect))
_cursectname = _sect
txt = " ".join(self.comment_store[item])
txt = txt.replace("#", "").replace(";","")
outstr += ":%s: %s" % (_opt, txt)
return outstr
import UserDict as _UserDict
class _Chainmap(_UserDict.DictMixin):
"""Combine multiple mappings for successive lookups.
For example, to emulate Python's normal lookup sequence:
import __builtin__
pylookup = _Chainmap(locals(), globals(), vars(__builtin__))
"""
def __init__(self, *maps):
self._maps = maps
def __getitem__(self, key):
for mapping in self._maps:
try:
return mapping[key]
except KeyError:
pass
raise KeyError(key)
def keys(self):
result = []
seen = set()
for mapping in self._maps:
for key in mapping:
if key not in seen:
result.append(key)
seen.add(key)
return result
class ConfigParser(RawConfigParser):
def get(self, section, option, raw=False, vars=None):
"""Get an option value for a given section.
If `vars' is provided, it must be a dictionary. The option is looked up
in `vars' (if provided), `section', and in `defaults' in that order.
All % interpolations are expanded in the return values, unless the
optional argument `raw' is true. Values for interpolation keys are
looked up in the same manner as the option.
The section DEFAULT is special.
"""
sectiondict = {}
try:
sectiondict = self._sections[section]
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
# Update with the entry specific variables
vardict = {}
if vars:
for key, value in vars.items():
vardict[self.optionxform(key)] = value
d = _Chainmap(vardict, sectiondict, self._defaults)
option = self.optionxform(option)
try:
value = d[option]
except KeyError:
raise NoOptionError(option, section)
if raw or value is None:
return value
else:
return self._interpolate(section, option, value, d)
def items(self, section, raw=False, vars=None):
"""Return a list of tuples with (name, value) for each option
in the section.
All % interpolations are expanded in the return values, based on the
defaults passed into the constructor, unless the optional argument
`raw' is true. Additional substitutions may be provided using the
`vars' argument, which must be a dictionary whose contents overrides
any pre-existing defaults.
The section DEFAULT is special.
"""
d = self._defaults.copy()
try:
d.update(self._sections[section])
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
# Update with the entry specific variables
if vars:
for key, value in vars.items():
d[self.optionxform(key)] = value
options = d.keys()
if "__name__" in options:
options.remove("__name__")
if raw:
return [(option, d[option])
for option in options]
else:
return [(option, self._interpolate(section, option, d[option], d))
for option in options]
def _interpolate(self, section, option, rawval, vars):
# do the string interpolation
value = rawval
depth = MAX_INTERPOLATION_DEPTH
while depth: # Loop through this until it's done
depth -= 1
if value and "%(" in value:
value = self._KEYCRE.sub(self._interpolation_replace, value)
try:
value = value % vars
except KeyError, e:
raise InterpolationMissingOptionError(
option, section, rawval, e.args[0])
else:
break
if value and "%(" in value:
raise InterpolationDepthError(option, section, rawval)
return value
_KEYCRE = re.compile(r"%\(([^)]*)\)s|.")
def _interpolation_replace(self, match):
s = match.group(1)
if s is None:
return match.group()
else:
return "%%(%s)s" % self.optionxform(s)
class SafeConfigParser(ConfigParser):
def _interpolate(self, section, option, rawval, vars):
# do the string interpolation
L = []
self._interpolate_some(option, L, rawval, section, vars, 1)
return ''.join(L)
_interpvar_re = re.compile(r"%\(([^)]+)\)s")
def _interpolate_some(self, option, accum, rest, section, map, depth):
if depth > MAX_INTERPOLATION_DEPTH:
raise InterpolationDepthError(option, section, rest)
while rest:
p = rest.find("%")
if p < 0:
accum.append(rest)
return
if p > 0:
accum.append(rest[:p])
rest = rest[p:]
# p is no longer used
c = rest[1:2]
if c == "%":
accum.append("%")
rest = rest[2:]
elif c == "(":
m = self._interpvar_re.match(rest)
if m is None:
raise InterpolationSyntaxError(option, section,
"bad interpolation variable reference %r" % rest)
var = self.optionxform(m.group(1))
rest = rest[m.end():]
try:
v = map[var]
except KeyError:
raise InterpolationMissingOptionError(
option, section, rest, var)
if "%" in v:
self._interpolate_some(option, accum, v,
section, map, depth + 1)
else:
accum.append(v)
else:
raise InterpolationSyntaxError(
option, section,
"'%%' must be followed by '%%' or '(', found: %r" % (rest,))
def set(self, section, option, value=None):
"""Set an option. Extend ConfigParser.set: check for string values."""
# The only legal non-string value if we allow valueless
# options is None, so we need to check if the value is a
# string if:
# - we do not allow valueless options, or
# - we allow valueless options but the value is not None
if self._optcre is self.OPTCRE or value:
if not isinstance(value, basestring):
raise TypeError("option values must be strings")
if value is not None:
# check for bad percent signs:
# first, replace all "good" interpolations
tmp_value = value.replace('%%', '')
tmp_value = self._interpvar_re.sub('', tmp_value)
# then, check if there's a lone percent sign left
if '%' in tmp_value:
raise ValueError("invalid interpolation syntax in %r at "
"position %d" % (value, tmp_value.find('%')))
ConfigParser.set(self, section, option, value)

@ -1,43 +0,0 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

@ -1,27 +0,0 @@
Beautiful Soup is made available under the MIT license:
Copyright (c) 2004-2015 Leonard Richardson
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license. Copyright (c) 2006-2013
James Graham and other contributors

File diff suppressed because it is too large Load Diff

@ -1,63 +0,0 @@
= Introduction =
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
<body>
<p>
Some
<b>
bad
<i>
HTML
</i>
</b>
</p>
</body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8">
<tag1>
Some
<tag2 />
bad
<tag3>
XML
</tag3>
</tag1>
= Full documentation =
The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.
= Running the unit tests =
Beautiful Soup supports unit test discovery from the project root directory:
$ nosetests
$ python -m unittest discover -s bs4 # Python 2.7 and up
If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.
= Links =
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/

@ -1,31 +0,0 @@
Additions
---------
More of the jQuery API: nextUntil?
Optimizations
-------------
The html5lib tree builder doesn't use the standard tree-building API,
which worries me and has resulted in a number of bugs.
markup_attr_map can be optimized since it's always a map now.
Upon encountering UTF-16LE data or some other uncommon serialization
of Unicode, UnicodeDammit will convert the data to Unicode, then
encode it at UTF-8. This is wasteful because it will just get decoded
back to Unicode.
CDATA
-----
The elementtree XMLParser has a strip_cdata argument that, when set to
False, should allow Beautiful Soup to preserve CDATA sections instead
of treating them as text. Except it doesn't. (This argument is also
present for HTMLParser, and also does nothing there.)
Currently, htm5lib converts CDATA sections into comments. An
as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.

@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__version__ = "4.8.0"
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
import sys
import traceback
import warnings
@ -50,7 +49,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
@ -82,16 +81,56 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
"""Constructor.
:param markup: A string or a file-like object representing
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A TreeBuilder subclass to instantiate (or
instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4; they will result in a warning and then be ignored.
Apart from this, any keyword arguments passed into the BeautifulSoup
constructor are propagated to the TreeBuilder constructor. This
makes it possible to configure a TreeBuilder beyond saying
which one to use.
"""
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@ -142,18 +181,22 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if from_encoding and isinstance(markup, unicode):
if from_encoding and isinstance(markup, str):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
# We need this information to track whether or not the builder
# was specified well enough that we can omit the 'you need to
# specify a parser' warning.
original_builder = builder
original_features = features
if isinstance(features, basestring):
if isinstance(builder, type):
# A builder class was passed in; it needs to be instantiated.
builder_class = builder
builder = None
elif builder is None:
if isinstance(features, str):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@ -163,41 +206,73 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES):
# At this point either we have a TreeBuilder instance in
# builder, or we have a builder_class that we can instantiate
# with the remaining **kwargs.
if builder is None:
builder = builder_class(**kwargs)
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
):
if builder.is_xml:
markup_type = "XML"
else:
markup_type = "HTML"
caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
# This code adapted from warnings.py so that we get the same line
# of code as our warnings.warn() call gets, even if the answer is wrong
# (as it may be in a multithreading situation).
caller = None
try:
caller = sys._getframe(1)
except ValueError:
pass
if caller:
globals = caller.f_globals
line_number = caller.f_lineno
else:
globals = sys.__dict__
line_number= 1
filename = globals.get('__file__')
if filename:
fnl = filename.lower()
if fnl.endswith((".pyc", ".pyo")):
filename = filename[:-1]
if filename:
# If there is no filename at all, the user is most likely in a REPL,
# and the warning is not necessary.
values = dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
self.builder.soup = self
self._namespaces = dict()
self.parse_only = parse_only
self.builder.initialize_soup(self)
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, unicode) and not u'<' in markup)
or (isinstance(markup, str) and not '<' in markup)
):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
if (isinstance(markup, unicode)
if (isinstance(markup, str)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@ -205,13 +280,13 @@ class BeautifulSoup(Tag):
is_file = False
try:
is_file = os.path.exists(possible_filename)
except Exception, e:
except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
if isinstance(markup, unicode):
if isinstance(markup, str):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
@ -263,9 +338,9 @@ class BeautifulSoup(Tag):
if isinstance(markup, bytes):
space = b' '
cant_start_with = (b"http:", b"https:")
elif isinstance(markup, unicode):
space = u' '
cant_start_with = (u"http:", u"https:")
elif isinstance(markup, str):
space = ' '
cant_start_with = ("http:", "https:")
else:
return
@ -302,9 +377,10 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
kwattrs.update(attrs)
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
@ -327,7 +403,7 @@ class BeautifulSoup(Tag):
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
@ -336,7 +412,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString):
if self.current_data:
current_data = u''.join(self.current_data)
current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space
# or newline.
@ -366,60 +442,71 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
previous_element = most_recent_element or self._most_recent_element
if parent is None:
parent = self.currentTag
if most_recent_element is not None:
previous_element = most_recent_element
else:
previous_element = self._most_recent_element
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
if not previous_element:
if previous_element is None:
previous_element = o.previous_element
fix = parent.next_element is not None
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o
parent.contents.append(o)
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
index = len(parent.contents)-1
while index >= 0:
if parent.contents[index] is o:
# Check if we are inserting into an already parsed node.
if fix:
self._linkage_fixer(parent)
def _linkage_fixer(self, el):
"""Make sure linkage of this fragment is sound."""
first = el.contents[0]
child = el.contents[-1]
descendant = child
if child is first and el.parent is not None:
# Parent should be linked to first child
el.next_element = child
# We are no longer linked to whatever this element is
prev_el = child.previous_element
if prev_el is not None and prev_el is not el:
prev_el.next_element = None
# First child should be linked to the parent, and no previous siblings.
child.previous_element = el
child.previous_sibling = None
# We have no sibling as we've been appended as the last.
child.next_sibling = None
# This index is a tag, dig deeper for a "last descendant"
if isinstance(child, Tag) and child.contents:
descendant = child._last_descendant(False)
# As the final step, link last descendant. It should be linked
# to the parent's next sibling (if found), else walk up the chain
# and find a parent with a sibling. It should have no next sibling.
descendant.next_element = None
descendant.next_sibling = None
target = el
while True:
if target is None:
break
index -= 1
else:
raise ValueError(
"Error building tree: supposedly %r was inserted "
"into %r after the fact, but I don't see it!" % (
o, parent
)
)
if index == 0:
previous_element = parent
previous_sibling = None
else:
previous_element = previous_sibling = parent.contents[index-1]
if index == len(parent.contents)-1:
next_element = parent.next_sibling
next_sibling = None
else:
next_element = next_sibling = parent.contents[index+1]
o.previous_element = previous_element
if previous_element:
previous_element.next_element = o
o.next_element = next_element
if next_element:
next_element.previous_element = o
o.next_sibling = next_sibling
if next_sibling:
next_sibling.previous_sibling = o
o.previous_sibling = previous_sibling
if previous_sibling:
previous_sibling.next_sibling = o
elif target.next_sibling is not None:
descendant.next_element = target.next_sibling
target.next_sibling.previous_element = child
break
target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
@ -465,7 +552,7 @@ class BeautifulSoup(Tag):
self.currentTag, self._most_recent_element)
if tag is None:
return tag
if self._most_recent_element:
if self._most_recent_element is not None:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
@ -490,9 +577,9 @@ class BeautifulSoup(Tag):
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
prefix = ''
if not pretty_print:
indent_level = None
else:
@ -526,4 +613,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
print(soup.prettify())

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
from collections import defaultdict
import itertools
@ -7,8 +7,7 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
whitespace_re
nonwhitespace_re
)
__all__ = [
@ -90,17 +89,45 @@ class TreeBuilder(object):
is_xml = False
picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
def __init__(self):
USE_DEFAULT = object()
def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this do a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
:param preserve_whitespace_tags:
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
"""
self.soup = soup
def reset(self):
pass
@ -160,14 +187,14 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
for attr in attrs.keys():
for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
if isinstance(value, str):
values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
# These are from HTML4, removed in HTML5.
'spacer', 'frame'
# These are from earlier versions of HTML and are removed in HTML5.
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
# The HTML standard defines these as block-level elements. Beautiful
# Soup does not treat these elements differently from other elements,
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
'HTML5TreeBuilder',
@ -15,7 +15,7 @@ from bs4.builder import (
)
from bs4.element import (
NamespacedAttribute,
whitespace_re,
nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
@ -33,7 +33,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
except ImportError, e:
except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
if not isinstance(markup, unicode):
if not isinstance(markup, str):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
if not isinstance(original_encoding, basestring):
if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
for name, value in element.attrs.items():
for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
@ -199,14 +199,14 @@ class AttrList(object):
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes
list_attr = self.element.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
value = whitespace_re.split(value)
value = nonwhitespace_re.findall(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
if isinstance(node, basestring):
if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
child = node.element
node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
if not isinstance(child, str) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
if new_parents_last_descendant:
if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant:
if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
if new_parents_last_child:
if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.

@ -1,17 +1,18 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import HTMLParser
from html.parser import HTMLParser
try:
from HTMLParser import HTMLParseError
except ImportError, e:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
@ -65,6 +66,17 @@ class BeautifulSoupHTMLParser(HTMLParser):
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although this
requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() as raising an exception.
In any event, this method is called only on very strange markup and our best strategy
is to pretend it didn't happen and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
data = None
if real_name < 256:
# HTML numeric entities are supposed to reference Unicode
# code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError as e:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
data = "&%s;" % name
# If this were XML, it would be ambiguous whether "&foo"
# was an character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs)
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
if isinstance(markup, str):
yield (markup, None, None, False)
return
@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e

@ -1,13 +1,18 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
try:
from collections.abc import Callable # Python 3.6
except ImportError as e:
from collections import Callable
from io import BytesIO
from StringIO import StringIO
import collections
from io import StringIO
from lxml import etree
from bs4.element import (
Comment,
@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
def _invert(d):
"Invert a dictionary."
return dict((v,k) for k, v in list(d.items()))
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
def _register_namespaces(self, mapping):
"""Let the BeautifulSoup object know about namespaces encountered
while parsing the document.
This might be useful later on when creating CSS selectors.
"""
for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
def default_parser(self, encoding):
# This can either return a parser object or a class, which
@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, collections.Callable):
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
def __init__(self, parser=None, empty_element_tags=None):
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@ -71,7 +102,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else:
self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, unicode):
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode):
if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(self.nsmaps) > 1:
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# First, Let the BeautifulSoup object know about it.
self._register_namespaces(nsmap)
# Then, add it to our running list of inverted namespace
# mappings.
self.nsmaps.append(_invert(nsmap))
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
return '<html><body>%s</body></html>' % fragment

@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import codecs
from htmlentitydefs import codepoint2name
from html.entities import codepoint2name
import re
import logging
import string
@ -46,9 +45,9 @@ except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@ -58,15 +57,24 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
# entity. We don't want to use it, but we want to recognize it on the way in.
#
# TODO: Ideally we would be able to recognize all HTML 5 named
# entities, but that's a little tricky.
extra = [(39, 'apos')]
for codepoint, name in list(codepoint2name.items()) + extra:
character = chr(codepoint)
if codepoint not in (34, 39):
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
# &quot; or the single quote into &apos;, unless it
# happens within an attribute value, which is handled
# elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
# But we do want to recognize those entities on the way in and
# convert them to Unicode characters.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
@ -82,7 +90,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@ -274,7 +282,7 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
if isinstance(data, unicode):
if isinstance(data, str):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@ -352,9 +360,9 @@ class UnicodeDammit:
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':
if isinstance(markup, str) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
self.unicode_markup = str(markup)
self.original_encoding = None
return
@ -438,7 +446,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors)
return str(data, encoding, errors)
@property
def declared_html_encoding(self):

@ -1,12 +1,11 @@
"""Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
from io import StringIO
from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@ -22,8 +21,8 @@ import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
print("Diagnostic running on Beautiful Soup %s" % __version__)
print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@ -32,16 +31,16 @@ def diagnose(data):
break
else:
basic_parsers.remove(name)
print (
print((
"I noticed that %s is not installed. Installing it may help." %
name)
name))
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
except ImportError, e:
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
except ImportError as e:
print (
"lxml is not installed or couldn't be imported.")
@ -49,37 +48,43 @@ def diagnose(data):
if 'html5lib' in basic_parsers:
try:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
except ImportError, e:
print("Found html5lib version %s" % html5lib.__version__)
except ImportError as e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
print
else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
soup = BeautifulSoup(data, features=parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
except Exception as e:
print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print("Here's what %s did with the markup:" % parser)
print(soup.prettify())
print "-" * 80
print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text))
print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@ -149,7 +154,7 @@ def rword(length=5):
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
@ -171,9 +176,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
except Exception as e:
print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):

File diff suppressed because it is too large Load Diff

@ -1,7 +1,7 @@
# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import pickle
@ -16,29 +16,66 @@ from bs4.element import (
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
Tag
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
return default_builder
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase):
self.assertEqual(earlier, e.previous_element)
earlier = e
def linkage_validator(self, el, _recursive_call=False):
"""Ensure proper linkage throughout the document."""
descendant = None
# Document element should have no previous element or previous sibling.
# It also shouldn't have a next sibling.
if el.parent is None:
assert el.previous_element is None,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_element, None
)
assert el.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_sibling, None
)
assert el.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_sibling, None
)
idx = 0
child = None
last_child = None
last_idx = len(el.contents) - 1
for child in el.contents:
descendant = None
# Parent should link next element to their first child
# That child should have no previous sibling
if idx == 0:
if el.parent is not None:
assert el.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_element, child
)
assert child.previous_element is el,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
child, child.previous_element, el
)
assert child.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
child, child.previous_sibling, None
)
# If not the first child, previous index should link as sibling to this index
# Previous element should match the last index or the last bubbled up descendant
else:
assert child.previous_sibling is el.contents[idx - 1],\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
child, child.previous_sibling, el.contents[idx - 1]
)
assert el.contents[idx - 1].next_sibling is child,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
)
if last_child is not None:
assert child.previous_element is last_child,\
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
child, child.previous_element, last_child, child.parent.contents
)
assert last_child.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
last_child, last_child.next_element, child
)
if isinstance(child, Tag) and child.contents:
descendant = self.linkage_validator(child, True)
# A bubbled up descendant should have no next siblings
assert descendant.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
descendant, descendant.next_sibling, None
)
# Mark last child as either the bubbled up descendant or the current child
if descendant is not None:
last_child = descendant
else:
last_child = child
# If last child, there are non next siblings
if idx == last_idx:
assert child.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_sibling, None
)
idx += 1
child = descendant if descendant is not None else child
if child is None:
child = el
if not _recursive_call and child is not None:
target = el
while True:
if target is None:
assert child.next_element is None, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, None
)
break
elif target.next_sibling is not None:
assert child.next_element is target.next_sibling, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, target.next_sibling
)
break
target = target.parent
# We are done, so nothing to return
return None
else:
# Return the child to the recursive caller
return child
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_namespaced_html(self):
"""When a namespaced XML document is parsed as HTML it should
be treated as HTML with weird tag names.
"""
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
soup = self.soup(markup)
self.assertEqual(2, len(soup.find_all("ns1:foo")))
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
markup = u"""<?PITarget PIContent?>"""
markup = """<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
@ -292,6 +452,18 @@ Hello, world!
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_multivalued_attribute_with_whitespace(self):
# Whitespace separating the values of a multi-valued attribute
# should be ignored.
markup = '<div class=" foo bar "></a>'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.div['class'])
# If you search by the literal name of the class it's like the whitespace
# wasn't there.
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
@ -311,15 +483,41 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_strings_resembling_character_entity_references(self):
# "&T" and "&p" look like incomplete character entities, but they are
# not.
self.assertSoupEquals(
"<p>&bull; AT&T is in the s&p 500</p>",
"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
def test_apos_entity(self):
self.assertSoupEquals(
"<p>Bob&apos;s Bar</p>",
"<p>Bob's Bar</p>",
)
def test_entities_in_foreign_document_encoding(self):
# &#147; and &#148; are invalid numeric entities referencing
# Windows-1252 characters. &#45; references a character common
# to Windows-1252 and Unicode, and &#9731; references a
# character only found in Unicode.
#
# All of these entities should be converted to Unicode
# characters.
markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
soup = self.soup(markup)
self.assertEqual("“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@ -330,7 +528,7 @@ Hello, world!
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
@ -408,9 +606,9 @@ Hello, world!
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@ -450,7 +648,7 @@ Hello, world!
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@ -460,15 +658,15 @@ Hello, world!
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@ -477,7 +675,7 @@ Hello, world!
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@ -586,6 +784,13 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
soup = self.soup(doc)
self.assertEqual(doc, soup.encode())
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
self.assertEqual(str(soup.foo), markup)
def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>
@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object):
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""

@ -5,7 +5,7 @@ import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
@property
def default_builder(self):
return HTML5TreeBuilder()
return HTML5TreeBuilder
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self):
@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_foster_parenting(self):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
def test_extraction(self):
"""
Test that extraction does not destroy the tree.
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
"""
markup = """
<html><head></head>
<style>
</style><script></script><body><p>hello</p></body></html>
"""
soup = self.soup(markup)
[s.extract() for s in soup('script')]
[s.extract() for s in soup('style')]
self.assertEqual(len(soup.find_all("p")), 1)
def test_empty_comment(self):
"""
Test that empty comment does not break structure.
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
"""
markup = """
<html>
<body>
<form>
<!----><input type="text">
</form>
</body>
</html>
"""
soup = self.soup(markup)
inputs = []
for form in soup.find_all('form'):
inputs.extend(form.find_all('input'))
self.assertEqual(len(inputs), 1)

@ -5,12 +5,11 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")

@ -7,7 +7,7 @@ try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return LXMLTreeBuilder()
return LXMLTreeBuilder
def test_out_of_range_entity(self):
self.assertSoupEquals(
@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
# a string like u'\x93'
pass
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
self.assertEqual("<b/>", str(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf(
@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return LXMLTreeBuilderForXML()
return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
# We should not track un-prefixed namespaces as we can only hold one
# and it will be recognized as the default namespace by soupsieve,
# which may be confusing in some situations. When no namespace is provided
# for a selector, the default namespace (if defined) is assumed.
soup = self.soup(
'<?xml version="1.1"?>\n'
'<root>'
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
'</root>'
)
self.assertEqual(
soup._namespaces,
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
)

@ -24,6 +24,7 @@ from bs4.dammit import (
EncodingDetector,
)
from bs4.testing import (
default_builder,
SoupTest,
skipIf,
)
@ -32,7 +33,7 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@ -40,20 +41,85 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
data = u"<h1>éé</h1>"
data = "<h1>éé</h1>"
soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string)
self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self):
data = u"<h1>foo\0bar</h1>"
data = "<h1>foo\0bar</h1>"
soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string)
self.assertEqual("foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
utf8_data = u"Räksmörgås".encode("utf-8")
utf8_data = "Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
def test_custom_builder_class(self):
# Verify that you can pass in a custom Builder class and
# it'll be instantiated with the appropriate keyword arguments.
class Mock(object):
def __init__(self, **kwargs):
self.called_with = kwargs
self.is_xml = True
def initialize_soup(self, soup):
pass
def prepare_markup(self, *args, **kwargs):
return ''
kwargs = dict(
var="value",
# This is a deprecated BS3-era keyword argument, which
# will be stripped out.
convertEntities=True,
)
with warnings.catch_warnings(record=True):
soup = BeautifulSoup('', builder=Mock, **kwargs)
assert isinstance(soup.builder, Mock)
self.assertEqual(dict(var="value"), soup.builder.called_with)
# You can also instantiate the TreeBuilder yourself. In this
# case, that specific object is used and any keyword arguments
# to the BeautifulSoup constructor are ignored.
builder = Mock(**kwargs)
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup(
'', builder=builder, ignored_value=True,
)
msg = str(w[0].message)
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
self.assertEqual(builder, soup.builder)
self.assertEqual(kwargs, builder.called_with)
def test_cdata_list_attributes(self):
# Most attribute values are represented as scalars, but the
# HTML standard says that some attributes, like 'class' have
# space-separated lists as values.
markup = '<a id=" an id " class=" a class "></a>'
soup = self.soup(markup)
# Note that the spaces are stripped for 'class' but not for 'id'.
a = soup.a
self.assertEqual(" an id ", a['id'])
self.assertEqual(["a", "class"], a['class'])
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
self.assertEqual(" a class ", soup.a['class'])
# Here are two ways of saying that `id` is a multi-valued
# attribute in this context, but 'class' is not.
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
a = soup.a
self.assertEqual(["an", "id"], a['id'])
self.assertEqual(" a class ", a['class'])
class TestWarnings(SoupTest):
@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(u"http://www.crummyunicode.com/")
soup = self.soup("http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
@ -141,7 +207,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(u"http://www.crummyuncode.com/ is great")
soup = self.soup("http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest):
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest):
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = u"I'm already Unicode! \N{SNOWMAN}"
markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = u"Räksmörgås".encode("utf-8")
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@ -26,6 +25,7 @@ from bs4.element import (
Comment,
Declaration,
Doctype,
Formatter,
NavigableString,
SoupStrainer,
Tag,
@ -71,13 +71,13 @@ class TestFind(TreeTest):
self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self):
soup = self.soup(u'<h1>Räksmörgås</h1>')
self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
soup = self.soup('<h1>Räksmörgås</h1>')
self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
def test_unicode_attribute_find(self):
soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
str(soup)
self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
def test_find_everything(self):
@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
"""You can search the tree for text nodes."""
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
# Exact match.
self.assertEqual(soup.find_all(string="bar"), [u"bar"])
self.assertEqual(soup.find_all(text="bar"), [u"bar"])
self.assertEqual(soup.find_all(string="bar"), ["bar"])
self.assertEqual(soup.find_all(text="bar"), ["bar"])
# Match any of a number of strings.
self.assertEqual(
soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
# Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')),
[u"Foo", u"bar", u'\xbb'])
["Foo", "bar", '\xbb'])
# Match anything.
self.assertEqual(soup.find_all(text=True),
[u"Foo", u"bar", u'\xbb'])
["Foo", "bar", '\xbb'])
def test_find_all_limit(self):
"""You can limit the number of items returned by find_all."""
@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self):
peace = u"םולש".encode("utf8")
data = u'<a title="םולש"></a>'.encode("utf8")
peace = "םולש".encode("utf8")
data = '<a title="םולש"></a>'.encode("utf8")
soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
self.assertEqual([], soup.find_all(id=1, text="bar"))
class TestSmooth(TreeTest):
"""Test Tag.smooth."""
def test_smooth(self):
soup = self.soup("<div>a</div>")
div = soup.div
div.append("b")
div.append("c")
div.append(Comment("Comment 1"))
div.append(Comment("Comment 2"))
div.append("d")
builder = self.default_builder()
span = Tag(soup, builder, 'span')
span.append('1')
span.append('2')
div.append(span)
# At this point the tree has a bunch of adjacent
# NavigableStrings. This is normal, but it has no meaning in
# terms of HTML, so we may want to smooth things out for
# output.
# Since the <span> tag has two children, its .string is None.
self.assertEqual(None, div.span.string)
self.assertEqual(7, len(div.contents))
div.smooth()
self.assertEqual(5, len(div.contents))
# The three strings at the beginning of div.contents have been
# merged into on string.
#
self.assertEqual('abc', div.contents[0])
# The call is recursive -- the <span> tag was also smoothed.
self.assertEqual('12', div.span.string)
# The two comments have _not_ been merged, even though
# comments are strings. Merging comments would change the
# meaning of the HTML.
self.assertEqual('Comment 1', div.contents[1])
self.assertEqual('Comment 2', div.contents[2])
class TestIndex(TreeTest):
@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
</html>'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
markup = re.compile("\n\s*").sub("", markup)
markup = re.compile(r"\n\s*").sub("", markup)
self.tree = self.soup(markup)
@ -703,10 +745,10 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
new_tag = soup.new_tag("foo", bar="baz")
new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
self.assertEqual(dict(bar="baz"), new_tag.attrs)
self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
def test_tag_inherits_self_closing_rules_from_builder(self):
@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
def test_insert_beautifulsoup_object_inserts_children(self):
"""Inserting one BeautifulSoup object into another actually inserts all
of its children -- you'll never combine BeautifulSoup objects.
"""
soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
text = "<p>p2</p><p>p3</p>"
to_insert = self.soup(text)
soup.insert(1, to_insert)
for i in soup.descendants:
assert not isinstance(i, BeautifulSoup)
p1, p2, p3, p4 = list(soup.children)
self.assertEqual("And now, a word:", p1.string)
self.assertEqual("p2", p2.string)
self.assertEqual("p3", p3.string)
self.assertEqual("And we're back.", p4.string)
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('<p><a>one</a><b>three</b></p>')
a = soup.a
@ -877,7 +939,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self):
builder = self.default_builder
builder = self.default_builder()
soup = self.soup(
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
magic_tag = Tag(soup, builder, 'magictag')
@ -912,6 +974,13 @@ class TestTreeModification(SoupTest):
soup.a.append(soup.b)
self.assertEqual(data, soup.decode())
def test_extend(self):
data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
soup = self.soup(data)
l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
soup.a.extend(l)
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
def test_move_tag_to_beginning_of_parent(self):
data = "<a><b></b><c></c><d></d></a>"
soup = self.soup(data)
@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
# Can't insert an element before itself.
b = soup.b
self.assertRaises(ValueError, b.insert_before, b)
# Can't insert before if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_before, "nope")
# Can insert an identical element
soup = self.soup("<a>")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_before(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_before("BAZ", " ", "QUUX")
soup.a.insert_before("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))
soup.a.insert_before(soup.b, "FOO")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))
def test_insert_after(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_after("BAZ")
@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
# Can't insert an element after itself.
b = soup.b
self.assertRaises(ValueError, b.insert_after, b)
# Can't insert after if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_after, "nope")
# Can insert an identical element
soup = self.soup("<a>")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_after(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_after("BAZ", " ", "QUUX")
soup.a.insert_after("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
soup.b.insert_after(soup.a, "FOO ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
soup = self.soup("")
tag = soup.new_tag("a")
@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest):
<script>baz</script>
</html>""")
[soup.script.extract() for i in soup.find_all("script")]
self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
'.bTag is deprecated, use .find("b") instead.',
'.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
str(w[0].message))
def test_has_attr(self):
@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding
copy = soup.__copy__()
self.assertEqual(u"<p> </p>", unicode(copy))
self.assertEqual("<p> </p>", str(copy))
self.assertEqual(encoding, copy.original_encoding)
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.decode(), soup.decode())
def test_copy_navigablestring_is_not_attached_to_tree(self):
html = u"<b>Foo<a></a></b><b>Bar</b>"
html = "<b>Foo<a></a></b><b>Bar</b>"
soup = self.soup(html)
s1 = soup.find(string="Foo")
s2 = copy.copy(s1)
@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest):
self.assertEqual(None, s2.previous_element)
def test_copy_navigablestring_subclass_has_same_type(self):
html = u"<b><!--Foo--></b>"
html = "<b><!--Foo--></b>"
soup = self.soup(html)
s1 = soup.string
s2 = copy.copy(s1)
@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest):
self.assertTrue(isinstance(s2, Comment))
def test_copy_entire_soup(self):
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
soup_copy = copy.copy(soup)
self.assertEqual(soup, soup_copy)
def test_copy_tag_copies_contents(self):
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
div = soup.div
div_copy = copy.copy(div)
# The two tags look the same, and evaluate to equal.
self.assertEqual(unicode(div), unicode(div_copy))
self.assertEqual(str(div), str(div_copy))
self.assertEqual(div, div_copy)
# But they're not the same object.
@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest):
def test_default_formatter_is_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual(
decoded,
self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html")
self.assertEqual(
decoded,
self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_html5(self):
markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html5")
self.assertEqual(
decoded,
self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual(
decoded,
self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_null(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter=None)
# Neither the angle brackets nor the e-with-acute are converted.
# This is not valid HTML, but it's what the user wanted.
self.assertEqual(decoded,
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self):
markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of normal entity conversion code, the custom
# callable is called on every string.
self.assertEqual(
decoded,
self.document_for(u"<b><FOO></b><b>BAR</b>"))
self.document_for("<b><FOO></b><b>BAR</b><br/>"))
def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
markup = '<a href="http://a.com?a=b&c=é">e</a>'
soup = self.soup(markup)
a = soup.a
expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
self.assertEqual(expect_minimal, a.decode())
self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
self.assertEqual(expect_html, a.decode(formatter="html"))
self.assertEqual(markup, a.decode(formatter=None))
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self):
@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter(self):
def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>")
self.assertEqual(unicode, type(soup.prettify()))
self.assertEqual(str, type(soup.prettify()))
def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>")
self.assertEqual(bytes, type(soup.prettify("utf-8")))
def test_html_entity_substitution_off_by_default(self):
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
soup = self.soup(markup)
encoded = soup.b.encode("utf-8")
self.assertEqual(encoded, markup.encode('utf-8'))
@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(soup.b.string.encode("utf-8"),
u"\N{SNOWMAN}".encode("utf-8"))
"\N{SNOWMAN}".encode("utf-8"))
def test_tag_containing_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8"))
def test_encoding_substitutes_unrecognized_characters_by_default(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
def test_encoding_can_be_made_strict(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertRaises(
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
def test_decode_contents(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
def test_encode_contents(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8"))
def test_deprecated_renderContents(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
def test_repr(self):
html = u"<b>\N{SNOWMAN}</b>"
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
if PY3K:
self.assertEqual(html, repr(soup))
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
class TestFormatter(SoupTest):
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
self.assertEqual(formatter.called_with, soup.p)
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('title')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'title')
self.assertEqual(els[0].contents, [u'The title'])
self.assertEqual(els[0].contents, ['The title'])
def test_one_tag_many(self):
els = self.soup.select('div')
@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
self.assertRaises(ValueError, self.soup.select, 'tag%t')
self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(dashed[0]['id'], 'dash2')
def test_dashed_tag_text(self):
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
def test_select_dashed_matches_find_all(self):
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
NotImplementedError, self.soup.select, "a:nth-of-type(a)")
SyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
self.assertEqual(els[0].string, 'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another')
self.assertEqual(els[0].string, 'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
# Pass in an invalid value.
self.assertRaises(
ValueError, self.soup.select, 'div p:nth-of-type(0)')
# Zero will select no tags.
els = self.soup.select('div p:nth-of-type(0)')
self.assertEqual(len(els), 0)
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
self.assertEqual(els[0].string, 'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
self.assertRaises(ValueError, self.soup.select, 'h1 >')
self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
self.assertRaises(ValueError, self.soup.select, ',x, y')
self.assertRaises(ValueError, self.soup.select, 'x,,y')
self.assertRaises(SyntaxError, self.soup.select, ',x, y')
self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
def test_select_duplicate_elements(self):
# When markup contains duplicate elements, a multiple select
# will find all of them.
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
soup = BeautifulSoup(markup, 'html.parser')
selected = soup.select(".c1, .c2")
self.assertEqual(3, len(selected))
# Verify that find_all finds the same elements, though because
# of an implementation detail it finds them in a different
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected

@ -1,3 +0,0 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

@ -1,23 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Execute computations asynchronously using threads or processes."""
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
from concurrent.futures._base import (FIRST_COMPLETED,
FIRST_EXCEPTION,
ALL_COMPLETED,
CancelledError,
TimeoutError,
Future,
Executor,
wait,
as_completed)
from concurrent.futures.thread import ThreadPoolExecutor
try:
from concurrent.futures.process import ProcessPoolExecutor
except ImportError:
# some platforms don't have multiprocessing
pass

@ -1,607 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
import collections
import logging
import threading
import itertools
import time
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
FIRST_COMPLETED = 'FIRST_COMPLETED'
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
ALL_COMPLETED = 'ALL_COMPLETED'
_AS_COMPLETED = '_AS_COMPLETED'
# Possible future states (for internal use by the futures package).
PENDING = 'PENDING'
RUNNING = 'RUNNING'
# The future was cancelled by the user...
CANCELLED = 'CANCELLED'
# ...and _Waiter.add_cancelled() was called by a worker.
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
FINISHED = 'FINISHED'
_FUTURE_STATES = [
PENDING,
RUNNING,
CANCELLED,
CANCELLED_AND_NOTIFIED,
FINISHED
]
_STATE_TO_DESCRIPTION_MAP = {
PENDING: "pending",
RUNNING: "running",
CANCELLED: "cancelled",
CANCELLED_AND_NOTIFIED: "cancelled",
FINISHED: "finished"
}
# Logger for internal use by the futures package.
LOGGER = logging.getLogger("concurrent.futures")
class Error(Exception):
"""Base class for all future-related exceptions."""
pass
class CancelledError(Error):
"""The Future was cancelled."""
pass
class TimeoutError(Error):
"""The operation exceeded the given deadline."""
pass
class _Waiter(object):
"""Provides the event that wait() and as_completed() block on."""
def __init__(self):
self.event = threading.Event()
self.finished_futures = []
def add_result(self, future):
self.finished_futures.append(future)
def add_exception(self, future):
self.finished_futures.append(future)
def add_cancelled(self, future):
self.finished_futures.append(future)
class _AsCompletedWaiter(_Waiter):
"""Used by as_completed()."""
def __init__(self):
super(_AsCompletedWaiter, self).__init__()
self.lock = threading.Lock()
def add_result(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _FirstCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_COMPLETED)."""
def add_result(self, future):
super(_FirstCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
super(_FirstCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
super(_FirstCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _AllCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
def __init__(self, num_pending_calls, stop_on_exception):
self.num_pending_calls = num_pending_calls
self.stop_on_exception = stop_on_exception
self.lock = threading.Lock()
super(_AllCompletedWaiter, self).__init__()
def _decrement_pending_calls(self):
with self.lock:
self.num_pending_calls -= 1
if not self.num_pending_calls:
self.event.set()
def add_result(self, future):
super(_AllCompletedWaiter, self).add_result(future)
self._decrement_pending_calls()
def add_exception(self, future):
super(_AllCompletedWaiter, self).add_exception(future)
if self.stop_on_exception:
self.event.set()
else:
self._decrement_pending_calls()
def add_cancelled(self, future):
super(_AllCompletedWaiter, self).add_cancelled(future)
self._decrement_pending_calls()
class _AcquireFutures(object):
"""A context manager that does an ordered acquire of Future conditions."""
def __init__(self, futures):
self.futures = sorted(futures, key=id)
def __enter__(self):
for future in self.futures:
future._condition.acquire()
def __exit__(self, *args):
for future in self.futures:
future._condition.release()
def _create_and_install_waiters(fs, return_when):
if return_when == _AS_COMPLETED:
waiter = _AsCompletedWaiter()
elif return_when == FIRST_COMPLETED:
waiter = _FirstCompletedWaiter()
else:
pending_count = sum(
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
if return_when == FIRST_EXCEPTION:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
elif return_when == ALL_COMPLETED:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
else:
raise ValueError("Invalid return condition: %r" % return_when)
for f in fs:
f._waiters.append(waiter)
return waiter
def as_completed(fs, timeout=None):
"""An iterator over the given futures that yields each as it completes.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
iterate over.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator that yields the given Futures as they complete (finished or
cancelled). If any given Futures are duplicated, they will be returned
once.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
"""
if timeout is not None:
end_time = timeout + time.time()
fs = set(fs)
with _AcquireFutures(fs):
finished = set(
f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
pending = fs - finished
waiter = _create_and_install_waiters(fs, _AS_COMPLETED)
try:
for future in finished:
yield future
while pending:
if timeout is None:
wait_timeout = None
else:
wait_timeout = end_time - time.time()
if wait_timeout < 0:
raise TimeoutError(
'%d (of %d) futures unfinished' % (
len(pending), len(fs)))
waiter.event.wait(wait_timeout)
with waiter.lock:
finished = waiter.finished_futures
waiter.finished_futures = []
waiter.event.clear()
for future in finished:
yield future
pending.remove(future)
finally:
for f in fs:
with f._condition:
f._waiters.remove(waiter)
DoneAndNotDoneFutures = collections.namedtuple(
'DoneAndNotDoneFutures', 'done not_done')
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
"""Wait for the futures in the given sequence to complete.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
wait upon.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
return_when: Indicates when this function should return. The options
are:
FIRST_COMPLETED - Return when any future finishes or is
cancelled.
FIRST_EXCEPTION - Return when any future finishes by raising an
exception. If no future raises an exception
then it is equivalent to ALL_COMPLETED.
ALL_COMPLETED - Return when all futures finish or are cancelled.
Returns:
A named 2-tuple of sets. The first set, named 'done', contains the
futures that completed (is finished or cancelled) before the wait
completed. The second set, named 'not_done', contains uncompleted
futures.
"""
with _AcquireFutures(fs):
done = set(f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
not_done = set(fs) - done
if (return_when == FIRST_COMPLETED) and done:
return DoneAndNotDoneFutures(done, not_done)
elif (return_when == FIRST_EXCEPTION) and done:
if any(f for f in done
if not f.cancelled() and f.exception() is not None):
return DoneAndNotDoneFutures(done, not_done)
if len(done) == len(fs):
return DoneAndNotDoneFutures(done, not_done)
waiter = _create_and_install_waiters(fs, return_when)
waiter.event.wait(timeout)
for f in fs:
with f._condition:
f._waiters.remove(waiter)
done.update(waiter.finished_futures)
return DoneAndNotDoneFutures(done, set(fs) - done)
class Future(object):
"""Represents the result of an asynchronous computation."""
def __init__(self):
"""Initializes the future. Should not be called by clients."""
self._condition = threading.Condition()
self._state = PENDING
self._result = None
self._exception = None
self._traceback = None
self._waiters = []
self._done_callbacks = []
def _invoke_callbacks(self):
for callback in self._done_callbacks:
try:
callback(self)
except Exception:
LOGGER.exception('exception calling callback for %r', self)
def __repr__(self):
with self._condition:
if self._state == FINISHED:
if self._exception:
return '<Future at %s state=%s raised %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._exception.__class__.__name__)
else:
return '<Future at %s state=%s returned %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._result.__class__.__name__)
return '<Future at %s state=%s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state])
def cancel(self):
"""Cancel the future if possible.
Returns True if the future was cancelled, False otherwise. A future
cannot be cancelled if it is running or has already completed.
"""
with self._condition:
if self._state in [RUNNING, FINISHED]:
return False
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
return True
self._state = CANCELLED
self._condition.notify_all()
self._invoke_callbacks()
return True
def cancelled(self):
"""Return True if the future has cancelled."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]
def running(self):
"""Return True if the future is currently executing."""
with self._condition:
return self._state == RUNNING
def done(self):
"""Return True of the future was cancelled or finished executing."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]
def __get_result(self):
if self._exception:
raise type(self._exception), self._exception, self._traceback
else:
return self._result
def add_done_callback(self, fn):
"""Attaches a callable that will be called when the future finishes.
Args:
fn: A callable that will be called with this future as its only
argument when the future completes or is cancelled. The callable
will always be called by a thread in the same process in which
it was added. If the future has already completed or been
cancelled then the callable will be called immediately. These
callables are called in the order that they were added.
"""
with self._condition:
if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
self._done_callbacks.append(fn)
return
fn(self)
def result(self, timeout=None):
"""Return the result of the call that the future represents.
Args:
timeout: The number of seconds to wait for the result if the future
isn't done. If None, then there is no limit on the wait time.
Returns:
The result of the call that the future represents.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
Exception: If the call raised then that exception will be raised.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
else:
raise TimeoutError()
def exception_info(self, timeout=None):
"""Return a tuple of (exception, traceback) raised by the call that the
future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
The exception raised by the call that the future represents or None
if the call completed without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
else:
raise TimeoutError()
def exception(self, timeout=None):
"""Return the exception raised by the call that the future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
The exception raised by the call that the future represents or None
if the call completed without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
return self.exception_info(timeout)[0]
# The following methods should only be used by Executors and in tests.
def set_running_or_notify_cancel(self):
"""Mark the future as running or process any cancel notifications.
Should only be used by Executor implementations and unit tests.
If the future has been cancelled (cancel() was called and returned
True) then any threads waiting on the future completing (though calls
to as_completed() or wait()) are notified and False is returned.
If the future was not cancelled then it is put in the running state
(future calls to running() will return True) and True is returned.
This method should be called by Executor implementations before
executing the work associated with this future. If this method returns
False then the work should not be executed.
Returns:
False if the Future was cancelled, True otherwise.
Raises:
RuntimeError: if this method was already called or if set_result()
or set_exception() was called.
"""
with self._condition:
if self._state == CANCELLED:
self._state = CANCELLED_AND_NOTIFIED
for waiter in self._waiters:
waiter.add_cancelled(self)
# self._condition.notify_all() is not necessary because
# self.cancel() triggers a notification.
return False
elif self._state == PENDING:
self._state = RUNNING
return True
else:
LOGGER.critical('Future %s in unexpected state: %s',
id(self),
self._state)
raise RuntimeError('Future in unexpected state')
def set_result(self, result):
"""Sets the return value of work associated with the future.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._result = result
self._state = FINISHED
for waiter in self._waiters:
waiter.add_result(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception_info(self, exception, traceback):
"""Sets the result of the future as being the given exception
and traceback.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._exception = exception
self._traceback = traceback
self._state = FINISHED
for waiter in self._waiters:
waiter.add_exception(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception(self, exception):
"""Sets the result of the future as being the given exception.
Should only be used by Executor implementations and unit tests.
"""
self.set_exception_info(exception, None)
class Executor(object):
"""This is an abstract base class for concrete asynchronous executors."""
def submit(self, fn, *args, **kwargs):
"""Submits a callable to be executed with the given arguments.
Schedules the callable to be executed as fn(*args, **kwargs) and returns
a Future instance representing the execution of the callable.
Returns:
A Future representing the given call.
"""
raise NotImplementedError()
def map(self, fn, *iterables, **kwargs):
"""Returns a iterator equivalent to map(fn, iter).
Args:
fn: A callable that will take as many arguments as there are
passed iterables.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator equivalent to: map(func, *iterables) but the calls may
be evaluated out-of-order.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
Exception: If fn(*args) raises for any values.
"""
timeout = kwargs.get('timeout')
if timeout is not None:
end_time = timeout + time.time()
fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]
# Yield must be hidden in closure so that the futures are submitted
# before the first iterator value is required.
def result_iterator():
try:
for future in fs:
if timeout is None:
yield future.result()
else:
yield future.result(end_time - time.time())
finally:
for future in fs:
future.cancel()
return result_iterator()
def shutdown(self, wait=True):
"""Clean-up the resources associated with the Executor.
It is safe to call this method several times. Otherwise, no other
methods can be called after this one.
Args:
wait: If True then shutdown will not return until all running
futures have finished executing and the resources used by the
executor have been reclaimed.
"""
pass
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.shutdown(wait=True)
return False

@ -1,359 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ProcessPoolExecutor.
The follow diagram and text describe the data-flow through the system:
|======================= In-process =====================|== Out-of-process ==|
+----------+ +----------+ +--------+ +-----------+ +---------+
| | => | Work Ids | => | | => | Call Q | => | |
| | +----------+ | | +-----------+ | |
| | | ... | | | | ... | | |
| | | 6 | | | | 5, call() | | |
| | | 7 | | | | ... | | |
| Process | | ... | | Local | +-----------+ | Process |
| Pool | +----------+ | Worker | | #1..n |
| Executor | | Thread | | |
| | +----------- + | | +-----------+ | |
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
| | +------------+ | | +-----------+ | |
| | | 6: call() | | | | ... | | |
| | | future | | | | 4, result | | |
| | | ... | | | | 3, except | | |
+----------+ +------------+ +--------+ +-----------+ +---------+
Executor.submit() called:
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
- adds the id of the _WorkItem to the "Work Ids" queue
Local worker thread:
- reads work ids from the "Work Ids" queue and looks up the corresponding
WorkItem from the "Work Items" dict: if the work item has been cancelled then
it is simply removed from the dict, otherwise it is repackaged as a
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
- reads _ResultItems from "Result Q", updates the future stored in the
"Work Items" dict and deletes the dict entry
Process #1..n:
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
_ResultItems in "Request Q"
"""
import atexit
from concurrent.futures import _base
import Queue as queue
import multiprocessing
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads and processes. This is done to allow the
# interpreter to exit when there are still idle processes in a
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
# allowing workers to die with the interpreter has two undesirable properties:
# - The workers would still be running during interpretor shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads/processes finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
# Controls how many more calls than processes will be queued in the call queue.
# A smaller number will mean that processes spend more time idle waiting for
# work while a larger number will make Future.cancel() succeed less frequently
# (Futures in the call queue cannot be cancelled).
EXTRA_QUEUED_CALLS = 1
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
class _ResultItem(object):
def __init__(self, work_id, exception=None, result=None):
self.work_id = work_id
self.exception = exception
self.result = result
class _CallItem(object):
def __init__(self, work_id, fn, args, kwargs):
self.work_id = work_id
self.fn = fn
self.args = args
self.kwargs = kwargs
def _process_worker(call_queue, result_queue):
"""Evaluates calls from call_queue and places the results in result_queue.
This worker is run in a separate process.
Args:
call_queue: A multiprocessing.Queue of _CallItems that will be read and
evaluated by the worker.
result_queue: A multiprocessing.Queue of _ResultItems that will written
to by the worker.
shutdown: A multiprocessing.Event that will be set as a signal to the
worker that it should exit when call_queue is empty.
"""
while True:
call_item = call_queue.get(block=True)
if call_item is None:
# Wake up queue management thread
result_queue.put(None)
return
try:
r = call_item.fn(*call_item.args, **call_item.kwargs)
except BaseException:
e = sys.exc_info()[1]
result_queue.put(_ResultItem(call_item.work_id,
exception=e))
else:
result_queue.put(_ResultItem(call_item.work_id,
result=r))
def _add_call_item_to_queue(pending_work_items,
work_ids,
call_queue):
"""Fills call_queue with _WorkItems from pending_work_items.
This function never blocks.
Args:
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
are consumed and the corresponding _WorkItems from
pending_work_items are transformed into _CallItems and put in
call_queue.
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems.
"""
while True:
if call_queue.full():
return
try:
work_id = work_ids.get(block=False)
except queue.Empty:
return
else:
work_item = pending_work_items[work_id]
if work_item.future.set_running_or_notify_cancel():
call_queue.put(_CallItem(work_id,
work_item.fn,
work_item.args,
work_item.kwargs),
block=True)
else:
del pending_work_items[work_id]
continue
def _queue_management_worker(executor_reference,
processes,
pending_work_items,
work_ids_queue,
call_queue,
result_queue):
"""Manages the communication between this process and the worker processes.
This function is run in a local thread.
Args:
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
this thread. Used to determine if the ProcessPoolExecutor has been
garbage collected and that this function can exit.
process: A list of the multiprocessing.Process instances used as
workers.
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems for processing by the process workers.
result_queue: A multiprocessing.Queue of _ResultItems generated by the
process workers.
"""
nb_shutdown_processes = [0]
def shutdown_one_process():
"""Tell a worker to terminate, which will in turn wake us again"""
call_queue.put(None)
nb_shutdown_processes[0] += 1
while True:
_add_call_item_to_queue(pending_work_items,
work_ids_queue,
call_queue)
result_item = result_queue.get(block=True)
if result_item is not None:
work_item = pending_work_items[result_item.work_id]
del pending_work_items[result_item.work_id]
if result_item.exception:
work_item.future.set_exception(result_item.exception)
else:
work_item.future.set_result(result_item.result)
# Delete references to object. See issue16284
del work_item
# Check whether we should start shutting down.
executor = executor_reference()
# No more work items can be added if:
# - The interpreter is shutting down OR
# - The executor that owns this worker has been collected OR
# - The executor that owns this worker has been shutdown.
if _shutdown or executor is None or executor._shutdown_thread:
# Since no new work items can be added, it is safe to shutdown
# this thread if there are no pending work items.
if not pending_work_items:
while nb_shutdown_processes[0] < len(processes):
shutdown_one_process()
# If .join() is not called on the created processes then
# some multiprocessing.Queue methods may deadlock on Mac OS
# X.
for p in processes:
p.join()
call_queue.close()
return
del executor
_system_limits_checked = False
_system_limited = None
def _check_system_limits():
global _system_limits_checked, _system_limited
if _system_limits_checked:
if _system_limited:
raise NotImplementedError(_system_limited)
_system_limits_checked = True
try:
import os
nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
except (AttributeError, ValueError):
# sysconf not available or setting not available
return
if nsems_max == -1:
# indetermine limit, assume that limit is determined
# by available memory only
return
if nsems_max >= 256:
# minimum number of semaphores available
# according to POSIX
return
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
raise NotImplementedError(_system_limited)
class ProcessPoolExecutor(_base.Executor):
def __init__(self, max_workers=None):
"""Initializes a new ProcessPoolExecutor instance.
Args:
max_workers: The maximum number of processes that can be used to
execute the given calls. If None or not given then as many
worker processes will be created as the machine has processors.
"""
_check_system_limits()
if max_workers is None:
self._max_workers = multiprocessing.cpu_count()
else:
self._max_workers = max_workers
# Make the call queue slightly larger than the number of processes to
# prevent the worker processes from idling. But don't make it too big
# because futures in the call queue cannot be cancelled.
self._call_queue = multiprocessing.Queue(self._max_workers +
EXTRA_QUEUED_CALLS)
self._result_queue = multiprocessing.Queue()
self._work_ids = queue.Queue()
self._queue_management_thread = None
self._processes = set()
# Shutdown is a two-step process.
self._shutdown_thread = False
self._shutdown_lock = threading.Lock()
self._queue_count = 0
self._pending_work_items = {}
def _start_queue_management_thread(self):
# When the executor gets lost, the weakref callback will wake up
# the queue management thread.
def weakref_cb(_, q=self._result_queue):
q.put(None)
if self._queue_management_thread is None:
self._queue_management_thread = threading.Thread(
target=_queue_management_worker,
args=(weakref.ref(self, weakref_cb),
self._processes,
self._pending_work_items,
self._work_ids,
self._call_queue,
self._result_queue))
self._queue_management_thread.daemon = True
self._queue_management_thread.start()
_threads_queues[self._queue_management_thread] = self._result_queue
def _adjust_process_count(self):
for _ in range(len(self._processes), self._max_workers):
p = multiprocessing.Process(
target=_process_worker,
args=(self._call_queue,
self._result_queue))
p.start()
self._processes.add(p)
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown_thread:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._pending_work_items[self._queue_count] = w
self._work_ids.put(self._queue_count)
self._queue_count += 1
# Wake up queue management thread
self._result_queue.put(None)
self._start_queue_management_thread()
self._adjust_process_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown_thread = True
if self._queue_management_thread:
# Wake up queue management thread
self._result_queue.put(None)
if wait:
self._queue_management_thread.join(sys.maxint)
# To reduce the risk of openning too many files, remove references to
# objects that use file descriptors.
self._queue_management_thread = None
self._call_queue = None
self._result_queue = None
self._processes = None
shutdown.__doc__ = _base.Executor.shutdown.__doc__
atexit.register(_python_exit)

@ -1,134 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ThreadPoolExecutor."""
import atexit
from concurrent.futures import _base
import Queue as queue
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads. This is done to allow the interpreter
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
# pool (i.e. shutdown() was not called). However, allowing workers to die with
# the interpreter has two undesirable properties:
# - The workers would still be running during interpretor shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
atexit.register(_python_exit)
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
def run(self):
if not self.future.set_running_or_notify_cancel():
return
try:
result = self.fn(*self.args, **self.kwargs)
except BaseException:
e, tb = sys.exc_info()[1:]
self.future.set_exception_info(e, tb)
else:
self.future.set_result(result)
def _worker(executor_reference, work_queue):
try:
while True:
work_item = work_queue.get(block=True)
if work_item is not None:
work_item.run()
# Delete references to object. See issue16284
del work_item
continue
executor = executor_reference()
# Exit if:
# - The interpreter is shutting down OR
# - The executor that owns the worker has been collected OR
# - The executor that owns the worker has been shutdown.
if _shutdown or executor is None or executor._shutdown:
# Notice other workers
work_queue.put(None)
return
del executor
except BaseException:
_base.LOGGER.critical('Exception in worker', exc_info=True)
class ThreadPoolExecutor(_base.Executor):
def __init__(self, max_workers):
"""Initializes a new ThreadPoolExecutor instance.
Args:
max_workers: The maximum number of threads that can be used to
execute the given calls.
"""
self._max_workers = max_workers
self._work_queue = queue.Queue()
self._threads = set()
self._shutdown = False
self._shutdown_lock = threading.Lock()
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._work_queue.put(w)
self._adjust_thread_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def _adjust_thread_count(self):
# When the executor gets lost, the weakref callback will wake up
# the worker threads.
def weakref_cb(_, q=self._work_queue):
q.put(None)
# TODO(bquinlan): Should avoid creating new threads if there are more
# idle threads than items in the work queue.
if len(self._threads) < self._max_workers:
t = threading.Thread(target=_worker,
args=(weakref.ref(self, weakref_cb),
self._work_queue))
t.daemon = True
t.start()
self._threads.add(t)
_threads_queues[t] = self._work_queue
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown = True
self._work_queue.put(None)
if wait:
for t in self._threads:
t.join(sys.maxint)
shutdown.__doc__ = _base.Executor.shutdown.__doc__

@ -1,73 +1,6 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Death by Captcha HTTP and socket API clients.
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
socket ones. Both offer the same functionalily, with the socket API
sporting faster responses and using way less connections.
To access the socket API, use SocketClient class; for the HTTP API, use
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
connection opened and serializes all API requests sent through it, thus
it is advised to keep a pool of them if you're script is heavily
multithreaded.
Both SocketClient and HttpClient give you the following methods:
get_user()
Returns your DBC account details as a dict with the following keys:
"user": your account numeric ID; if login fails, it will be the only
item with the value of 0;
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
solved CAPTCHA in US cents;
"balance": your DBC account balance in US cents;
"is_banned": flag indicating whether your account is suspended or not.
get_balance()
Returns your DBC account balance in US cents.
get_captcha(cid)
Returns an uploaded CAPTCHA details as a dict with the following keys:
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
be the only item with the value of 0;
"text": the CAPTCHA text, if solved, otherwise None;
"is_correct": flag indicating whether the CAPTCHA was solved correctly
(DBC can detect that in rare cases).
The only argument `cid` is the CAPTCHA numeric ID.
get_text(cid)
Returns an uploaded CAPTCHA text (None if not solved). The only argument
`cid` is the CAPTCHA numeric ID.
report(cid)
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
CAPTCHA numeric ID. Returns True on success, False otherwise.
upload(captcha)
Uploads a CAPTCHA. The only argument `captcha` can be either file-like
object (any object with `read` method defined, actually, so StringIO
will do), or CAPTCHA image file name. On successul upload you'll get
the CAPTCHA details dict (see get_captcha() method).
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
to poll for its status periodically using get_captcha() or get_text()
method until the CAPTCHA is solved and you get the text.
decode(captcha, timeout=DEFAULT_TIMEOUT)
A convenient method that uploads a CAPTCHA and polls for its status
periodically, but no longer than `timeout` (defaults to 60 seconds).
If solved, you'll get the CAPTCHA details dict (see get_captcha()
method for details). See upload() method for details on `captcha`
argument.
Visit http://www.deathbycaptcha.com/user/api for updates.
"""
import base64
import binascii
import errno
@ -79,8 +12,7 @@ import socket
import sys
import threading
import time
import urllib
import urllib2
try:
from json import read as json_decode, write as json_encode
except ImportError:
@ -89,64 +21,71 @@ except ImportError:
except ImportError:
from simplejson import loads as json_decode, dumps as json_encode
try:
from urllib2 import build_opener, HTTPRedirectHandler, Request, HTTPError
from urllib import urlencode, urlopen
except ImportError:
from urllib.request import build_opener, HTTPRedirectHandler, Request, urlopen
from urllib.error import HTTPError
from urllib.parse import urlencode
# API version and unique software ID
API_VERSION = 'DBC/Python v4.6'
API_VERSION = 'DBC/Python v4.0.11'
SOFTWARE_VENDOR_ID = 0
# Default CAPTCHA timeout and decode() polling interval
DEFAULT_TIMEOUT = 60
DEFAULT_TOKEN_TIMEOUT = 120
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3
POLLS_INTERVAL = 5
# Base HTTP API url
HTTP_BASE_URL = 'http://api.dbcapi.me/api'
HTTP_BASE_URL = 'http://api.deathbycaptcha.com/api'
# Preferred HTTP API server's response content type, do not change
HTTP_RESPONSE_TYPE = 'application/json'
# Socket API server's host & ports range
SOCKET_HOST = 'api.dbcapi.me'
SOCKET_HOST = 'api.deathbycaptcha.com'
SOCKET_PORTS = range(8123, 8131)
def _load_image(captcha):
if hasattr(captcha, 'read'):
img = captcha.read()
elif type(captcha) == bytearray:
img = captcha
else:
img = ''
try:
captcha_file = open(captcha, 'rb')
except Exception:
raise
else:
img = captcha_file.read()
captcha_file.close()
if not len(img):
raise ValueError('CAPTCHA image is empty')
elif imghdr.what(None, img) is None:
raise TypeError('Unknown CAPTCHA image type')
else:
return img
class AccessDeniedException(Exception):
pass
class Client(object):
"""Death by Captcha API Client."""
"""Death by Captcha API Client"""
def __init__(self, username, password):
self.is_verbose = False
self.userpwd = {'username': username, 'password': password}
self.userpwd = {'username': username,
'password': password}
def _load_file(self, captcha):
if hasattr(captcha, 'read'):
raw_captcha = captcha.read()
elif isinstance(captcha, bytearray):
raw_captcha = captcha
elif os.path.isfile(captcha):
raw_captcha = ''
try:
f = open(captcha, 'rb')
except Exception as e:
raise e
else:
raw_captcha = f.read()
f.close()
else:
f_stream = urlopen(captcha)
raw_captcha = f_stream.read()
if not len(raw_captcha):
raise ValueError('CAPTCHA image is empty')
elif imghdr.what(None, raw_captcha) is None:
raise TypeError('Unknown CAPTCHA image type')
else:
return raw_captcha
def _log(self, cmd, msg=''):
if self.is_verbose:
print '%d %s %s' % (time.time(), cmd, msg.rstrip())
print('%d %s %s' % (time.time(), cmd, msg.rstrip()))
return self
def close(self):
@ -156,16 +95,16 @@ class Client(object):
pass
def get_user(self):
"""Fetch user details -- ID, balance, rate and banned status."""
raise NotImplementedError()
"""Fetch the user's details dict -- balance, rate and banned status."""
raise NotImplemented()
def get_balance(self):
"""Fetch user balance (in US cents)."""
"""Fetch the user's balance (in US cents)."""
return self.get_user().get('balance')
def get_captcha(self, cid):
"""Fetch a CAPTCHA details -- ID, text and correctness flag."""
raise NotImplementedError()
"""Fetch a CAPTCHA details dict -- its ID, text and correctness."""
raise NotImplemented()
def get_text(self, cid):
"""Fetch a CAPTCHA text."""
@ -173,7 +112,11 @@ class Client(object):
def report(self, cid):
"""Report a CAPTCHA as incorrectly solved."""
raise NotImplementedError()
raise NotImplemented()
def remove(self, cid):
"""Remove an unsolved CAPTCHA."""
raise NotImplemented()
def upload(self, captcha):
"""Upload a CAPTCHA.
@ -182,56 +125,32 @@ class Client(object):
dict on success.
"""
raise NotImplementedError()
raise NotImplemented()
def decode(self, captcha=None, timeout=None, **kwargs):
"""
Try to solve a CAPTCHA.
def decode(self, captcha, timeout=DEFAULT_TIMEOUT):
"""Try to solve a CAPTCHA.
See Client.upload() for arguments details.
Uploads a CAPTCHA, polls for its status periodically with arbitrary
timeout (in seconds), returns CAPTCHA details if (correctly) solved.
"""
if not timeout:
if not captcha:
timeout = DEFAULT_TOKEN_TIMEOUT
else:
timeout = DEFAULT_TIMEOUT
"""
deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
uploaded_captcha = self.upload(captcha, **kwargs)
if uploaded_captcha:
intvl_idx = 0 # POLL_INTERVAL index
while deadline > time.time() and not uploaded_captcha.get('text'):
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
time.sleep(intvl)
pulled = self.get_captcha(uploaded_captcha['captcha'])
if pulled['captcha'] == uploaded_captcha['captcha']:
uploaded_captcha = pulled
if uploaded_captcha.get('text') and \
uploaded_captcha.get('is_correct'):
return uploaded_captcha
def _get_poll_interval(self, idx):
"""Returns poll interval and next index depending on index provided"""
if len(POLLS_INTERVAL) > idx:
intvl = POLLS_INTERVAL[idx]
else:
intvl = DFLT_POLL_INTERVAL
idx += 1
return intvl, idx
c = self.upload(captcha)
if c:
while deadline > time.time() and not c.get('text'):
time.sleep(POLLS_INTERVAL)
c = self.get_captcha(c['captcha'])
if c.get('text') and c.get('is_correct'):
return c
class HttpClient(Client):
"""Death by Captcha HTTP API client."""
def __init__(self, *args):
Client.__init__(self, *args)
self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
self.opener = build_opener(HTTPRedirectHandler())
def _call(self, cmd, payload=None, headers=None):
if headers is None:
@ -239,30 +158,22 @@ class HttpClient(Client):
headers['Accept'] = HTTP_RESPONSE_TYPE
headers['User-Agent'] = API_VERSION
if hasattr(payload, 'items'):
payload = urllib.urlencode(payload)
payload = urlencode(payload)
self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
else:
self._log('SEND', '%s' % cmd)
if payload is not None:
headers['Content-Length'] = len(payload)
try:
response = self.opener.open(urllib2.Request(
response = self.opener.open(Request(
HTTP_BASE_URL + '/' + cmd.strip('/'),
data=payload,
headers=headers
)).read()
except urllib2.HTTPError, err:
if 403 == err.code:
raise AccessDeniedException('Access denied, please check'
' your credentials and/or balance')
elif 400 == err.code or 413 == err.code:
raise ValueError("CAPTCHA was rejected by the service, check"
" if it's a valid image")
elif 503 == err.code:
raise OverflowError("CAPTCHA was rejected due to service"
" overload, try again later")
else:
raise err
except HTTPError as e:
if 403 == e.code:
raise AccessDeniedException(
'Access denied, please check your credentials and/or balance')
elif 400 == e.code or 413 == e.code:
raise ValueError("CAPTCHA was rejected by the service, check if it's a valid image")
else:
self._log('RECV', '%d %s' % (len(response), response))
try:
@ -281,53 +192,38 @@ class HttpClient(Client):
return not self._call('captcha/%d/report' % cid,
self.userpwd.copy()).get('is_correct')
def upload(self, captcha=None, **kwargs):
boundary = binascii.hexlify(os.urandom(16))
banner = kwargs.get('banner', '')
if banner:
kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
body = '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in self.userpwd.items())
def remove(self, cid):
return not self._call('captcha/%d/remove' % cid,
self.userpwd.copy()).get('captcha')
body += '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
def upload(self, captcha):
boundary = binascii.hexlify(os.urandom(16))
data = self.userpwd.copy()
data['swid'] = SOFTWARE_VENDOR_ID
body = '\r\n'.join(('\r\n'.join(('--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in kwargs.items())
if captcha:
img = _load_image(captcha)
body += '\r\n'.join((
'',
str(v))))
for k, v in data.items())
captcha = self._load_file(captcha)
body += '\r\n'.join(('',
'--%s' % boundary,
'Content-Disposition: form-data; name="captchafile"; '
'filename="captcha"',
'Content-Disposition: form-data; name="captchafile"; filename="captcha"',
'Content-Type: application/octet-stream',
'Content-Length: %d' % len(img),
'Content-Length: %d' % len(captcha),
'',
img,
captcha,
'--%s--' % boundary,
''
))
''))
response = self._call('captcha', body, {
'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
}) or {}
if response.get('captcha'):
return response
class SocketClient(Client):
"""Death by Captcha socket API client."""
TERMINATOR = '\r\n'
@ -357,11 +253,12 @@ class SocketClient(Client):
self.socket.settimeout(0)
try:
self.socket.connect(host)
except socket.error, err:
if (err.args[0] not in
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
except socket.error as e:
if errno.EINPROGRESS == e[0]:
pass
else:
self.close()
raise err
raise e
return self.socket
def __del__(self):
@ -372,30 +269,27 @@ class SocketClient(Client):
fds = [sock]
buf += self.TERMINATOR
response = ''
intvl_idx = 0
while True:
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
rds, wrs, exs = select.select((not buf and fds) or [],
rd, wr, ex = select.select((not buf and fds) or [],
(buf and fds) or [],
fds,
intvl)
if exs:
POLLS_INTERVAL)
if ex:
raise IOError('select() failed')
try:
if wrs:
if wr:
while buf:
buf = buf[wrs[0].send(buf):]
elif rds:
buf = buf[wr[0].send(buf):]
elif rd:
while True:
s = rds[0].recv(256)
s = rd[0].recv(256)
if not s:
raise IOError('recv(): connection lost')
else:
response += s
except socket.error, err:
if (err.args[0] not in
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
raise err
except socket.error as e:
if e[0] not in (errno.EAGAIN, errno.EINPROGRESS):
raise e
if response.endswith(self.TERMINATOR):
self._log('RECV', response)
return response.rstrip(self.TERMINATOR)
@ -409,18 +303,16 @@ class SocketClient(Client):
request = json_encode(data)
response = None
for _ in range(2):
if not self.socket and cmd != 'login':
self._call('login', self.userpwd.copy())
for i in range(2):
self.socket_lock.acquire()
try:
sock = self.connect()
response = self._sendrecv(sock, request)
except IOError, err:
sys.stderr.write(str(err) + "\n")
except IOError as e:
sys.stderr.write(str(e) + "\n")
self.close()
except socket.error, err:
sys.stderr.write(str(err) + "\n")
except socket.error as e:
sys.stderr.write(str(e) + "\n")
self.close()
raise IOError('Connection refused')
else:
@ -428,89 +320,84 @@ class SocketClient(Client):
finally:
self.socket_lock.release()
try:
if response is None:
raise IOError('Connection lost or timed out during API request')
raise IOError('Connection lost timed out during API request')
try:
response = json_decode(response)
except Exception:
raise RuntimeError('Invalid API response')
if not response.get('error'):
return response
if 'error' in response:
error = response['error']
if error in ('not-logged-in', 'invalid-credentials'):
if 'not-logged-in' == error:
raise AccessDeniedException('Access denied, check your credentials')
elif 'banned' == error:
raise AccessDeniedException('Access denied, account is suspended')
elif 'insufficient-funds' == error:
raise AccessDeniedException(
'CAPTCHA was rejected due to low balance')
raise AccessDeniedException('CAPTCHA was rejected due to low balance')
elif 'invalid-captcha' == error:
raise ValueError('CAPTCHA is not a valid image')
elif 'service-overload' == error:
raise OverflowError(
raise ValueError(
'CAPTCHA was rejected due to service overload, try again later')
else:
raise RuntimeError('API server error occured: %s' % error)
except Exception as e:
self.socket_lock.acquire()
self.close()
self.socket_lock.release()
raise RuntimeError('API server error occured: %s' % error)
raise e
else:
return response
def get_user(self):
return self._call('user') or {'user': 0}
return self._call('user', self.userpwd.copy()) or {'user': 0}
def get_captcha(self, cid):
return self._call('captcha', {'captcha': cid}) or {'captcha': 0}
def upload(self, captcha=None, **kwargs):
data = {}
if captcha:
data['captcha'] = base64.b64encode(_load_image(captcha))
if kwargs:
banner = kwargs.get('banner', '')
if banner:
kwargs['banner'] = base64.b64encode(_load_image(banner))
data.update(kwargs)
def upload(self, captcha):
data = self.userpwd.copy()
data['captcha'] = base64.b64encode(self._load_file(captcha))
response = self._call('upload', data)
if response.get('captcha'):
uploaded_captcha = dict(
(k, response.get(k))
for k in ('captcha', 'text', 'is_correct')
)
if not uploaded_captcha['text']:
uploaded_captcha['text'] = None
return uploaded_captcha
return dict((k, response.get(k)) for k in ('captcha', 'text', 'is_correct'))
def report(self, cid):
return not self._call('report', {'captcha': cid}).get('is_correct')
data = self.userpwd.copy()
data['captcha'] = cid
return not self._call('report', data).get('is_correct')
def remove(self, cid):
data = self.userpwd.copy()
data['captcha'] = cid
return not self._call('remove', data).get('captcha')
if '__main__' == __name__:
import sys
# Put your DBC username & password here:
#client = HttpClient(sys.argv[1], sys.argv[2])
client = SocketClient(sys.argv[1], sys.argv[2])
client.is_verbose = True
print 'Your balance is %s US cents' % client.get_balance()
print('Your balance is %s US cents' % client.get_balance())
for fn in sys.argv[3:]:
try:
# Put your CAPTCHA image file name or file-like object, and optional
# solving timeout (in seconds) here:
captcha = client.decode(fn, DEFAULT_TIMEOUT)
except Exception, e:
except Exception as e:
sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
captcha = None
if captcha:
print 'CAPTCHA %d solved: %s' % \
(captcha['captcha'], captcha['text'])
print('CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text']))
# Report as incorrectly solved if needed. Make sure the CAPTCHA was
# in fact incorrectly solved!
# try:
# client.report(captcha['captcha'])
# except Exception, e:
# sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
try:
client.report(captcha['captcha'])
except Exception as e:
sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))

@ -40,7 +40,7 @@ import operator
import itertools
import collections
__version__ = '4.3.0'
__version__ = '4.4.0'
if sys.version >= '3':
from inspect import getfullargspec
@ -65,6 +65,12 @@ except AttributeError:
# let's assume there are no coroutine functions in old Python
def iscoroutinefunction(f):
return False
try:
from inspect import isgeneratorfunction
except ImportError:
# assume no generator function in old Python versions
def isgeneratorfunction(caller):
return False
DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(')
@ -173,7 +179,8 @@ class FunctionMaker(object):
# Ensure each generated function has a unique filename for profilers
# (such as cProfile) that depend on the tuple of (<filename>,
# <definition line>, <function name>) being unique.
filename = '<decorator-gen-%d>' % (next(self._compile_count),)
filename = '<%s:decorator-gen-%d>' % (
__file__, next(self._compile_count))
try:
code = compile(src, filename, 'single')
exec(code, evaldict)
@ -218,6 +225,8 @@ class FunctionMaker(object):
def decorate(func, caller, extras=()):
"""
decorate(func, caller) decorates a function using a caller.
If the caller is a generator function, the resulting function
will be a generator function.
"""
evaldict = dict(_call_=caller, _func_=func)
es = ''
@ -225,6 +234,20 @@ def decorate(func, caller, extras=()):
ex = '_e%d_' % i
evaldict[ex] = extra
es += ex + ', '
if '3.5' <= sys.version < '3.6':
# with Python 3.5 isgeneratorfunction returns True for all coroutines
# however we know that it is NOT possible to have a generator
# coroutine in python 3.5: PEP525 was not there yet
generatorcaller = isgeneratorfunction(
caller) and not iscoroutinefunction(caller)
else:
generatorcaller = isgeneratorfunction(caller)
if generatorcaller:
fun = FunctionMaker.create(
func, "for res in _call_(_func_, %s%%(shortsignature)s):\n"
" yield res" % es, evaldict, __wrapped__=func)
else:
fun = FunctionMaker.create(
func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
evaldict, __wrapped__=func)
@ -261,12 +284,12 @@ def decorator(caller, _func=None):
doc = caller.__call__.__doc__
evaldict = dict(_call=caller, _decorate_=decorate)
dec = FunctionMaker.create(
'%s(%s func)' % (name, defaultargs),
'%s(func, %s)' % (name, defaultargs),
'if func is None: return lambda func: _decorate_(func, _call, (%s))\n'
'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs),
evaldict, doc=doc, module=caller.__module__, __wrapped__=caller)
if defaults:
dec.__defaults__ = defaults + (None,)
dec.__defaults__ = (None,) + defaults
return dec

@ -1,4 +1,4 @@
__version__ = '0.6.5'
__version__ = '0.7.1'
from .lock import Lock # noqa
from .lock import NeedRegenerationException # noqa

@ -10,8 +10,9 @@ from ..util import compat
import time
import datetime
from numbers import Number
from functools import wraps
from functools import wraps, partial
import threading
from decorator import decorate
_backend_loader = PluginLoader("dogpile.cache")
register_backend = _backend_loader.register
@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy):
class CacheRegion(object):
"""A front end to a particular cache backend.
r"""A front end to a particular cache backend.
:param name: Optional, a string name for the region.
This isn't used internally
@ -484,6 +485,26 @@ class CacheRegion(object):
else:
return self._LockWrapper()
# cached value
_actual_backend = None
@property
def actual_backend(self):
"""Return the ultimate backend underneath any proxies.
The backend might be the result of one or more ``proxy.wrap``
applications. If so, derive the actual underlying backend.
.. versionadded:: 0.6.6
"""
if self._actual_backend is None:
_backend = self.backend
while hasattr(_backend, 'proxied'):
_backend = _backend.proxied
self._actual_backend = _backend
return self._actual_backend
def invalidate(self, hard=True):
"""Invalidate this :class:`.CacheRegion`.
@ -723,7 +744,8 @@ class CacheRegion(object):
]
def get_or_create(
self, key, creator, expiration_time=None, should_cache_fn=None):
self, key, creator, expiration_time=None, should_cache_fn=None,
creator_args=None):
"""Return a cached value based on the given key.
If the value does not exist or is considered to be expired
@ -759,6 +781,11 @@ class CacheRegion(object):
:param creator: function which creates a new value.
:param creator_args: optional tuple of (args, kwargs) that will be
passed to the creator function if present.
.. versionadded:: 0.7.0
:param expiration_time: optional expiration time which will overide
the expiration time already configured on this :class:`.CacheRegion`
if not None. To set no expiration, use the value -1.
@ -808,6 +835,9 @@ class CacheRegion(object):
return value.payload, ct
def gen_value():
if creator_args:
created_value = creator(*creator_args[0], **creator_args[1])
else:
created_value = creator()
value = self._value(created_value)
@ -831,8 +861,13 @@ class CacheRegion(object):
if self.async_creation_runner:
def async_creator(mutex):
return self.async_creation_runner(
self, orig_key, creator, mutex)
if creator_args:
@wraps(creator)
def go():
return creator(*creator_args[0], **creator_args[1])
else:
go = creator
return self.async_creation_runner(self, orig_key, go, mutex)
else:
async_creator = None
@ -896,7 +931,7 @@ class CacheRegion(object):
if (value is NO_VALUE or value.metadata['v'] != value_version or
self.region_invalidator.is_hard_invalidated(
value.metadata['v'])):
value.metadata['ct'])):
# dogpile.core understands a 0 here as
# "the value is not available", e.g.
# _has_value() will return False.
@ -1228,26 +1263,31 @@ class CacheRegion(object):
if function_key_generator is None:
function_key_generator = self.function_key_generator
def decorator(fn):
def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
key = key_generator(*arg, **kw)
timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
return self.get_or_create(key, user_func, timeout,
should_cache_fn, (arg, kw))
def cache_decorator(user_func):
if to_str is compat.string_type:
# backwards compatible
key_generator = function_key_generator(namespace, fn)
key_generator = function_key_generator(namespace, user_func)
else:
key_generator = function_key_generator(
namespace, fn,
namespace, user_func,
to_str=to_str)
@wraps(fn)
def decorate(*arg, **kw):
def refresh(*arg, **kw):
"""
Like invalidate, but regenerates the value instead
"""
key = key_generator(*arg, **kw)
@wraps(fn)
def creator():
return fn(*arg, **kw)
timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
return self.get_or_create(key, creator, timeout,
should_cache_fn)
value = user_func(*arg, **kw)
self.set(key, value)
return value
def invalidate(*arg, **kw):
key = key_generator(*arg, **kw)
@ -1261,20 +1301,18 @@ class CacheRegion(object):
key = key_generator(*arg, **kw)
return self.get(key)
def refresh(*arg, **kw):
key = key_generator(*arg, **kw)
value = fn(*arg, **kw)
self.set(key, value)
return value
user_func.set = set_
user_func.invalidate = invalidate
user_func.get = get
user_func.refresh = refresh
user_func.original = user_func
# Use `decorate` to preserve the signature of :param:`user_func`.
decorate.set = set_
decorate.invalidate = invalidate
decorate.refresh = refresh
decorate.get = get
decorate.original = fn
return decorate(user_func, partial(
get_or_create_for_user_func, key_generator))
return decorate
return decorator
return cache_decorator
def cache_multi_on_arguments(
self, namespace=None, expiration_time=None,
@ -1402,20 +1440,14 @@ class CacheRegion(object):
if function_multi_key_generator is None:
function_multi_key_generator = self.function_multi_key_generator
def decorator(fn):
key_generator = function_multi_key_generator(
namespace, fn,
to_str=to_str)
@wraps(fn)
def decorate(*arg, **kw):
def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
cache_keys = arg
keys = key_generator(*arg, **kw)
key_lookup = dict(zip(keys, cache_keys))
@wraps(fn)
@wraps(user_func)
def creator(*keys_to_create):
return fn(*[key_lookup[k] for k in keys_to_create])
return user_func(*[key_lookup[k] for k in keys_to_create])
timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
@ -1447,6 +1479,11 @@ class CacheRegion(object):
return result
def cache_decorator(user_func):
key_generator = function_multi_key_generator(
namespace, user_func,
to_str=to_str)
def invalidate(*arg):
keys = key_generator(*arg)
self.delete_multi(keys)
@ -1466,7 +1503,7 @@ class CacheRegion(object):
def refresh(*arg):
keys = key_generator(*arg)
values = fn(*arg)
values = user_func(*arg)
if asdict:
self.set_multi(
dict(zip(keys, [values[a] for a in arg]))
@ -1478,13 +1515,18 @@ class CacheRegion(object):
)
return values
decorate.set = set_
decorate.invalidate = invalidate
decorate.refresh = refresh
decorate.get = get
user_func.set = set_
user_func.invalidate = invalidate
user_func.refresh = refresh
user_func.get = get
# Use `decorate` to preserve the signature of :param:`user_func`.
return decorate(user_func, partial(get_or_create_for_user_func, key_generator))
return cache_decorator
return decorate
return decorator
def make_region(*arg, **kw):

@ -1,5 +1,4 @@
from hashlib import sha1
import inspect
from ..util import compat
from ..util import langhelpers
@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type):
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn)
args = compat.inspect_getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_key(*args, **kw):
@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type):
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn)
args = compat.inspect_getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_keys(*args, **kw):
@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
argspec = inspect.getargspec(fn)
argspec = compat.inspect_getargspec(fn)
default_list = list(argspec.defaults or [])
# Reverse the list, as we want to compare the argspec by negative index,
# meaning default_list[0] should be args[-1], which works well with

@ -69,8 +69,7 @@ class Lock(object):
"""Return true if the expiration time is reached, or no
value is available."""
return not self._has_value(createdtime) or \
(
return not self._has_value(createdtime) or (
self.expiretime is not None and
time.time() - createdtime > self.expiretime
)
@ -91,68 +90,100 @@ class Lock(object):
value = NOT_REGENERATED
createdtime = -1
generated = self._enter_create(createdtime)
generated = self._enter_create(value, createdtime)
if generated is not NOT_REGENERATED:
generated, createdtime = generated
return generated
elif value is NOT_REGENERATED:
# we called upon the creator, and it said that it
# didn't regenerate. this typically means another
# thread is running the creation function, and that the
# cache should still have a value. However,
# we don't have a value at all, which is unusual since we just
# checked for it, so check again (TODO: is this a real codepath?)
try:
value, createdtime = value_fn()
return value
except NeedRegenerationException:
raise Exception("Generation function should "
raise Exception(
"Generation function should "
"have just been called by a concurrent "
"thread.")
else:
return value
def _enter_create(self, createdtime):
def _enter_create(self, value, createdtime):
if not self._is_expired(createdtime):
return NOT_REGENERATED
async = False
_async = False
if self._has_value(createdtime):
has_value = True
if not self.mutex.acquire(False):
log.debug("creation function in progress "
log.debug(
"creation function in progress "
"elsewhere, returning")
return NOT_REGENERATED
else:
has_value = False
log.debug("no value, waiting for create lock")
self.mutex.acquire()
try:
log.debug("value creation lock %r acquired" % self.mutex)
# see if someone created the value already
if not has_value:
# we entered without a value, or at least with "creationtime ==
# 0". Run the "getter" function again, to see if another
# thread has already generated the value while we waited on the
# mutex, or if the caller is otherwise telling us there is a
# value already which allows us to use async regeneration. (the
# latter is used by the multi-key routine).
try:
value, createdtime = self.value_and_created_fn()
except NeedRegenerationException:
# nope, nobody created the value, we're it.
# we must create it right now
pass
else:
has_value = True
# caller is telling us there is a value and that we can
# use async creation if it is expired.
if not self._is_expired(createdtime):
log.debug("value already present")
# it's not expired, return it
log.debug("Concurrent thread created the value")
return value, createdtime
elif self.async_creator:
# otherwise it's expired, call creator again
if has_value and self.async_creator:
# we have a value we can return, safe to use async_creator
log.debug("Passing creation lock to async runner")
# so...run it!
self.async_creator(self.mutex)
async = True
_async = True
# and return the expired value for now
return value, createdtime
log.debug("Calling creation function")
created = self.creator()
return created
# it's expired, and it's our turn to create it synchronously, *or*,
# there's no value at all, and we have to create it synchronously
log.debug(
"Calling creation function for %s value",
"not-yet-present" if not has_value else
"previously expired"
)
return self.creator()
finally:
if not async:
if not _async:
self.mutex.release()
log.debug("Released creation lock")
def __enter__(self):
return self._enter()
def __exit__(self, type, value, traceback):
pass

@ -51,11 +51,33 @@ else:
import thread # noqa
if py3k:
import collections
ArgSpec = collections.namedtuple(
"ArgSpec",
["args", "varargs", "keywords", "defaults"])
from inspect import getfullargspec as inspect_getfullargspec
def inspect_getargspec(func):
return ArgSpec(
*inspect_getfullargspec(func)[0:4]
)
else:
from inspect import getargspec as inspect_getargspec # noqa
if py3k or jython:
import pickle
else:
import cPickle as pickle # noqa
if py3k:
def read_config_file(config, fileobj):
return config.read_file(fileobj)
else:
def read_config_file(config, fileobj):
return config.readfp(fileobj)
def timedelta_total_seconds(td):
if py27:

@ -50,7 +50,7 @@ class NameRegistry(object):
self.creator = creator
def get(self, identifier, *args, **kw):
"""Get and possibly create the value.
r"""Get and possibly create the value.
:param identifier: Hash key for the value.
If the creation function is called, this identifier
@ -75,10 +75,12 @@ class NameRegistry(object):
if identifier in self._values:
return self._values[identifier]
else:
self._values[identifier] = value = self.creator(identifier, *args, **kw)
self._values[identifier] = value = self.creator(
identifier, *args, **kw)
return value
except KeyError:
self._values[identifier] = value = self.creator(identifier, *args, **kw)
self._values[identifier] = value = self.creator(
identifier, *args, **kw)
return value
finally:
self._mutex.release()

@ -23,7 +23,7 @@ class ReadWriteMutex(object):
def __init__(self):
# counts how many asynchronous methods are executing
self.async = 0
self.async_ = 0
# pointer to thread that is the current sync operation
self.current_sync_operation = None
@ -45,7 +45,7 @@ class ReadWriteMutex(object):
if self.current_sync_operation is not None:
return False
self.async += 1
self.async_ += 1
log.debug("%s acquired read lock", self)
finally:
self.condition.release()
@ -57,16 +57,16 @@ class ReadWriteMutex(object):
"""Release the 'read' lock."""
self.condition.acquire()
try:
self.async -= 1
self.async_ -= 1
# check if we are the last asynchronous reader thread
# out the door.
if self.async == 0:
if self.async_ == 0:
# yes. so if a sync operation is waiting, notifyAll to wake
# it up
if self.current_sync_operation is not None:
self.condition.notifyAll()
elif self.async < 0:
elif self.async_ < 0:
raise LockError("Synchronizer error - too many "
"release_read_locks called")
log.debug("%s released read lock", self)
@ -96,7 +96,7 @@ class ReadWriteMutex(object):
self.current_sync_operation = threading.currentThread()
# now wait again for asyncs to finish
if self.async > 0:
if self.async_ > 0:
if wait:
# wait
self.condition.wait()

@ -6,8 +6,16 @@
# s/class \(\w\+\):/class \1(object):/
# Use iterator versions of map and range:
try:
from itertools import imap as map
except ImportError:
imap = map
try:
import xrange
range = xrange
except ImportError:
pass
# Except that xrange only supports machine integers, not longs, so...
def long_range(start, end):

@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
# Bazarr patch to use custom ConfigParser2:
from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError
#try:
# from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
#except ImportError:
# from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
try:
from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
except ImportError:
from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
class simpleconfigparser(configparser):

@ -1,4 +1,4 @@
# Copyright (c) 2010-2017 Benjamin Peterson
# Copyright (c) 2010-2018 Benjamin Peterson
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@ -29,7 +29,7 @@ import sys
import types
__author__ = "Benjamin Peterson <benjamin@python.org>"
__version__ = "1.11.0"
__version__ = "1.12.0"
# Useful for very coarse version differentiation.
@ -844,10 +844,71 @@ def add_metaclass(metaclass):
orig_vars.pop(slots_var)
orig_vars.pop('__dict__', None)
orig_vars.pop('__weakref__', None)
if hasattr(cls, '__qualname__'):
orig_vars['__qualname__'] = cls.__qualname__
return metaclass(cls.__name__, cls.__bases__, orig_vars)
return wrapper
def ensure_binary(s, encoding='utf-8', errors='strict'):
"""Coerce **s** to six.binary_type.
For Python 2:
- `unicode` -> encoded to `str`
- `str` -> `str`
For Python 3:
- `str` -> encoded to `bytes`
- `bytes` -> `bytes`
"""
if isinstance(s, text_type):
return s.encode(encoding, errors)
elif isinstance(s, binary_type):
return s
else:
raise TypeError("not expecting type '%s'" % type(s))
def ensure_str(s, encoding='utf-8', errors='strict'):
"""Coerce *s* to `str`.
For Python 2:
- `unicode` -> encoded to `str`
- `str` -> `str`
For Python 3:
- `str` -> `str`
- `bytes` -> decoded to `str`
"""
if not isinstance(s, (text_type, binary_type)):
raise TypeError("not expecting type '%s'" % type(s))
if PY2 and isinstance(s, text_type):
s = s.encode(encoding, errors)
elif PY3 and isinstance(s, binary_type):
s = s.decode(encoding, errors)
return s
def ensure_text(s, encoding='utf-8', errors='strict'):
"""Coerce *s* to six.text_type.
For Python 2:
- `unicode` -> `unicode`
- `str` -> `unicode`
For Python 3:
- `str` -> `str`
- `bytes` -> decoded to `str`
"""
if isinstance(s, binary_type):
return s.decode(encoding, errors)
elif isinstance(s, text_type):
return s
else:
raise TypeError("not expecting type '%s'" % type(s))
def python_2_unicode_compatible(klass):
"""
A decorator that defines __unicode__ and __str__ methods under Python 2.

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
__title__ = 'subliminal'
__version__ = '2.1.0.dev'
__version__ = '2.0.5'
__short_version__ = '.'.join(__version__.split('.')[:2])
__author__ = 'Antoine Bertin'
__license__ = 'MIT'

@ -219,12 +219,13 @@ config_file = 'config.ini'
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
help='OpenSubtitles configuration.')
@click.option('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.')
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
show_default=True, expose_value=True, help='Path to the cache directory.')
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
@click.version_option(__version__)
@click.pass_context
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug):
"""Subtitles, faster than your thoughts."""
# create cache directory
try:
@ -252,6 +253,8 @@ def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
if opensubtitles:
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
if subscenter:
ctx.obj['provider_configs']['subscenter'] = {'username': subscenter[0], 'password': subscenter[1]}
@subliminal.command()

@ -1,38 +1,19 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import platform
is_windows_special_path = False
if platform.system() == "Windows":
try:
__file__.decode("ascii")
except UnicodeDecodeError:
is_windows_special_path = True
if not is_windows_special_path:
from concurrent.futures import ThreadPoolExecutor
else:
ThreadPoolExecutor = object
from datetime import datetime
import io
import itertools
import logging
import operator
import os
import os.path
import socket
from babelfish import Language, LanguageReverseError
from guessit import guessit
from six.moves.xmlrpc_client import ProtocolError
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
from zipfile import BadZipfile
from ssl import SSLError
from rarfile import NotRarFile, RarCannotExec, RarFile
import requests
from .exceptions import ServiceUnavailable
from .extensions import provider_manager, refiner_manager
from .score import compute_score as default_compute_score
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
@ -98,18 +79,6 @@ class ProviderPool(object):
self.initialized_providers[name].terminate()
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out, improperly terminated', name)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, improperly terminated', name)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
except:
logger.exception('Provider %r terminated unexpectedly', name)
@ -149,18 +118,6 @@ class ProviderPool(object):
return self[provider].list_subtitles(video, provider_languages)
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out', provider)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable', provider)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r http error %r', provider, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r SSL error %r', provider, e.args[0])
except:
logger.exception('Unexpected error in provider %r', provider)
@ -216,28 +173,6 @@ class ProviderPool(object):
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
e.response.status_code)
self.discarded_providers.add(subtitle.provider_name)
return False
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
self.discarded_providers.add(subtitle.provider_name)
return False
except (BadRarFile, BadZipfile):
logger.error('Bad archive for %r', subtitle)
return False
except:
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
@ -557,13 +492,7 @@ def scan_videos(path, age=None, archives=True):
continue
# skip old files
try:
file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath))
except ValueError:
logger.warning('Could not get age of file %r in %r', filename, dirpath)
continue
else:
if age and datetime.utcnow() - file_age > age:
if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age:
logger.debug('Skipping old file %r in %r', filename, dirpath)
continue
@ -612,8 +541,7 @@ def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
try:
refiner_manager[refiner].plugin(video, **kwargs)
except:
logger.error('Failed to refine video %r', video.name)
logger.debug('Refiner exception:', exc_info=True)
logger.exception('Failed to refine video')
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):

@ -19,8 +19,8 @@ class AuthenticationError(ProviderError):
pass
class ServiceUnavailable(ProviderError):
"""Exception raised when status is '503 Service Unavailable'."""
class TooManyRequests(ProviderError):
"""Exception raised by providers when too many requests are made."""
pass

@ -29,9 +29,9 @@ class RegistrableExtensionManager(ExtensionManager):
super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)
def list_entry_points(self):
def _find_entry_points(self, namespace):
# copy of default extensions
eps = list(super(RegistrableExtensionManager, self).list_entry_points())
eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace))
# internal extensions
for iep in self.internal_extensions:
@ -93,6 +93,7 @@ provider_manager = RegistrableExtensionManager('subliminal.providers', [
'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
'shooter = subliminal.providers.shooter:ShooterProvider',
'subscenter = subliminal.providers.subscenter:SubsCenterProvider',
'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
])

@ -68,9 +68,6 @@ class Provider(object):
#: Required hash, if any
required_hash = None
#: Subtitle class to use
subtitle_class = None
def __enter__(self):
self.initialize()
return self

@ -9,7 +9,7 @@ from requests import Session
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group
@ -19,11 +19,8 @@ logger = logging.getLogger(__name__)
language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
# Series cell matching regex
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)
#: Series header parsing regex
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$')
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')
class Addic7edSubtitle(Subtitle):
@ -32,7 +29,7 @@ class Addic7edSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
download_link):
super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link)
self.series = series
self.season = season
self.episode = episode
@ -48,9 +45,8 @@ class Addic7edSubtitle(Subtitle):
def get_matches(self, video):
matches = set()
# series name
if video.series and sanitize(self.series) in (
sanitize(name) for name in [video.series] + video.alternative_series):
# series
if video.series and sanitize(self.series) == sanitize(video.series):
matches.add('series')
# season
if video.season and self.season == video.season:
@ -58,7 +54,7 @@ class Addic7edSubtitle(Subtitle):
# episode
if video.episode and self.episode == video.episode:
matches.add('episode')
# title of the episode
# title
if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title')
# year
@ -90,23 +86,21 @@ class Addic7edProvider(Provider):
]}
video_types = (Episode,)
server_url = 'http://www.addic7ed.com/'
subtitle_class = Addic7edSubtitle
def __init__(self, username=None, password=None):
if any((username, password)) and not all((username, password)):
if username is not None and password is None or username is None and password is not None:
raise ConfigurationError('Username and password must be specified')
self.username = username
self.password = password
self.logged_in = False
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login
if self.username and self.password:
if self.username is not None and self.password is not None:
logger.info('Logging in')
data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)
@ -140,16 +134,7 @@ class Addic7edProvider(Provider):
logger.info('Getting show ids')
r = self.session.get(self.server_url + 'shows.php', timeout=10)
r.raise_for_status()
# LXML parser seems to fail when parsing Addic7ed.com HTML markup.
# Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
# Assuming the site's markup is bad, and stripping it down to only contain what's needed.
show_cells = re.findall(show_cells_re, r.content)
if show_cells:
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
else:
# If RegEx fails, fall back to original r.content and use 'html.parser'
soup = ParserBeautifulSoup(r.content, ['html.parser'])
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# populate the show ids
show_ids = {}
@ -181,6 +166,8 @@ class Addic7edProvider(Provider):
logger.info('Searching show ids with %r', params)
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
r.raise_for_status()
if r.status_code == 304:
raise TooManyRequests()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# get the suggestion
@ -231,23 +218,24 @@ class Addic7edProvider(Provider):
# search as last resort
if not show_id:
logger.warning('Series %s not found in show ids', series)
logger.warning('Series not found in show ids')
show_id = self._search_show_id(series)
return show_id
def query(self, show_id, series, season, year=None, country=None):
def query(self, series, season, year=None, country=None):
# get the show id
show_id = self.get_show_id(series, year, country)
if show_id is None:
logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
return []
# get the page of the season of the show
logger.info('Getting the page of show id %d, season %d', show_id, season)
r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
r.raise_for_status()
if not r.content:
# Provider returns a status of 304 Not Modified with an empty content
# raise_for_status won't raise exception for that status code
logger.debug('No data returned from provider')
return []
if r.status_code == 304:
raise TooManyRequests()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# loop over subtitle rows
@ -274,7 +262,7 @@ class Addic7edProvider(Provider):
version = cells[4].text
download_link = cells[9].a['href'][1:]
subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year,
subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year,
version, download_link)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
@ -282,24 +270,8 @@ class Addic7edProvider(Provider):
return subtitles
def list_subtitles(self, video, languages):
# lookup show_id
titles = [video.series] + video.alternative_series
show_id = None
for title in titles:
show_id = self.get_show_id(title, video.year)
if show_id is not None:
break
# query for subtitles with the show_id
if show_id is not None:
subtitles = [s for s in self.query(show_id, title, video.season, video.year)
return [s for s in self.query(video.series, video.season, video.year)
if s.language in languages and s.episode == video.episode]
if subtitles:
return subtitles
else:
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
return []
def download_subtitle(self, subtitle):
# download the subtitle
@ -308,12 +280,6 @@ class Addic7edProvider(Provider):
timeout=10)
r.raise_for_status()
if not r.content:
# Provider returns a status of 304 Not Modified with an empty content
# raise_for_status won't raise exception for that status code
logger.debug('Unable to download subtitle. No data returned from provider')
return
# detect download limit exceeded
if r.headers['Content-Type'] == 'text/html':
raise DownloadLimitExceeded

@ -18,7 +18,7 @@ from zipfile import ZipFile, is_zipfile
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
from ..video import Episode, Movie
@ -44,11 +44,8 @@ rating_re = re.compile(r'nota (?P<rating>\d+)')
#: Timestamp parsing regex
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
#: Title with year/country regex
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
#: Cache key for releases
releases_key = __name__ + ':releases|{archive_id}|{archive_name}'
releases_key = __name__ + ':releases|{archive_id}'
class LegendasTVArchive(object):
@ -63,8 +60,8 @@ class LegendasTVArchive(object):
:param int rating: rating (0-10).
:param timestamp: timestamp.
:type timestamp: datetime.datetime
"""
"""
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
#: Identifier
self.id = id
@ -99,11 +96,10 @@ class LegendasTVArchive(object):
class LegendasTVSubtitle(Subtitle):
"""LegendasTV Subtitle."""
provider_name = 'legendastv'
def __init__(self, language, type, title, year, imdb_id, season, archive, name):
super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link)
super(LegendasTVSubtitle, self).__init__(language, archive.link)
self.type = type
self.title = title
self.year = year
@ -122,12 +118,11 @@ class LegendasTVSubtitle(Subtitle):
# episode
if isinstance(video, Episode) and self.type == 'episode':
# series
if video.series and (sanitize(self.title) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
if video.series and sanitize(self.title) == sanitize(video.series):
matches.add('series')
# year
if video.original_series and self.year is None or video.year and video.year == self.year:
# year (year is based on season air date hence the adjustment)
if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1:
matches.add('year')
# imdb_id
@ -137,8 +132,7 @@ class LegendasTVSubtitle(Subtitle):
# movie
elif isinstance(video, Movie) and self.type == 'movie':
# title
if video.title and (sanitize(self.title) in (
sanitize(name) for name in [video.title] + video.alternative_titles)):
if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title')
# year
@ -149,6 +143,9 @@ class LegendasTVSubtitle(Subtitle):
if video.imdb_id and self.imdb_id == video.imdb_id:
matches.add('imdb_id')
# archive name
matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type}))
# name
matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
@ -160,38 +157,29 @@ class LegendasTVProvider(Provider):
:param str username: username.
:param str password: password.
"""
"""
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
server_url = 'http://legendas.tv/'
subtitle_class = LegendasTVSubtitle
def __init__(self, username=None, password=None):
# Provider needs UNRAR installed. If not available raise ConfigurationError
try:
rarfile.custom_check(rarfile.UNRAR_TOOL)
except rarfile.RarExecError:
raise ConfigurationError('UNRAR tool not available')
if any((username, password)) and not all((username, password)):
if username and not password or not username and password:
raise ConfigurationError('Username and password must be specified')
self.username = username
self.password = password
self.logged_in = False
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login
if self.username and self.password:
if self.username is not None and self.password is not None:
logger.info('Logging in')
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
raise_for_status(r)
r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['html.parser'])
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
@ -205,87 +193,41 @@ class LegendasTVProvider(Provider):
if self.logged_in:
logger.info('Logging out')
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
raise_for_status(r)
r.raise_for_status()
logger.debug('Logged out')
self.logged_in = False
self.session.close()
@staticmethod
def is_valid_title(title, title_id, sanitized_title, season, year):
"""Check if is a valid title."""
sanitized_result = sanitize(title['title'])
if sanitized_result != sanitized_title:
logger.debug("Mismatched title, discarding title %d (%s)",
title_id, sanitized_result)
return
# episode type
if season:
# discard mismatches on type
if title['type'] != 'episode':
logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on season
if 'season' not in title or title['season'] != season:
logger.debug('Mismatched season %s, discarding title %d (%s)',
title.get('season'), title_id, sanitized_result)
return
# movie type
else:
# discard mismatches on type
if title['type'] != 'movie':
logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on year
if year is not None and 'year' in title and title['year'] != year:
logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
return
return True
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
def search_titles(self, title, season, title_year):
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def search_titles(self, title):
"""Search for titles matching the `title`.
For episodes, each season has it own title
:param str title: the title to search for.
:param int season: season of the title
:param int title_year: year of the title
:return: found titles.
:rtype: dict
"""
titles = {}
sanitized_titles = [sanitize(title)]
ignore_characters = {'\'', '.'}
if any(c in title for c in ignore_characters):
sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
for sanitized_title in sanitized_titles:
"""
# make the query
if season:
logger.info('Searching episode title %r for season %r', sanitized_title, season)
else:
logger.info('Searching movie title %r', sanitized_title)
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
raise_for_status(r)
logger.info('Searching title %r', title)
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10)
r.raise_for_status()
results = json.loads(r.text)
# loop over results
titles = {}
for result in results:
source = result['_source']
# extract id
title_id = int(source['id_filme'])
# extract type
title = {'type': type_map[source['tipo']]}
# extract type and title
title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']}
# extract title, year and country
name, year, country = title_re.match(source['dsc_nome']).groups()
title['title'] = name
# extract year
if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
title['year'] = int(source['dsc_data_lancamento'])
# extract imdb_id
if source['id_imdb'] != '0':
@ -303,18 +245,9 @@ class LegendasTVProvider(Provider):
if match:
title['season'] = int(match.group('season'))
else:
logger.debug('No season detected for title %d (%s)', title_id, name)
logger.warning('No season detected for title %d', title_id)
# extract year
if year:
title['year'] = int(year)
elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
# year is based on season air date hence the adjustment
title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1
# add title only if is valid
# Check against title without ignored chars
if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
# add title
titles[title_id] = title
logger.debug('Found %d titles', len(titles))
@ -322,57 +255,32 @@ class LegendasTVProvider(Provider):
return titles
@region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
def get_archives(self, title_id, language_code, title_type, season, episode):
"""Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`.
def get_archives(self, title_id, language_code):
"""Get the archive list from a given `title_id` and `language_code`.
:param int title_id: title id.
:param int language_code: language code.
:param str title_type: episode or movie
:param int season: season
:param int episode: episode
:return: the archives.
:rtype: list of :class:`LegendasTVArchive`
"""
logger.info('Getting archives for title %d and language %d', title_id, language_code)
archives = []
page = 0
page = 1
while True:
# get the archive page
url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
language=language_code, page=page, title=title_id)
url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format(
title=title_id, language=language_code, page=page)
r = self.session.get(url)
raise_for_status(r)
r.raise_for_status()
# parse the results
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
for archive_soup in soup.select('div.list_element > article > div'):
# create archive
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
archive_soup.a.text,
'pack' in archive_soup.parent['class'],
'destaque' in archive_soup.parent['class'],
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text,
'pack' in archive_soup['class'], 'destaque' in archive_soup['class'],
self.server_url + archive_soup.a['href'][1:])
# clean name of path separators and pack flags
clean_name = archive.name.replace('/', '-')
if archive.pack and clean_name.startswith('(p)'):
clean_name = clean_name[3:]
# guess from name
guess = guessit(clean_name, {'type': title_type})
# episode
if season and episode:
# discard mismatches on episode in non-pack archives
# Guessit may return int for single episode or list for multi-episode
# Check if archive name has multiple episodes releases on it
if not archive.pack and 'episode' in guess:
wanted_episode = set(episode) if isinstance(episode, list) else {episode}
archive_episode = guess['episode'] if isinstance(guess['episode'], list) else {guess['episode']}
if not wanted_episode.intersection(archive_episode):
logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
continue
# extract text containing downloads, rating and timestamp
data_text = archive_soup.find('p', class_='data').text
@ -392,8 +300,6 @@ class LegendasTVProvider(Provider):
raise ProviderError('Archive timestamp is in the future')
# add archive
logger.info('Found archive for title %d and language %d at page %s: %s',
title_id, language_code, page, archive)
archives.append(archive)
# stop on last page
@ -416,7 +322,7 @@ class LegendasTVProvider(Provider):
"""
logger.info('Downloading archive %s', archive.id)
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
raise_for_status(r)
r.raise_for_status()
# open the archive
archive_stream = io.BytesIO(r.content)
@ -431,26 +337,60 @@ class LegendasTVProvider(Provider):
def query(self, language, title, season=None, episode=None, year=None):
# search for titles
titles = self.search_titles(title, season, year)
titles = self.search_titles(sanitize(title))
# search for titles with the quote or dot character
ignore_characters = {'\'', '.'}
if any(c in title for c in ignore_characters):
titles.update(self.search_titles(sanitize(title, ignore_characters=ignore_characters)))
subtitles = []
# iterate over titles
for title_id, t in titles.items():
# discard mismatches on title
if sanitize(t['title']) != sanitize(title):
continue
# episode
if season and episode:
# discard mismatches on type
if t['type'] != 'episode':
continue
# discard mismatches on season
if 'season' not in t or t['season'] != season:
continue
# movie
else:
# discard mismatches on type
if t['type'] != 'movie':
continue
logger.info('Getting archives for title %d and language %d', title_id, language.legendastv)
archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode)
if not archives:
logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
# discard mismatches on year
if year is not None and 'year' in t and t['year'] != year:
continue
# iterate over title's archives
for a in archives:
for a in self.get_archives(title_id, language.legendastv):
# clean name of path separators and pack flags
clean_name = a.name.replace('/', '-')
if a.pack and clean_name.startswith('(p)'):
clean_name = clean_name[3:]
# guess from name
guess = guessit(clean_name, {'type': t['type']})
# episode
if season and episode:
# discard mismatches on episode in non-pack archives
if not a.pack and 'episode' in guess and guess['episode'] != episode:
continue
# compute an expiration time based on the archive timestamp
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
# attempt to get the releases from the cache
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
releases = region.get(cache_key, expiration_time=expiration_time)
releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time)
# the releases are not in cache or cache is expired
if releases == NO_VALUE:
@ -477,11 +417,11 @@ class LegendasTVProvider(Provider):
releases.append(name)
# cache the releases
region.set(cache_key, releases)
region.set(releases_key.format(archive_id=a.id), releases)
# iterate over releases
for r in releases:
subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
t.get('season'), a, r)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
@ -491,19 +431,13 @@ class LegendasTVProvider(Provider):
def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode):
titles = [video.series] + video.alternative_series
title = video.series
season = video.season
episode = video.episode
else:
titles = [video.title] + video.alternative_titles
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
title = video.title
return []
return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)]
def download_subtitle(self, subtitle):
# download archive in case we previously hit the releases cache and didn't download it
@ -512,11 +446,3 @@ class LegendasTVProvider(Provider):
# extract subtitle's content
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
def raise_for_status(r):
# When site is under maintaince and http status code 200.
if 'Em breve estaremos de volta' in r.text:
raise ServiceUnavailable
else:
r.raise_for_status()

@ -42,7 +42,6 @@ class NapiProjektSubtitle(Subtitle):
def __init__(self, language, hash):
super(NapiProjektSubtitle, self).__init__(language)
self.hash = hash
self.content = None
@property
def id(self):
@ -63,10 +62,6 @@ class NapiProjektProvider(Provider):
languages = {Language.fromalpha2(l) for l in ['pl']}
required_hash = 'napiprojekt'
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
subtitle_class = NapiProjektSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
@ -86,16 +81,16 @@ class NapiProjektProvider(Provider):
'f': hash,
't': get_subhash(hash)}
logger.info('Searching subtitle %r', params)
r = self.session.get(self.server_url, params=params, timeout=10)
r.raise_for_status()
response = self.session.get(self.server_url, params=params, timeout=10)
response.raise_for_status()
# handle subtitles not found and errors
if r.content[:4] == b'NPc0':
if response.content[:4] == b'NPc0':
logger.debug('No subtitles found')
return None
subtitle = self.subtitle_class(language, hash)
subtitle.content = r.content
subtitle = NapiProjektSubtitle(language, hash)
subtitle.content = response.content
logger.debug('Found subtitle %r', subtitle)
return subtitle

@ -11,8 +11,7 @@ from six.moves.xmlrpc_client import ServerProxy
from . import Provider, TimeoutSafeTransport
from .. import __short_version__
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError,
ServiceUnavailable)
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie
@ -27,8 +26,7 @@ class OpenSubtitlesSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired,
page_link=page_link, encoding=encoding)
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding)
self.subtitle_id = subtitle_id
self.matched_by = matched_by
self.movie_kind = movie_kind
@ -60,7 +58,6 @@ class OpenSubtitlesSubtitle(Subtitle):
if isinstance(video, Episode) and self.movie_kind == 'episode':
# tag match, assume series, year, season and episode matches
if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
matches |= {'series', 'year', 'season', 'episode'}
# series
if video.series and sanitize(self.series_name) == sanitize(video.series):
@ -90,7 +87,6 @@ class OpenSubtitlesSubtitle(Subtitle):
elif isinstance(video, Movie) and self.movie_kind == 'movie':
# tag match, assume title and year matches
if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
matches |= {'title', 'year'}
# title
if video.title and sanitize(self.movie_name) == sanitize(video.title):
@ -126,11 +122,10 @@ class OpenSubtitlesProvider(Provider):
"""
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
subtitle_class = OpenSubtitlesSubtitle
def __init__(self, username=None, password=None):
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
if any((username, password)) and not all((username, password)):
if username and not password or not username and password:
raise ConfigurationError('Username and password must be specified')
# None values not allowed for logging in, so replace it by ''
self.username = username or ''
@ -161,9 +156,6 @@ class OpenSubtitlesProvider(Provider):
if hash and size:
criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
if imdb_id:
if season and episode:
criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
else:
criteria.append({'imdbid': imdb_id[2:]})
if tag:
criteria.append({'tag': tag})
@ -207,7 +199,7 @@ class OpenSubtitlesProvider(Provider):
filename = subtitle_item['SubFileName']
encoding = subtitle_item.get('SubEncoding') or None
subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
series_season, series_episode, filename, encoding)
logger.debug('Found subtitle %r by %s', subtitle, matched_by)
@ -268,6 +260,11 @@ class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
pass
class ServiceUnavailable(OpenSubtitlesError):
"""Exception raised when status is '503 Service Unavailable'."""
pass
def checked(response):
"""Check a response status before returning it.

@ -31,7 +31,7 @@ class PodnapisiSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
year=None):
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link)
self.pid = pid
self.releases = releases
self.title = title
@ -49,8 +49,7 @@ class PodnapisiSubtitle(Subtitle):
# episode
if isinstance(video, Episode):
# series
if video.series and (sanitize(self.title) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
if video.series and sanitize(self.title) == sanitize(video.series):
matches.add('series')
# year
if video.original_series and self.year is None or video.year and video.year == self.year:
@ -67,8 +66,7 @@ class PodnapisiSubtitle(Subtitle):
# movie
elif isinstance(video, Movie):
# title
if video.title and (sanitize(self.title) in (
sanitize(name) for name in [video.title] + video.alternative_titles)):
if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title')
# year
if video.year and self.year == video.year:
@ -84,11 +82,7 @@ class PodnapisiProvider(Provider):
"""Podnapisi Provider."""
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
server_url = 'https://www.podnapisi.net/subtitles/'
subtitle_class = PodnapisiSubtitle
def __init__(self):
self.session = None
server_url = 'http://podnapisi.net/subtitles/'
def initialize(self):
self.session = Session()
@ -114,9 +108,7 @@ class PodnapisiProvider(Provider):
pids = set()
while True:
# query the server
r = self.session.get(self.server_url + 'search/old', params=params, timeout=10)
r.raise_for_status()
xml = etree.fromstring(r.content)
xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content)
# exit if no results
if not int(xml.find('pagination/results').text):
@ -126,14 +118,10 @@ class PodnapisiProvider(Provider):
# loop over subtitles
for subtitle_xml in xml.findall('subtitle'):
# read xml elements
pid = subtitle_xml.find('pid').text
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
if pid in pids:
continue
language = Language.fromietf(subtitle_xml.find('language').text)
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
page_link = subtitle_xml.find('url').text
pid = subtitle_xml.find('pid').text
releases = []
if subtitle_xml.find('release').text:
for release in subtitle_xml.find('release').text.split():
@ -146,12 +134,16 @@ class PodnapisiProvider(Provider):
year = int(subtitle_xml.find('year').text)
if is_episode:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
season=season, episode=episode, year=year)
else:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
year=year)
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
if pid in pids:
continue
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
pids.add(pid)
@ -167,21 +159,11 @@ class PodnapisiProvider(Provider):
return subtitles
def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode):
titles = [video.series] + video.alternative_series
season = video.season
episode = video.episode
else:
titles = [video.title] + video.alternative_titles
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
return []
return [s for l in languages for s in self.query(l, video.series, season=video.season,
episode=video.episode, year=video.year)]
elif isinstance(video, Movie):
return [s for l in languages for s in self.query(l, video.title, year=video.year)]
def download_subtitle(self, subtitle):
# download as a zip

@ -42,10 +42,6 @@ class ShooterProvider(Provider):
"""Shooter Provider."""
languages = {Language(l) for l in ['eng', 'zho']}
server_url = 'https://www.shooter.cn/api/subapi.php'
subtitle_class = ShooterSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
@ -68,7 +64,7 @@ class ShooterProvider(Provider):
# parse the subtitles
results = json.loads(r.text)
subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']]
subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']]
return subtitles

@ -26,7 +26,7 @@ class SubsCenterSubtitle(Subtitle):
provider_name = 'subscenter'
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
subtitle_version, downloaded, releases):
downloaded, releases):
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
self.series = series
self.season = season
@ -34,7 +34,6 @@ class SubsCenterSubtitle(Subtitle):
self.title = title
self.subtitle_id = subtitle_id
self.subtitle_key = subtitle_key
self.subtitle_version = subtitle_version
self.downloaded = downloaded
self.releases = releases
@ -75,8 +74,7 @@ class SubsCenterSubtitle(Subtitle):
class SubsCenterProvider(Provider):
"""SubsCenter Provider."""
languages = {Language.fromalpha2(l) for l in ['he']}
server_url = 'http://www.subscenter.org/he/'
subtitle_class = SubsCenterSubtitle
server_url = 'http://www.subscenter.co/he/'
def __init__(self, username=None, password=None):
if username is not None and password is None or username is None and password is not None:
@ -191,7 +189,6 @@ class SubsCenterProvider(Provider):
hearing_impaired = bool(subtitle_item['hearing_impaired'])
subtitle_id = subtitle_item['id']
subtitle_key = subtitle_item['key']
subtitle_version = subtitle_item['h_version']
downloaded = subtitle_item['downloaded']
release = subtitle_item['subtitle_version']
@ -203,9 +200,8 @@ class SubsCenterProvider(Provider):
continue
# otherwise create it
subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
title, subtitle_id, subtitle_key, subtitle_version, downloaded,
[release])
subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode,
title, subtitle_id, subtitle_key, downloaded, [release])
logger.debug('Found subtitle %r', subtitle)
subtitles[subtitle_id] = subtitle
@ -225,12 +221,11 @@ class SubsCenterProvider(Provider):
def download_subtitle(self, subtitle):
# download
url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key}
r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
r.raise_for_status()
# open the zip
try:
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
# remove some filenames from the namelist
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
@ -238,6 +233,3 @@ class SubsCenterProvider(Provider):
raise ProviderError('More than one file to unzip')
subtitle.content = fix_line_ending(zf.read(namelist[0]))
except zipfile.BadZipfile:
# if no zip file was retrieved, daily downloads limit has exceeded
raise ProviderError('Daily limit exceeded')

@ -40,10 +40,6 @@ class TheSubDBProvider(Provider):
languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
required_hash = 'thesubdb'
server_url = 'http://api.thesubdb.com/'
subtitle_class = TheSubDBSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
@ -70,7 +66,7 @@ class TheSubDBProvider(Provider):
for language_code in r.text.split(','):
language = Language.fromthesubdb(language_code)
subtitle = self.subtitle_class(language, hash)
subtitle = TheSubDBSubtitle(language, hash)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)

@ -47,8 +47,7 @@ class TVsubtitlesSubtitle(Subtitle):
matches = set()
# series
if video.series and (sanitize(self.series) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
if video.series and sanitize(self.series) == sanitize(video.series):
matches.add('series')
# season
if video.season and self.season == video.season:
@ -81,10 +80,6 @@ class TVsubtitlesProvider(Provider):
]}
video_types = (Episode,)
server_url = 'http://www.tvsubtitles.net/'
subtitle_class = TVsubtitlesSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
@ -163,7 +158,13 @@ class TVsubtitlesProvider(Provider):
return episode_ids
def query(self, show_id, series, season, episode, year=None):
def query(self, series, season, episode, year=None):
# search the show id
show_id = self.search_show_id(series, year)
if show_id is None:
logger.error('No show id found for %r (%r)', series, {'year': year})
return []
# get the episode ids
episode_ids = self.get_episode_ids(show_id, season)
if episode not in episode_ids:
@ -183,9 +184,9 @@ class TVsubtitlesProvider(Provider):
subtitle_id = int(row.parent['href'][10:-5])
page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
rip = row.find('p', title='rip').text.strip() or None
release = row.find('h5').text.strip() or None
release = row.find('p', title='release').text.strip() or None
subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip,
release)
logger.debug('Found subtitle %s', subtitle)
subtitles.append(subtitle)
@ -193,24 +194,7 @@ class TVsubtitlesProvider(Provider):
return subtitles
def list_subtitles(self, video, languages):
# lookup show_id
titles = [video.series] + video.alternative_series
show_id = None
for title in titles:
show_id = self.search_show_id(title, video.year)
if show_id is not None:
break
# query for subtitles with the show_id
if show_id is not None:
subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
if s.language in languages and s.episode == video.episode]
if subtitles:
return subtitles
else:
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
return []
return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages]
def download_subtitle(self, subtitle):
# download as a zip

@ -3,7 +3,7 @@ from datetime import datetime, timedelta
from functools import wraps
import logging
import re
import _strptime
import requests
from .. import __short_version__
@ -331,7 +331,6 @@ def refine(video, **kwargs):
# add series information
logger.debug('Found series %r', series)
video.series = matching_result['match']['series']
video.alternative_series.extend(series['aliases'])
video.year = matching_result['match']['year']
video.original_series = matching_result['match']['original_series']
video.series_tvdb_id = series['id']

@ -44,7 +44,7 @@ movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
#: Equivalent release groups
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'})
def get_equivalent_release_groups(release_group):

@ -208,13 +208,7 @@ def guess_matches(video, guess, partial=False):
if video.season and 'season' in guess and guess['season'] == video.season:
matches.add('season')
# episode
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
# Most providers only support single-ep, so make sure it contains only 1 episode
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
if video.episode and 'episode' in guess:
episode_guess = guess['episode']
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
if episode == video.episode:
if video.episode and 'episode' in guess and guess['episode'] == video.episode:
matches.add('episode')
# year
if video.year and 'year' in guess and guess['year'] == video.year:
@ -258,4 +252,4 @@ def fix_line_ending(content):
:rtype: bytes
"""
return content.replace(b'\r\n', b'\n')
return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

@ -13,9 +13,9 @@ VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm' '.ogv', '.omf',
'.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
'.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
'.vob', '.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
class Video(object):
@ -123,12 +123,11 @@ class Episode(Video):
:param int year: year of the series.
:param bool original_series: whether the series is the first with this name.
:param int tvdb_id: TVDB id of the episode.
:param list alternative_series: alternative names of the series
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
"""
def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
series_tvdb_id=None, series_imdb_id=None, **kwargs):
super(Episode, self).__init__(name, **kwargs)
#: Series of the episode
@ -158,9 +157,6 @@ class Episode(Video):
#: IMDb id of the series
self.series_imdb_id = series_imdb_id
#: Alternative names of the series
self.alternative_series = alternative_series or []
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'episode':
@ -169,13 +165,7 @@ class Episode(Video):
if 'title' not in guess or 'episode' not in guess:
raise ValueError('Insufficient data to process the guess')
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
# Most providers only support single-ep, so make sure it contains only 1 episode
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
episode_guess = guess.get('episode')
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'),
year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
@ -196,11 +186,10 @@ class Movie(Video):
:param str title: title of the movie.
:param int year: year of the movie.
:param list alternative_titles: alternative titles of the movie
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
"""
def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
def __init__(self, name, title, year=None, **kwargs):
super(Movie, self).__init__(name, **kwargs)
#: Title of the movie
@ -209,9 +198,6 @@ class Movie(Video):
#: Year of the movie
self.year = year
#: Alternative titles of the movie
self.alternative_titles = alternative_titles or []
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'movie':
@ -220,13 +206,9 @@ class Movie(Video):
if 'title' not in guess:
raise ValueError('Insufficient data to process the guess')
alternative_titles = []
if 'alternative_title' in guess:
alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
audio_codec=guess.get('audio_codec'), year=guess.get('year'))
@classmethod
def fromname(cls, name):

@ -10,7 +10,7 @@ import time
import operator
import itertools
from httplib import ResponseNotReady
from http.client import ResponseNotReady
import rarfile
import requests
@ -21,14 +21,13 @@ from babelfish import LanguageReverseError
from guessit.jsonutils import GuessitEncoder
from subliminal import ProviderError, refiner_manager
from extensions import provider_registry
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded
from subliminal_patch.extensions import provider_registry
from subliminal.score import compute_score as default_compute_score
from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
ThreadPoolExecutor, check_video
from subliminal_patch.exceptions import TooManyRequests, APIThrottled
from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ServiceUnavailable, DownloadLimitExceeded
from subzero.language import Language
from scandir import scandir, scandir_generic as _scandir_generic
@ -186,7 +185,7 @@ class SZProviderPool(ProviderPool):
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out', provider)
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
self.throttle_callback(provider, e)
return
@ -283,7 +282,7 @@ class SZProviderPool(ProviderPool):
logger.debug("RAR Traceback: %s", traceback.format_exc())
return False
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
self.throttle_callback(subtitle.provider_name, e)
self.discarded_providers.add(subtitle.provider_name)
return False
@ -648,7 +647,7 @@ def search_external_subtitles(path, languages=None, only_one=False):
abspath = unicode(os.path.abspath(
os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder,
video_filename])))
except Exception, e:
except Exception as e:
logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e)
continue
logger.debug("external subs: scanning path %s", abspath)

@ -9,3 +9,13 @@ class TooManyRequests(ProviderError):
class APIThrottled(ProviderError):
pass
class ServiceUnavailable(ProviderError):
"""Exception raised when status is '503 Service Unavailable'."""
pass
class DownloadLimitExceeded(ProviderError):
"""Exception raised by providers when download limit is exceeded."""
pass

@ -8,7 +8,7 @@ import os
import socket
import logging
import requests
import xmlrpclib
import xmlrpc.client
import dns.resolver
import ipaddress
import re
@ -16,7 +16,7 @@ import re
from requests import exceptions
from urllib3.util import connection
from retry.api import retry_call
from exceptions import APIThrottled
from .exceptions import APIThrottled
from dogpile.cache.api import NO_VALUE
from subliminal.cache import region
from subliminal_patch.pitcher import pitchers
@ -32,10 +32,8 @@ try:
except ImportError:
from urllib.parse import urlparse
from subzero.lib.io import get_viable_encoding
logger = logging.getLogger(__name__)
pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where()))
pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where()))
try:
default_ssl_context = ssl.create_default_context(cafile=pem_file)
except AttributeError:
@ -99,7 +97,7 @@ class CFSession(CloudScraper):
# Solve Challenge
resp = self.sendChallengeResponse(resp, **kwargs)
except ValueError, e:
except ValueError as e:
if e.message == "Captcha":
parsed_url = urlparse(url)
domain = parsed_url.netloc
@ -231,7 +229,7 @@ class RetryingCFSession(RetryingSession, CFSession):
pass
class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
class SubZeroRequestsTransport(xmlrpc.client.SafeTransport):
"""
Drop in Transport for xmlrpclib that uses Requests instead of httplib

@ -8,7 +8,7 @@ from subliminal.cache import region
from dogpile.cache.api import NO_VALUE
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
Proxy
from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT
from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TIMEOUT
logger = logging.getLogger(__name__)
@ -185,7 +185,7 @@ class DBCProxyLessPitcher(Pitcher):
password = None
def __init__(self, website_name, website_url, website_key,
timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs):
timeout=DEFAULT_TIMEOUT, tries=3, *args, **kwargs):
super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries)
self.username, self.password = self.client_key.split(":", 1)

@ -5,7 +5,7 @@ import datetime
from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
get_series_episode, region, tvdb_client
from util import fix_session_bases
from .util import fix_session_bases
TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()

@ -272,9 +272,9 @@ class Subtitle(Subtitle_):
def prepare_text(text, style):
body = []
for fragment, sty in parse_tags(text, style, sub.styles):
fragment = fragment.replace(ur"\h", u" ")
fragment = fragment.replace(ur"\n", u"\n")
fragment = fragment.replace(ur"\N", u"\n")
fragment = fragment.replace(r"\h", u" ")
fragment = fragment.replace(r"\n", u"\n")
fragment = fragment.replace(r"\N", u"\n")
if format == "srt":
if sty.italic:
fragment = u"<i>%s</i>" % fragment

@ -1,2 +1,8 @@
import dict, geezip, httpfake, io, json, rar, which
from .dict import *
from .geezip import *
from .httpfake import *
from .io import *
from .json import *
from .rar import *
from .which import *

@ -28,7 +28,7 @@ class GeezipFile(gzip.GzipFile):
fileobj.write(self.compress.flush(Z_FINISH))
gzip.write32u(fileobj, self.crc)
# self.size may exceed 2GB, or even 4GB
gzip.write32u(fileobj, self.size & 0xffffffffL)
gzip.write32u(fileobj, self.size & 0xffffffff)
fileobj.flush()
finally:
myfileobj = self.myfileobj

@ -1,5 +1,5 @@
# coding=utf-8
from registry import registry
from mods import hearing_impaired, ocr_fixes, fps, offset, common, color
from main import SubtitleModifications, SubMod
from .registry import registry
from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color
from .main import SubtitleModifications, SubMod

@ -1,3 +1,3 @@
# coding=utf-8
from data import data
from .data import data

File diff suppressed because one or more lines are too long

@ -6,14 +6,14 @@ import pysubs2
import logging
import time
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
from registry import registry
from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
from .registry import registry
from subzero.language import Language
logger = logging.getLogger(__name__)
lowercase_re = re.compile(ur'(?sux)[a-zà-ž]')
lowercase_re = re.compile(r'(?sux)[a-zà-ž]')
class SubtitleModifications(object):
@ -143,7 +143,7 @@ class SubtitleModifications(object):
continue
# clear empty args
final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))
final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.iteritems()))
_data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
if _data == mods_merged_log[identifier]["final_identifier"]:
@ -180,7 +180,7 @@ class SubtitleModifications(object):
entries_used = 0
for entry in self.f:
entry_used = False
for sub in entry.text.strip().split("\N"):
for sub in entry.text.strip().split(r"\N"):
# skip HI bracket entries, those might actually be lowercase
sub = sub.strip()
for processor in registry.mods["remove_HI"].processors[:4]:
@ -272,7 +272,7 @@ class SubtitleModifications(object):
continue
skip_entry = False
for line in t.split(ur"\N"):
for line in t.split(r"\N"):
# don't bother the mods with surrounding tags
old_line = line
line = line.strip()
@ -377,7 +377,7 @@ class SubtitleModifications(object):
logger.debug(u"%d: %r -> ''", index, entry.text)
continue
new_text = ur"\N".join(lines)
new_text = r"\N".join(lines)
# cheap man's approach to avoid open tags
add_start_tags = []

@ -95,7 +95,7 @@ class SubtitleTextModification(SubtitleModification):
pass
TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*"
TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"
EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")
empty_line_post_processors = [

@ -22,10 +22,10 @@ class CommonFixes(SubtitleTextModification):
processors = [
# normalize hyphens
NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
# -- = em dash
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1", name="CM_multidash"),
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1", name="CM_multidash"),
# line = _/-/\s
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
@ -37,23 +37,23 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
# fix music symbols
NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
lambda x: u"" if x.group(1) else u"",
name="CM_music_symbols"),
# '' = "
NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
# double quotes instead of single quotes inside words
NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"),
NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),
# normalize quotes
NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
name="CM_normalize_quotes"),
# normalize single quotes
NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
# remove leading ...
NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
@ -89,8 +89,8 @@ class CommonFixes(SubtitleTextModification):
# space before ending doublequote?
# replace uppercase I with lowercase L in words
NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
name="CM_uppercase_i_in_word"),
# fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
@ -101,11 +101,11 @@ class CommonFixes(SubtitleTextModification):
name="CM_spaces_in_numbers"),
# uppercase after dot
NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
# remove double interpunction
NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"),
@ -149,14 +149,14 @@ class ReverseRTL(SubtitleModification):
processors = [
# new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
#NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
#NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
# name="CM_RTL_reverse")
NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
name="CM_RTL_reverse")
]
split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)")
split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")
class FixUppercase(SubtitleModification):

@ -26,71 +26,71 @@ class HearingImpaired(SubtitleTextModification):
processors = [
# full bracket entry, single or multiline; starting with brackets and ending with brackets
FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
"", name="HI_brackets_full"),
# uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
# possibly with a dash in front; ignore anything ending with a quote
NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
name="HI_before_colon_caps"),
# any text before colon (at least 3 chars); at start or after a sentence,
# possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
# a space is inside the text; ignore anything ending with a quote
NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
lambda match:
match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
else "" if not match.group(1).startswith(" ") else " ",
name="HI_before_colon_noncaps"),
# brackets (only remove if at least 3 chars in brackets)
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
{"t": TAG}), "", name="HI_brackets"),
#NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
#NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
# "", name="HI_bracket_open_start"),
#NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
#NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
# name="HI_bracket_open_end"),
# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
# starting text before colon (at least 3 chars)
#NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
#NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
# name="HI_before_colon"),
# text in brackets at start, after optional dash, before colon or at end of line
# fixme: may be too aggressive
#NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
#NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
# name="HI_brackets_special"),
# all caps line (at least 4 consecutive uppercase chars)
NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
supported=lambda p: not p.only_uppercase),
# remove MAN:
NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
# dash in front
# NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
# all caps at start before new sentence
NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
]
post_processors = empty_line_post_processors
last_processors = [
# remove music symbols
NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
"", name="HI_music_symbols_only"),
# remove music entries
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
"", name="HI_music"),
]

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save