commit 645952c61a (pull/684/head)
parent 4e7e3a39d2
Author: Louis Vézina

@@ -1,5 +1,7 @@
 # coding=utf-8
+from __future__ import absolute_import
+from __future__ import print_function
 import subprocess as sp
 import time
 import os
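Both `__future__` imports, repeated across the files below, make Python 2 behave like Python 3 for the features this commit relies on; they are no-ops on Python 3 itself. A minimal sketch:

```python
from __future__ import absolute_import  # bare 'import time' resolves to the stdlib, not a sibling time.py
from __future__ import print_function   # print becomes a function on Python 2 as well

import time
print(time.strftime("%Y-%m-%d"))
```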
@@ -12,14 +14,16 @@ from bazarr.get_args import args
 def check_python_version():
     python_version = platform.python_version_tuple()
     minimum_python_version_tuple = (2, 7, 13)
+    minimum_python3_version_tuple = (3, 6, 0)
     minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
+    minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
-    if int(python_version[0]) > minimum_python_version_tuple[0]:
-        print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
+    if int(python_version[0]) == minimum_python3_version_tuple[0] and int(python_version[1]) < minimum_python3_version_tuple[1]:
+        print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)
-    elif int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]:
-        print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
+    elif int(python_version[0]) == minimum_python_version_tuple[0] and (int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]):
+        print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)
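`platform.python_version_tuple()` returns strings, which is why each component is cast to `int` before comparison. A minimal sketch of the same check (the version floor matches the commit's `(3, 6, 0)`; the helper name is hypothetical):

```python
import platform

def version_ok(minimum=(3, 6, 0)):
    # python_version_tuple() yields strings such as ('3', '6', '9');
    # strip a possible trailing '+' on the patch level before casting.
    current = tuple(int(part.rstrip('+')) for part in platform.python_version_tuple())
    return current >= minimum

print(version_ok())  # True on any interpreter >= 3.6.0
```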
@@ -32,10 +36,10 @@ def start_bazarr():
     script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
     ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
-    print "Bazarr starting..."
+    print("Bazarr starting...")
     try:
         for line in iter(ep.stdout.readline, ''):
-            sys.stdout.write(line)
+            sys.stdout.buffer.write(line)
     except KeyboardInterrupt:
         pass
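A `subprocess` pipe opened without `universal_newlines=True` yields `bytes` on Python 3, hence the switch to `sys.stdout.buffer.write`. One caveat the hunk leaves untouched: with a bytes stream the `iter()` sentinel would also need to be `b''`, since `readline()` returns `b''` at EOF and never equals `''`. A sketch with that adjustment:

```python
import subprocess as sp
import sys

ep = sp.Popen([sys.executable, "-c", "print('hello')"],
              stdout=sp.PIPE, stderr=sp.STDOUT)
for line in iter(ep.stdout.readline, b''):  # bytes sentinel to match the bytes pipe
    sys.stdout.buffer.write(line)           # raw bytes go to the buffer layer
ep.wait()
```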
@@ -60,16 +64,16 @@ if __name__ == '__main__':
             try:
                 os.remove(stopfile)
             except:
-                print 'Unable to delete stop file.'
+                print('Unable to delete stop file.')
             else:
-                print 'Bazarr exited.'
+                print('Bazarr exited.')
                 os._exit(0)

         if os.path.exists(restartfile):
             try:
                 os.remove(restartfile)
             except:
-                print 'Unable to delete restart file.'
+                print('Unable to delete restart file.')
             else:
                 start_bazarr()

@@ -1,6 +1,7 @@
 # coding=utf-8
-import cPickle as pickle
+from __future__ import absolute_import
+import six.moves.cPickle as pickle
 import base64
 import random
 import platform
@@ -30,7 +31,7 @@ def track_event(category=None, action=None, label=None):
         visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
     except:
         visitor = Visitor()
-        unique_id = long(random.getrandbits(32))
+        unique_id = int(random.getrandbits(32))
         visitor.unique_id = unique_id

     session = Session()
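Python 3 merged `long` into `int`, and `six.moves.cPickle` resolves to `cPickle` on Python 2 and plain `pickle` on Python 3, so both rewrites are behavior-preserving. A quick sketch:

```python
import random
import six.moves.cPickle as pickle

unique_id = int(random.getrandbits(32))      # int covers the old long range on both Pythons
blob = pickle.dumps({"visitor": unique_id})
print(pickle.loads(blob)["visitor"] == unique_id)  # True
```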

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import json

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 from simpleconfigparser import simpleconfigparser

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import os
 import atexit

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import enzyme
 import logging
 import os

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import argparse

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import pycountry

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging

@@ -13,6 +14,7 @@ from list_subtitles import store_subtitles_movie, list_missing_subtitles_movies,
 from get_subtitle import movies_download_subtitles
 from database import TableMovies, wal_cleaning
+import six


 def update_all_movies():
@@ -82,7 +84,7 @@ def update_movies():
             if movie["path"] != None and movie['movieFile']['relativePath'] != None:
                 try:
-                    overview = unicode(movie['overview'])
+                    overview = six.text_type(movie['overview'])
                 except:
                     overview = ""
                 try:
@@ -136,27 +138,27 @@ def update_movies():
                     audioCodec = None

                 # Add movies in radarr to current movies list
-                current_movies_radarr.append(unicode(movie['tmdbId']))
+                current_movies_radarr.append(six.text_type(movie['tmdbId']))

-                if unicode(movie['tmdbId']) in current_movies_db_list:
+                if six.text_type(movie['tmdbId']) in current_movies_db_list:
                     movies_to_update.append({'radarr_id': movie["id"],
-                                             'title': unicode(movie["title"]),
-                                             'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
-                                             'tmdb_id': unicode(movie["tmdbId"]),
-                                             'poster': unicode(poster),
-                                             'fanart': unicode(fanart),
-                                             'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
+                                             'title': six.text_type(movie["title"]),
+                                             'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
+                                             'tmdb_id': six.text_type(movie["tmdbId"]),
+                                             'poster': six.text_type(poster),
+                                             'fanart': six.text_type(fanart),
+                                             'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
                                              'scene_name': sceneName,
-                                             'monitored': unicode(bool(movie['monitored'])),
-                                             'year': unicode(movie['year']),
-                                             'sort_title': unicode(movie['sortTitle']),
-                                             'alternative_titles': unicode(alternativeTitles),
-                                             'format': unicode(format),
-                                             'resolution': unicode(resolution),
-                                             'video_codec': unicode(videoCodec),
-                                             'audio_codec': unicode(audioCodec),
-                                             'overview': unicode(overview),
-                                             'imdb_id': unicode(imdbId)})
+                                             'monitored': six.text_type(bool(movie['monitored'])),
+                                             'year': six.text_type(movie['year']),
+                                             'sort_title': six.text_type(movie['sortTitle']),
+                                             'alternative_titles': six.text_type(alternativeTitles),
+                                             'format': six.text_type(format),
+                                             'resolution': six.text_type(resolution),
+                                             'video_codec': six.text_type(videoCodec),
+                                             'audio_codec': six.text_type(audioCodec),
+                                             'overview': six.text_type(overview),
+                                             'imdb_id': six.text_type(imdbId)})
                 else:
                     if movie_default_enabled is True:
                         movies_to_add.append({'radarr_id': movie["id"],
@@ -171,7 +173,7 @@ def update_movies():
                                               'fanart': fanart,
                                               'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                               'scene_name': sceneName,
-                                              'monitored': unicode(bool(movie['monitored'])),
+                                              'monitored': six.text_type(bool(movie['monitored'])),
                                               'sort_title': movie['sortTitle'],
                                               'year': movie['year'],
                                               'alternative_titles': alternativeTitles,
@@ -191,7 +193,7 @@ def update_movies():
                                               'fanart': fanart,
                                               'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                               'scene_name': sceneName,
-                                              'monitored': unicode(bool(movie['monitored'])),
+                                              'monitored': six.text_type(bool(movie['monitored'])),
                                               'sort_title': movie['sortTitle'],
                                               'year': movie['year'],
                                               'alternative_titles': alternativeTitles,
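`six.text_type` is `unicode` on Python 2 and `str` on Python 3, so every `unicode(...)` coercion in these dicts keeps its meaning on both interpreters:

```python
import six

text = six.text_type(42)                # u'42' on Python 2, '42' on Python 3
print(isinstance(text, six.text_type))  # True on both
```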

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import datetime
 import logging
@@ -159,8 +160,8 @@ def provider_throttle(name, exception):
 def throttled_count(name):
     global throttle_count
-    if name in throttle_count.keys():
-        if 'count' in throttle_count[name].keys():
+    if name in list(throttle_count.keys()):
+        if 'count' in list(throttle_count[name].keys()):
            for key, value in throttle_count[name].items():
                if key == 'count':
                    value += 1
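On Python 3 `dict.keys()` returns a view, and wrapping it in `list()` is the rewrite automated porting tools typically emit. For a plain membership test like this one the wrapper is redundant (`name in throttle_count` works on both versions); it only matters when the dict is mutated while iterating:

```python
counts = {"opensubtitles": 1, "podnapisi": 2}

print("podnapisi" in counts)  # membership needs no list() on either Python

# list() is required when deleting during iteration; iterating the live
# view would raise RuntimeError on Python 3:
for name in list(counts.keys()):
    if counts[name] > 1:
        del counts[name]
print(counts)  # {'opensubtitles': 1}
```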

@@ -1,5 +1,7 @@
 # coding=utf-8
+from __future__ import absolute_import
+from __future__ import print_function
 import os
 import requests
 import logging

@@ -12,6 +14,7 @@ from config import settings, url_sonarr
 from list_subtitles import list_missing_subtitles
 from database import TableShows
 from utils import get_sonarr_version
+import six


 def update_series():
@@ -60,7 +63,7 @@ def update_series():
         for i, show in enumerate(r.json(), 1):
             notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
             try:
-                overview = unicode(show['overview'])
+                overview = six.text_type(show['overview'])
             except:
                 overview = ""
             try:
@@ -82,17 +85,17 @@ def update_series():
             current_shows_sonarr.append(show['tvdbId'])

             if show['tvdbId'] in current_shows_db_list:
-                series_to_update.append({'title': unicode(show["title"]),
-                                         'path': unicode(show["path"]),
+                series_to_update.append({'title': six.text_type(show["title"]),
+                                         'path': six.text_type(show["path"]),
                                          'tvdb_id': int(show["tvdbId"]),
                                          'sonarr_series_id': int(show["id"]),
-                                         'overview': unicode(overview),
-                                         'poster': unicode(poster),
-                                         'fanart': unicode(fanart),
-                                         'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
-                                         'sort_title': unicode(show['sortTitle']),
-                                         'year': unicode(show['year']),
-                                         'alternate_titles': unicode(alternateTitles)})
+                                         'overview': six.text_type(overview),
+                                         'poster': six.text_type(poster),
+                                         'fanart': six.text_type(fanart),
+                                         'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
+                                         'sort_title': six.text_type(show['sortTitle']),
+                                         'year': six.text_type(show['year']),
+                                         'alternate_titles': six.text_type(alternateTitles)})
             else:
                 if serie_default_enabled is True:
                     series_to_add.append({'title': show["title"],
@@ -161,9 +164,9 @@ def update_series():
     removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))

     for series in removed_series:
-        print TableShows.delete().where(
+        print(TableShows.delete().where(
             TableShows.tvdb_id == series
-        ).execute()
+        ).execute())

     logging.debug('BAZARR All series synced from Sonarr into database.')

@@ -1,12 +1,13 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import sys
 import ast
 import logging
 import subprocess
 import time
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import codecs
 import types
 import re

@@ -37,6 +38,9 @@ from database import TableShows, TableEpisodes, TableMovies, TableHistory, Table
 from peewee import fn, JOIN
 from analytics import track_event
+import six
+from six.moves import range
+from functools import reduce


 def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
@@ -91,11 +95,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
     """
     max_score = 120.0
     min_score = max_score * min_score_movie_perc / 100.0
-    scores = subliminal_scores.movie_scores.keys()
+    scores = list(subliminal_scores.movie_scores.keys())
     if media_type == "series":
         max_score = 360.0
         min_score = max_score * min_score_series_perc / 100.0
-        scores = subliminal_scores.episode_scores.keys()
+        scores = list(subliminal_scores.episode_scores.keys())
         if video.is_special:
             min_score = max_score * min_score_special_ep / 100.0
@@ -119,7 +123,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         hi = "force non-HI"
     language_set = set()

-    if not isinstance(language, types.ListType):
+    if not isinstance(language, list):
         language = [language]

     if forced == "True":
@@ -185,7 +189,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         saved_any = False
         if downloaded_subtitles:
-            for video, subtitles in downloaded_subtitles.iteritems():
+            for video, subtitles in six.iteritems(downloaded_subtitles):
                 if not subtitles:
                     continue
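`dict.iteritems()` no longer exists on Python 3; `six.iteritems(d)` dispatches to `iteritems()` on Python 2 and `items()` on Python 3 (the `types.ListType` check above similarly becomes plain `list`):

```python
import six

downloaded = {"episode.mkv": ["en.srt", "fr.srt"], "other.mkv": []}
for video, subtitles in six.iteritems(downloaded):
    if not subtitles:
        continue
    print(video, subtitles)
```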
@@ -221,10 +225,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
                     else:
                         action = "downloaded"
                     if video.used_scene_name:
-                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                             round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
                     else:
-                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                             round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."

                     if use_postprocessing is True:
@@ -444,7 +448,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
                 downloaded_path = saved_subtitle.storage_path
                 logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
                 is_forced_string = " forced" if subtitle.language.forced else ""
-                message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
+                message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
                     score) + "% using manual search."

                 if use_postprocessing is True:
@@ -749,7 +753,7 @@ def wanted_download_subtitles(path, l, count_episodes):
     for episode in episodes_details:
         attempt = episode.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(episode.missing_subtitles):
             if attempt is None:
@@ -762,7 +766,7 @@ def wanted_download_subtitles(path, l, count_episodes):
             TableEpisodes.update(
                 {
-                    TableEpisodes.failed_attempts: unicode(attempt)
+                    TableEpisodes.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id
@@ -818,7 +822,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
     for movie in movies_details:
         attempt = movie.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(movie.missing_subtitles):
             if attempt is None:
@@ -831,7 +835,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
             TableMovies.update(
                 {
-                    TableMovies.failed_attempts: unicode(attempt)
+                    TableMovies.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableMovies.radarr_id == movie.radarr_id
@@ -991,7 +995,7 @@ def refine_from_db(path, video):
             TableMovies.audio_codec,
             TableMovies.imdb_id
         ).where(
-            TableMovies.path == unicode(path_replace_reverse_movie(path))
+            TableMovies.path == six.text_type(path_replace_reverse_movie(path))
         ).first()

         if data:

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import ast
 import os
 import re
@@ -126,7 +127,7 @@ def force_unicode(s):
     :param s: string
     :return: unicode string
     """
-    if not isinstance(s, types.UnicodeType):
+    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
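`types.UnicodeType` is gone in Python 3, where `str` is already the unicode type, so only `bytes` still needs decoding. Note the new check is Python 3 oriented: on Python 2, byte strings are instances of `str` and would skip the decode. A Python 3 sketch of such a helper (the fallback charset is an assumption for illustration, not necessarily what this file does):

```python
def force_unicode(s):
    """Return s as text, decoding bytes as UTF-8 with a latin-1 fallback."""
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            s = s.decode("latin-1")  # assumed fallback for the sketch
    return s

print(force_unicode(b"caf\xc3\xa9"))  # café
```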

@@ -1,12 +1,13 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import time
 import rarfile

 from cork import Cork
-from ConfigParser2 import ConfigParser
+from backports import configparser2
 from config import settings
 from check_update import check_releases
 from get_args import args

@@ -66,7 +67,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')):
 config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini'))

-cfg = ConfigParser()
+cfg = configparser2.ConfigParser()


 def init_binaries():

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import sys

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import gc
 import os
 import babelfish

@@ -24,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
 from queueconfig import notifications
 from embedded_subs_reader import embedded_subs_reader
+import six

 gc.enable()
@@ -63,7 +65,7 @@ def store_subtitles(file):
             logging.exception("BAZARR unable to index external subtitles.")
             pass
         else:
-            for subtitle, language in subtitles.iteritems():
+            for subtitle, language in six.iteritems(subtitles):
                 subtitle_path = get_external_subtitles_path(file, subtitle)
                 if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
                     logging.debug("BAZARR external subtitles detected: " + "pb")
@@ -155,7 +157,7 @@ def store_subtitles_movie(file):
             logging.exception("BAZARR unable to index external subtitles.")
             pass
         else:
-            for subtitle, language in subtitles.iteritems():
+            for subtitle, language in six.iteritems(subtitles):
                 if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
                     logging.debug("BAZARR external subtitles detected: " + "pb")
                     actual_subtitles.append(

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import re

@@ -9,6 +10,7 @@ import platform
 from logging.handlers import TimedRotatingFileHandler
 from get_args import args
 from config import settings
+import six

 logger = logging.getLogger()
@@ -107,10 +109,10 @@ class MyFilter(logging.Filter):

 class ArgsFilteringFilter(logging.Filter):
     def filter_args(self, record, func):
-        if isinstance(record.args, (types.ListType, types.TupleType)):
+        if isinstance(record.args, (list, tuple)):
             final_args = []
             for arg in record.args:
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     final_args.append(arg)
                     continue
@@ -118,7 +120,7 @@ class ArgsFilteringFilter(logging.Filter):
             record.args = type(record.args)(final_args)
         elif isinstance(record.args, dict):
             for key, arg in record.args.items():
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     continue
                 record.args[key] = func(arg)
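`basestring` was removed in Python 3; `six.string_types` is `(str, unicode)` on Python 2 and `(str,)` on Python 3, which keeps these isinstance checks portable:

```python
import six

for value in ["text", u"caf\u00e9", 42]:
    print(value, isinstance(value, six.string_types))
# the two strings pass, 42 does not
```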

@@ -1,5 +1,8 @@
 # coding=utf-8
+import six
+from six.moves import zip
+from functools import reduce

 bazarr_version = '0.8.2'

 import gc

@@ -12,7 +15,7 @@ import pretty
 import math
 import ast
 import hashlib
-import urllib
+import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
 import warnings
 import queueconfig
 import platform
@@ -1575,12 +1578,12 @@ def save_settings():
     settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
     settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')

-    before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
-              unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
-              unicode(settings.general.getboolean('use_radarr')), unicode(settings.general.path_mappings_movie))
-    after = (unicode(settings_general_ip), int(settings_general_port), unicode(settings_general_baseurl),
-             unicode(settings_general_pathmapping), unicode(settings_general_use_sonarr),
-             unicode(settings_general_use_radarr), unicode(settings_general_pathmapping_movie))
+    before = (six.text_type(settings.general.ip), int(settings.general.port), six.text_type(settings.general.base_url),
+              six.text_type(settings.general.path_mappings), six.text_type(settings.general.getboolean('use_sonarr')),
+              six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
+    after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
+             six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
+             six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))

     settings.general.ip = text_type(settings_general_ip)
     settings.general.port = text_type(settings_general_port)
@@ -1645,7 +1648,7 @@ def save_settings():
     settings_proxy_password = request.forms.get('settings_proxy_password')
     settings_proxy_exclude = request.forms.get('settings_proxy_exclude')

-    before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
+    before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
     if before_proxy_password[0] != settings_proxy_type:
         configured()
     if before_proxy_password[1] == settings_proxy_password:
@@ -2029,7 +2032,7 @@ def remove_subtitles():
         history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles(unicode(episodePath))
+    store_subtitles(six.text_type(episodePath))
     list_missing_subtitles(sonarrSeriesId)
@@ -2048,7 +2051,7 @@ def remove_subtitles_movie():
         history_log_movie(0, radarrId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles_movie(unicode(moviePath))
+    store_subtitles_movie(six.text_type(moviePath))
     list_missing_subtitles_movies(radarrId)
@@ -2082,7 +2085,7 @@ def get_subtitle():
             score = result[4]
             history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:
@@ -2140,7 +2143,7 @@ def manual_get_subtitle():
             score = result[4]
             history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:
@@ -2184,7 +2187,7 @@ def perform_manual_upload_subtitle():
             score = 360
             history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
@@ -2221,7 +2224,7 @@ def get_subtitle_movie():
             score = result[4]
             history_log_movie(1, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:
@@ -2277,7 +2280,7 @@ def manual_get_subtitle_movie():
             score = result[4]
             history_log_movie(2, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:
@@ -2320,7 +2323,7 @@ def perform_manual_upload_subtitle_movie():
             score = 120
             history_log_movie(4, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
@@ -2421,7 +2424,7 @@ def api_history():
 @route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_url(protocol, url):
-    url = urllib.unquote(url)
+    url = six.moves.urllib.parse.unquote(url)
     try:
         result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
     except:
@@ -2433,7 +2436,7 @@ def test_url(protocol, url):
 @route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_notification(protocol, provider):
-    provider = urllib.unquote(provider)
+    provider = six.moves.urllib.parse.unquote(provider)
     apobj = apprise.Apprise()
     apobj.add(protocol + "://" + provider)
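Python 3 split the old `urllib` module into `urllib.request`, `urllib.parse`, and `urllib.error`; `six.moves.urllib` papers over the difference, so `urllib.unquote` becomes `six.moves.urllib.parse.unquote` (the sample value below is illustrative only):

```python
from six.moves.urllib.parse import unquote

print(unquote("192.168.0.10%3A8989"))  # 192.168.0.10:8989
```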

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import apprise
 import os
 import logging

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from collections import deque
 import json

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 from get_episodes import sync_episodes, update_all_episodes
 from get_movies import update_movies, update_all_movies
 from get_series import update_series

@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import time
 import platform

@@ -1,797 +0,0 @@
"""Configuration file parser.
A setup file consists of sections, lead by a "[section]" header,
and followed by "name: value" entries, with continuations and such in
the style of RFC 822.
The option values can contain format strings which refer to other values in
the same section, or values in a special [DEFAULT] section.
For example:
something: %(dir)s/whatever
would resolve the "%(dir)s" to the value of dir. All reference
expansions are done late, on demand.
Intrinsic defaults can be specified by passing them into the
ConfigParser constructor as a dictionary.
class:
ConfigParser -- responsible for parsing a list of
configuration files, and managing the parsed database.
methods:
__init__(defaults=None)
create the parser and specify a dictionary of intrinsic defaults. The
keys must be strings, the values must be appropriate for %()s string
interpolation. Note that `__name__' is always an intrinsic default;
its value is the section's name.
sections()
return all the configuration section names, sans DEFAULT
has_section(section)
return whether the given section exists
has_option(section, option)
return whether the given option exists in the given section
options(section)
return list of configuration options for the named section
read(filenames)
read and parse the list of named configuration files, given by
name. A single filename is also allowed. Non-existing files
are ignored. Return list of successfully read files.
readfp(fp, filename=None)
read and parse one configuration file, given as a file object.
The filename defaults to fp.name; it is only used in error
messages (if fp has no `name' attribute, the string `<???>' is used).
get(section, option, raw=False, vars=None)
return a string value for the named option. All % interpolations are
expanded in the return values, based on the defaults passed into the
constructor and the DEFAULT section. Additional substitutions may be
provided using the `vars' argument, which must be a dictionary whose
contents override any pre-existing defaults.
getint(section, options)
like get(), but convert value to an integer
getfloat(section, options)
like get(), but convert value to a float
getboolean(section, options)
like get(), but convert value to a boolean (currently case
insensitively defined as 0, false, no, off for False, and 1, true,
yes, on for True). Returns False or True.
items(section, raw=False, vars=None)
return a list of tuples with (name, value) for each option
in the section.
remove_section(section)
remove the given file section and all its options
remove_option(section, option)
remove the given option from the given section
set(section, option, value)
set the given option
write(fp)
write the configuration state in .ini format
"""
try:
from collections import OrderedDict as _default_dict
except ImportError:
# fallback for setup.py which hasn't yet built _collections
_default_dict = dict
import re
__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError",
"InterpolationError", "InterpolationDepthError",
"InterpolationSyntaxError", "ParsingError",
"MissingSectionHeaderError",
"ConfigParser", "SafeConfigParser", "RawConfigParser",
"DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"]
DEFAULTSECT = "DEFAULT"
MAX_INTERPOLATION_DEPTH = 10
# exception classes
class Error(Exception):
"""Base class for ConfigParser exceptions."""
def _get_message(self):
"""Getter for 'message'; needed only to override deprecation in
BaseException."""
return self.__message
def _set_message(self, value):
"""Setter for 'message'; needed only to override deprecation in
BaseException."""
self.__message = value
# BaseException.message has been deprecated since Python 2.6. To prevent
# DeprecationWarning from popping up over this pre-existing attribute, use
# a new property that takes lookup precedence.
message = property(_get_message, _set_message)
def __init__(self, msg=''):
self.message = msg
Exception.__init__(self, msg)
def __repr__(self):
return self.message
__str__ = __repr__
class NoSectionError(Error):
"""Raised when no section matches a requested option."""
def __init__(self, section):
Error.__init__(self, 'No section: %r' % (section,))
self.section = section
self.args = (section, )
class DuplicateSectionError(Error):
"""Raised when a section is multiply-created."""
def __init__(self, section):
Error.__init__(self, "Section %r already exists" % section)
self.section = section
self.args = (section, )
class NoOptionError(Error):
"""A requested option was not found."""
def __init__(self, option, section):
Error.__init__(self, "No option %r in section: %r" %
(option, section))
self.option = option
self.section = section
self.args = (option, section)
class InterpolationError(Error):
"""Base class for interpolation-related exceptions."""
def __init__(self, option, section, msg):
Error.__init__(self, msg)
self.option = option
self.section = section
self.args = (option, section, msg)
class InterpolationMissingOptionError(InterpolationError):
"""A string substitution required a setting which was not available."""
def __init__(self, option, section, rawval, reference):
msg = ("Bad value substitution:\n"
"\tsection: [%s]\n"
"\toption : %s\n"
"\tkey : %s\n"
"\trawval : %s\n"
% (section, option, reference, rawval))
InterpolationError.__init__(self, option, section, msg)
self.reference = reference
self.args = (option, section, rawval, reference)
class InterpolationSyntaxError(InterpolationError):
"""Raised when the source text into which substitutions are made
does not conform to the required syntax."""
class InterpolationDepthError(InterpolationError):
"""Raised when substitutions are nested too deeply."""
def __init__(self, option, section, rawval):
msg = ("Value interpolation too deeply recursive:\n"
"\tsection: [%s]\n"
"\toption : %s\n"
"\trawval : %s\n"
% (section, option, rawval))
InterpolationError.__init__(self, option, section, msg)
self.args = (option, section, rawval)
class ParsingError(Error):
"""Raised when a configuration file does not follow legal syntax."""
def __init__(self, filename):
Error.__init__(self, 'File contains parsing errors: %s' % filename)
self.filename = filename
self.errors = []
self.args = (filename, )
def append(self, lineno, line):
self.errors.append((lineno, line))
self.message += '\n\t[line %2d]: %s' % (lineno, line)
class MissingSectionHeaderError(ParsingError):
"""Raised when a key-value pair is found before any section header."""
def __init__(self, filename, lineno, line):
Error.__init__(
self,
'File contains no section headers.\nfile: %s, line: %d\n%r' %
(filename, lineno, line))
self.filename = filename
self.lineno = lineno
self.line = line
self.args = (filename, lineno, line)
class RawConfigParser:
def __init__(self, defaults=None, dict_type=_default_dict,
allow_no_value=False):
self._dict = dict_type
self._sections = self._dict()
self._defaults = self._dict()
if allow_no_value:
self._optcre = self.OPTCRE_NV
else:
self._optcre = self.OPTCRE
if defaults:
for key, value in defaults.items():
self._defaults[self.optionxform(key)] = value
self.comment_store = None ## used for storing comments in ini
def defaults(self):
return self._defaults
def sections(self):
"""Return a list of section names, excluding [DEFAULT]"""
# self._sections will never have [DEFAULT] in it
return self._sections.keys()
def add_section(self, section):
"""Create a new section in the configuration.
Raise DuplicateSectionError if a section by the specified name
already exists. Raise ValueError if name is DEFAULT or any of it's
case-insensitive variants.
"""
if section.lower() == "default":
raise ValueError, 'Invalid section name: %s' % section
if section in self._sections:
raise DuplicateSectionError(section)
self._sections[section] = self._dict()
def has_section(self, section):
"""Indicate whether the named section is present in the configuration.
The DEFAULT section is not acknowledged.
"""
return section in self._sections
def options(self, section):
"""Return a list of option names for the given section name."""
try:
opts = self._sections[section].copy()
except KeyError:
raise NoSectionError(section)
opts.update(self._defaults)
if '__name__' in opts:
del opts['__name__']
return opts.keys()
def read(self, filenames):
"""Read and parse a filename or a list of filenames.
Files that cannot be opened are silently ignored; this is
designed so that you can specify a list of potential
configuration file locations (e.g. current directory, user's
home directory, systemwide directory), and all existing
configuration files in the list will be read. A single
filename may also be given.
Return list of successfully read files.
"""
if isinstance(filenames, basestring):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
fp = open(filename)
except IOError:
continue
self._read(fp, filename)
fp.close()
read_ok.append(filename)
return read_ok
def readfp(self, fp, filename=None):
"""Like read() but the argument must be a file-like object.
The `fp' argument must have a `readline' method. Optional
second argument is the `filename', which if not given, is
taken from fp.name. If fp has no `name' attribute, `<???>' is
used.
"""
if filename is None:
try:
filename = fp.name
except AttributeError:
filename = '<???>'
self._read(fp, filename)
def get(self, section, option):
opt = self.optionxform(option)
if section not in self._sections:
if section != DEFAULTSECT:
raise NoSectionError(section)
if opt in self._defaults:
return self._defaults[opt]
else:
raise NoOptionError(option, section)
elif opt in self._sections[section]:
return self._sections[section][opt]
elif opt in self._defaults:
return self._defaults[opt]
else:
raise NoOptionError(option, section)
def items(self, section):
try:
d2 = self._sections[section]
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
d2 = self._dict()
d = self._defaults.copy()
d.update(d2)
if "__name__" in d:
del d["__name__"]
return d.items()
def _get(self, section, conv, option):
return conv(self.get(section, option))
def getint(self, section, option):
return self._get(section, int, option)
def getfloat(self, section, option):
return self._get(section, float, option)
_boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
'0': False, 'no': False, 'false': False, 'off': False}
def getboolean(self, section, option):
v = self.get(section, option)
if v.lower() not in self._boolean_states:
raise ValueError, 'Not a boolean: %s' % v
return self._boolean_states[v.lower()]
def optionxform(self, optionstr):
return optionstr.lower()
def has_option(self, section, option):
"""Check for the existence of a given option in a given section."""
if not section or section == DEFAULTSECT:
option = self.optionxform(option)
return option in self._defaults
elif section not in self._sections:
return False
else:
option = self.optionxform(option)
return (option in self._sections[section]
or option in self._defaults)
def set(self, section, option, value=None):
"""Set an option."""
if not section or section == DEFAULTSECT:
sectdict = self._defaults
else:
try:
sectdict = self._sections[section]
except KeyError:
raise NoSectionError(section)
sectdict[self.optionxform(option)] = value
def write(self, fp):
"""Write an .ini-format representation of the configuration state."""
if self._defaults:
fp.write("[%s]\n" % DEFAULTSECT)
for (key, value) in self._defaults.items():
fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
fp.write("\n")
for section in self._sections:
fp.write("[%s]\n" % section)
for (key, value) in self._sections[section].items():
if key == "__name__":
continue
if (value is not None) or (self._optcre == self.OPTCRE):
key = " = ".join((key, str(value).replace('\n', '\n\t')))
fp.write("%s\n" % (key))
fp.write("\n")
def remove_option(self, section, option):
"""Remove an option."""
if not section or section == DEFAULTSECT:
sectdict = self._defaults
else:
try:
sectdict = self._sections[section]
except KeyError:
raise NoSectionError(section)
option = self.optionxform(option)
existed = option in sectdict
if existed:
del sectdict[option]
return existed
def remove_section(self, section):
"""Remove a file section."""
existed = section in self._sections
if existed:
del self._sections[section]
return existed
#
# Regular expressions for parsing section headers and options.
#
SECTCRE = re.compile(
r'\[' # [
r'(?P<header>[^]]+)' # very permissive!
r'\]' # ]
)
OPTCRE = re.compile(
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
r'\s*(?P<vi>[:=])\s*' # any number of space/tab,
# followed by separator
# (either : or =), followed
# by any # space/tab
r'(?P<value>.*)$' # everything up to eol
)
OPTCRE_NV = re.compile(
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
r'\s*(?:' # any number of space/tab,
r'(?P<vi>[:=])\s*' # optionally followed by
# separator (either : or
# =), followed by any #
# space/tab
r'(?P<value>.*))?$' # everything up to eol
)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
comment_store = {}
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' :
continue
### store comments for doc purposes
### Deal with cases of sections and options being there or not
if line[0] in '#;' and cursect is not None:
if optname is None:
comment_store.setdefault(cursect['__name__'] +
"::" + "global",[]).append(line)
else:
comment_store.setdefault(cursect['__name__'] +
"::" + optname,[]).append(line)
continue
elif line[0] in '#;' and cursect is None:
comment_store.setdefault("global" +
"::" + optname,[]).append(line)
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname].append(value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
elif sectname == DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
raise MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self._optcre.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
optname = self.optionxform(optname.rstrip())
# This check is fine because the OPTCRE cannot
# match if it would set optval to None
if optval is not None:
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos-1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
cursect[optname] = [optval]
else:
# valueless option handling
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
e = ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
# join the multi-line values collected while reading
all_sections = [self._defaults]
all_sections.extend(self._sections.values())
for options in all_sections:
for name, val in options.items():
if isinstance(val, list):
options[name] = '\n'.join(val)
self.comment_store = comment_store
def ini_as_rst(self):
"""trivial helper function to putput comment_stroe as rest
.. todo:: write actual doctests with string input
>> p = ConfigParser2.SafeConfigParser()
>> p.read(f)
['/usr/home/pbrian/src/public/configparser2/example.ini']
>> open("/tmp/foo.rst", "w").write(p.ini_as_rst())
"""
outstr = ".. rst version of ini file\n\n"
_cursectname = None
for item in sorted(self.comment_store.keys()):
_sect, _opt = item.split("::")
if _sect != _cursectname:
outstr += "\n%s\n%s\n" % (_sect, "-"* len(_sect))
_cursectname = _sect
txt = " ".join(self.comment_store[item])
txt = txt.replace("#", "").replace(";","")
outstr += ":%s: %s" % (_opt, txt)
return outstr
import UserDict as _UserDict
class _Chainmap(_UserDict.DictMixin):
"""Combine multiple mappings for successive lookups.
For example, to emulate Python's normal lookup sequence:
import __builtin__
pylookup = _Chainmap(locals(), globals(), vars(__builtin__))
"""
def __init__(self, *maps):
self._maps = maps
def __getitem__(self, key):
for mapping in self._maps:
try:
return mapping[key]
except KeyError:
pass
raise KeyError(key)
def keys(self):
result = []
seen = set()
for mapping in self._maps:
for key in mapping:
if key not in seen:
result.append(key)
seen.add(key)
return result
class ConfigParser(RawConfigParser):
def get(self, section, option, raw=False, vars=None):
"""Get an option value for a given section.
If `vars' is provided, it must be a dictionary. The option is looked up
in `vars' (if provided), `section', and in `defaults' in that order.
All % interpolations are expanded in the return values, unless the
optional argument `raw' is true. Values for interpolation keys are
looked up in the same manner as the option.
The section DEFAULT is special.
"""
sectiondict = {}
try:
sectiondict = self._sections[section]
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
# Update with the entry specific variables
vardict = {}
if vars:
for key, value in vars.items():
vardict[self.optionxform(key)] = value
d = _Chainmap(vardict, sectiondict, self._defaults)
option = self.optionxform(option)
try:
value = d[option]
except KeyError:
raise NoOptionError(option, section)
if raw or value is None:
return value
else:
return self._interpolate(section, option, value, d)
def items(self, section, raw=False, vars=None):
"""Return a list of tuples with (name, value) for each option
in the section.
All % interpolations are expanded in the return values, based on the
defaults passed into the constructor, unless the optional argument
`raw' is true. Additional substitutions may be provided using the
`vars' argument, which must be a dictionary whose contents overrides
any pre-existing defaults.
The section DEFAULT is special.
"""
d = self._defaults.copy()
try:
d.update(self._sections[section])
except KeyError:
if section != DEFAULTSECT:
raise NoSectionError(section)
# Update with the entry specific variables
if vars:
for key, value in vars.items():
d[self.optionxform(key)] = value
options = d.keys()
if "__name__" in options:
options.remove("__name__")
if raw:
return [(option, d[option])
for option in options]
else:
return [(option, self._interpolate(section, option, d[option], d))
for option in options]
def _interpolate(self, section, option, rawval, vars):
# do the string interpolation
value = rawval
depth = MAX_INTERPOLATION_DEPTH
while depth: # Loop through this until it's done
depth -= 1
if value and "%(" in value:
value = self._KEYCRE.sub(self._interpolation_replace, value)
try:
value = value % vars
except KeyError, e:
raise InterpolationMissingOptionError(
option, section, rawval, e.args[0])
else:
break
if value and "%(" in value:
raise InterpolationDepthError(option, section, rawval)
return value
_KEYCRE = re.compile(r"%\(([^)]*)\)s|.")
def _interpolation_replace(self, match):
s = match.group(1)
if s is None:
return match.group()
else:
return "%%(%s)s" % self.optionxform(s)
class SafeConfigParser(ConfigParser):
def _interpolate(self, section, option, rawval, vars):
# do the string interpolation
L = []
self._interpolate_some(option, L, rawval, section, vars, 1)
return ''.join(L)
_interpvar_re = re.compile(r"%\(([^)]+)\)s")
def _interpolate_some(self, option, accum, rest, section, map, depth):
if depth > MAX_INTERPOLATION_DEPTH:
raise InterpolationDepthError(option, section, rest)
while rest:
p = rest.find("%")
if p < 0:
accum.append(rest)
return
if p > 0:
accum.append(rest[:p])
rest = rest[p:]
# p is no longer used
c = rest[1:2]
if c == "%":
accum.append("%")
rest = rest[2:]
elif c == "(":
m = self._interpvar_re.match(rest)
if m is None:
raise InterpolationSyntaxError(option, section,
"bad interpolation variable reference %r" % rest)
var = self.optionxform(m.group(1))
rest = rest[m.end():]
try:
v = map[var]
except KeyError:
raise InterpolationMissingOptionError(
option, section, rest, var)
if "%" in v:
self._interpolate_some(option, accum, v,
section, map, depth + 1)
else:
accum.append(v)
else:
raise InterpolationSyntaxError(
option, section,
"'%%' must be followed by '%%' or '(', found: %r" % (rest,))
def set(self, section, option, value=None):
"""Set an option. Extend ConfigParser.set: check for string values."""
# The only legal non-string value if we allow valueless
# options is None, so we need to check if the value is a
# string if:
# - we do not allow valueless options, or
# - we allow valueless options but the value is not None
if self._optcre is self.OPTCRE or value:
if not isinstance(value, basestring):
raise TypeError("option values must be strings")
if value is not None:
# check for bad percent signs:
# first, replace all "good" interpolations
tmp_value = value.replace('%%', '')
tmp_value = self._interpvar_re.sub('', tmp_value)
# then, check if there's a lone percent sign left
if '%' in tmp_value:
raise ValueError("invalid interpolation syntax in %r at "
"position %d" % (value, tmp_value.find('%')))
ConfigParser.set(self, section, option, value)

@@ -1,43 +0,0 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

@ -1,27 +0,0 @@
Beautiful Soup is made available under the MIT license:
Copyright (c) 2004-2015 Leonard Richardson
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license. Copyright (c) 2006-2013
James Graham and other contributors

File diff suppressed because it is too large

@ -1,63 +0,0 @@
= Introduction =
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
<body>
<p>
Some
<b>
bad
<i>
HTML
</i>
</b>
</p>
</body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8">
<tag1>
Some
<tag2 />
bad
<tag3>
XML
</tag3>
</tag1>
= Full documentation =
The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.
= Running the unit tests =
Beautiful Soup supports unit test discovery from the project root directory:
$ nosetests
$ python -m unittest discover -s bs4 # Python 2.7 and up
If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.
= Links =
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
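Editor's note: since this commit moves the bundled bs4 to Python 3, the deleted README's first example would now read roughly as below (a sketch; the parser is named explicitly here to avoid the no-parser warning, and html.parser adds no <html>/<body> wrapper).

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Some<b>bad<i>HTML", "html.parser")
print(soup.prettify())        # print() is now a function
print(soup.find(text="bad"))  # 'bad' (a str, no longer u'bad')
print(soup.i)                 # <i>HTML</i>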

@ -1,31 +0,0 @@
Additions
---------
More of the jQuery API: nextUntil?
Optimizations
-------------
The html5lib tree builder doesn't use the standard tree-building API,
which worries me and has resulted in a number of bugs.
markup_attr_map can be optimized since it's always a map now.
Upon encountering UTF-16LE data or some other uncommon serialization
of Unicode, UnicodeDammit will convert the data to Unicode, then
encode it as UTF-8. This is wasteful because it will just get decoded
back to Unicode.
CDATA
-----
The elementtree XMLParser has a strip_cdata argument that, when set to
False, should allow Beautiful Soup to preserve CDATA sections instead
of treating them as text. Except it doesn't. (This argument is also
present for HTMLParser, and also does nothing there.)
Currently, html5lib converts CDATA sections into comments. An
as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.

@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.0" __version__ = "4.8.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
import os import os
import re import re
import sys
import traceback import traceback
import warnings import warnings
@ -50,7 +49,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is # The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it. # running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag): class BeautifulSoup(Tag):
""" """
@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then like HTML's <br> tag), call handle_starttag and then
handle_endtag. handle_endtag.
""" """
ROOT_TAG_NAME = u'[document]' ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they # If the end-user gives no indication which tree builder they
# want, look for one with these features. # want, look for one with these features.
@ -82,16 +81,56 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None, def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None, parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs): **kwargs):
"""The Soup object is initialized as the 'root tag', and the """Constructor.
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.""" :param markup: A string or a file-like object representing
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A TreeBuilder subclass to instantiate (or
instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4; they will result in a warning and then be ignored.
Apart from this, any keyword arguments passed into the BeautifulSoup
constructor are propagated to the TreeBuilder constructor. This
makes it possible to configure a TreeBuilder beyond saying
which one to use.
"""
if 'convertEntities' in kwargs: if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn( warnings.warn(
"BS4 does not respect the convertEntities argument to the " "BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted " "BeautifulSoup constructor. Entities are always converted "
@ -142,18 +181,22 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument( from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding") "fromEncoding", "from_encoding")
if from_encoding and isinstance(markup, unicode): if from_encoding and isinstance(markup, str):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None from_encoding = None
if len(kwargs) > 0: # We need this information to track whether or not the builder
arg = kwargs.keys().pop() # was specified well enough that we can omit the 'you need to
raise TypeError( # specify a parser' warning.
"__init__() got an unexpected keyword argument '%s'" % arg) original_builder = builder
original_features = features
if builder is None:
original_features = features if isinstance(builder, type):
if isinstance(features, basestring): # A builder class was passed in; it needs to be instantiated.
builder_class = builder
builder = None
elif builder is None:
if isinstance(features, str):
features = [features] features = [features]
if features is None or len(features) == 0: if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES features = self.DEFAULT_BUILDER_FEATURES
@ -163,41 +206,73 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you " "Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?" "requested: %s. Do you need to install a parser library?"
% ",".join(features)) % ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or # At this point either we have a TreeBuilder instance in
original_features in builder.ALTERNATE_NAMES): # builder, or we have a builder_class that we can instantiate
# with the remaining **kwargs.
if builder is None:
builder = builder_class(**kwargs)
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
):
if builder.is_xml: if builder.is_xml:
markup_type = "XML" markup_type = "XML"
else: else:
markup_type = "HTML" markup_type = "HTML"
caller = traceback.extract_stack()[0] # This code adapted from warnings.py so that we get the same line
filename = caller[0] # of code as our warnings.warn() call gets, even if the answer is wrong
line_number = caller[1] # (as it may be in a multithreading situation).
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( caller = None
filename=filename, try:
line_number=line_number, caller = sys._getframe(1)
parser=builder.NAME, except ValueError:
markup_type=markup_type)) pass
if caller:
globals = caller.f_globals
line_number = caller.f_lineno
else:
globals = sys.__dict__
line_number = 1
filename = globals.get('__file__')
if filename:
fnl = filename.lower()
if fnl.endswith((".pyc", ".pyo")):
filename = filename[:-1]
if filename:
# If there is no filename at all, the user is most likely in a REPL,
# and the warning is not necessary.
values = dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
self.builder = builder self.builder = builder
self.is_xml = builder.is_xml self.is_xml = builder.is_xml
self.known_xml = self.is_xml self.known_xml = self.is_xml
self.builder.soup = self self._namespaces = dict()
self.parse_only = parse_only self.parse_only = parse_only
self.builder.initialize_soup(self)
if hasattr(markup, 'read'): # It's a file-type object. if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read() markup = markup.read()
elif len(markup) <= 256 and ( elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup) (isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, unicode) and not u'<' in markup) or (isinstance(markup, str) and not '<' in markup)
): ):
# Print out warnings for a couple beginner problems # Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup. # involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup, # Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants. # just in case that's what the user really wants.
if (isinstance(markup, unicode) if (isinstance(markup, str)
and not os.path.supports_unicode_filenames): and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8") possible_filename = markup.encode("utf8")
else: else:
@ -205,13 +280,13 @@ class BeautifulSoup(Tag):
is_file = False is_file = False
try: try:
is_file = os.path.exists(possible_filename) is_file = os.path.exists(possible_filename)
except Exception, e: except Exception as e:
# This is almost certainly a problem involving # This is almost certainly a problem involving
# characters not valid in filenames on this # characters not valid in filenames on this
# system. Just let it go. # system. Just let it go.
pass pass
if is_file: if is_file:
if isinstance(markup, unicode): if isinstance(markup, str):
markup = markup.encode("utf8") markup = markup.encode("utf8")
warnings.warn( warnings.warn(
'"%s" looks like a filename, not markup. You should' '"%s" looks like a filename, not markup. You should'
@ -263,9 +338,9 @@ class BeautifulSoup(Tag):
if isinstance(markup, bytes): if isinstance(markup, bytes):
space = b' ' space = b' '
cant_start_with = (b"http:", b"https:") cant_start_with = (b"http:", b"https:")
elif isinstance(markup, unicode): elif isinstance(markup, str):
space = u' ' space = ' '
cant_start_with = (u"http:", u"https:") cant_start_with = ("http:", "https:")
else: else:
return return
@ -302,9 +377,10 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = [] self.preserve_whitespace_tag_stack = []
self.pushTag(self) self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs): def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
"""Create a new tag associated with this soup.""" """Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs) kwattrs.update(attrs)
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
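Editor's note: the new attrs parameter exists because some attribute names cannot be spelled as Python keyword arguments; a quick sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup("", "html.parser")
# "data-id" (hyphen) and "class" (a Python keyword) can't go through
# **kwattrs, so they go through attrs=; kwattrs.update(attrs) merges both.
tag = soup.new_tag("a", href="http://example.com",
                   attrs={"data-id": "42", "class": "external"})
print(tag)  # <a class="external" data-id="42" href="http://example.com"></a>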
def new_string(self, s, subclass=NavigableString): def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup.""" """Create a new NavigableString associated with this soup."""
@ -327,7 +403,7 @@ class BeautifulSoup(Tag):
def pushTag(self, tag): def pushTag(self, tag):
#print "Push", tag.name #print "Push", tag.name
if self.currentTag: if self.currentTag is not None:
self.currentTag.contents.append(tag) self.currentTag.contents.append(tag)
self.tagStack.append(tag) self.tagStack.append(tag)
self.currentTag = self.tagStack[-1] self.currentTag = self.tagStack[-1]
@ -336,7 +412,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString): def endData(self, containerClass=NavigableString):
if self.current_data: if self.current_data:
current_data = u''.join(self.current_data) current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains # If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space # nothing but ASCII spaces, replace it with a single space
# or newline. # or newline.
@ -366,60 +442,71 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None): def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree.""" """Add an object to the parse tree."""
parent = parent or self.currentTag if parent is None:
previous_element = most_recent_element or self._most_recent_element parent = self.currentTag
if most_recent_element is not None:
previous_element = most_recent_element
else:
previous_element = self._most_recent_element
next_element = previous_sibling = next_sibling = None next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag): if isinstance(o, Tag):
next_element = o.next_element next_element = o.next_element
next_sibling = o.next_sibling next_sibling = o.next_sibling
previous_sibling = o.previous_sibling previous_sibling = o.previous_sibling
if not previous_element: if previous_element is None:
previous_element = o.previous_element previous_element = o.previous_element
fix = parent.next_element is not None
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o self._most_recent_element = o
parent.contents.append(o) parent.contents.append(o)
if parent.next_sibling: # Check if we are inserting into an already parsed node.
# This node is being inserted into an element that has if fix:
# already been parsed. Deal with any dangling references. self._linkage_fixer(parent)
index = len(parent.contents)-1
while index >= 0: def _linkage_fixer(self, el):
if parent.contents[index] is o: """Make sure linkage of this fragment is sound."""
break
index -= 1 first = el.contents[0]
else: child = el.contents[-1]
raise ValueError( descendant = child
"Error building tree: supposedly %r was inserted "
"into %r after the fact, but I don't see it!" % ( if child is first and el.parent is not None:
o, parent # Parent should be linked to first child
) el.next_element = child
) # We are no longer linked to whatever this element is
if index == 0: prev_el = child.previous_element
previous_element = parent if prev_el is not None and prev_el is not el:
previous_sibling = None prev_el.next_element = None
else: # First child should be linked to the parent, and no previous siblings.
previous_element = previous_sibling = parent.contents[index-1] child.previous_element = el
if index == len(parent.contents)-1: child.previous_sibling = None
next_element = parent.next_sibling
next_sibling = None # We have no sibling as we've been appended as the last.
else: child.next_sibling = None
next_element = next_sibling = parent.contents[index+1]
# This index is a tag, dig deeper for a "last descendant"
o.previous_element = previous_element if isinstance(child, Tag) and child.contents:
if previous_element: descendant = child._last_descendant(False)
previous_element.next_element = o
o.next_element = next_element # As the final step, link last descendant. It should be linked
if next_element: # to the parent's next sibling (if found), else walk up the chain
next_element.previous_element = o # and find a parent with a sibling. It should have no next sibling.
o.next_sibling = next_sibling descendant.next_element = None
if next_sibling: descendant.next_sibling = None
next_sibling.previous_sibling = o target = el
o.previous_sibling = previous_sibling while True:
if previous_sibling: if target is None:
previous_sibling.next_sibling = o break
elif target.next_sibling is not None:
descendant.next_element = target.next_sibling
target.next_sibling.previous_element = child
break
target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True): def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent """Pops the tag stack up to and including the most recent
@ -465,7 +552,7 @@ class BeautifulSoup(Tag):
self.currentTag, self._most_recent_element) self.currentTag, self._most_recent_element)
if tag is None: if tag is None:
return tag return tag
if self._most_recent_element: if self._most_recent_element is not None:
self._most_recent_element.next_element = tag self._most_recent_element.next_element = tag
self._most_recent_element = tag self._most_recent_element = tag
self.pushTag(tag) self.pushTag(tag)
@ -490,9 +577,9 @@ class BeautifulSoup(Tag):
encoding_part = '' encoding_part = ''
if eventual_encoding != None: if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else: else:
prefix = u'' prefix = ''
if not pretty_print: if not pretty_print:
indent_level = None indent_level = None
else: else:
@ -526,4 +613,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
soup = BeautifulSoup(sys.stdin) soup = BeautifulSoup(sys.stdin)
print soup.prettify() print(soup.prettify())

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
from collections import defaultdict from collections import defaultdict
import itertools import itertools
@ -7,8 +7,7 @@ import sys
from bs4.element import ( from bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
ContentMetaAttributeValue, ContentMetaAttributeValue,
HTMLAwareEntitySubstitution, nonwhitespace_re
whitespace_re
) )
__all__ = [ __all__ = [
@ -90,18 +89,46 @@ class TreeBuilder(object):
is_xml = False is_xml = False
picklable = False picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents. # tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or # A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA. # comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {} DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
def __init__(self):
USE_DEFAULT = object()
def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this to a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
:param preserve_whitespace_tags: A set of tags whose string contents
should be left alone rather than whitespace-collapsed; see
DEFAULT_PRESERVE_WHITESPACE_TAGS.
"""
self.soup = None self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
"""
self.soup = soup
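Editor's note: combined with the kwargs passthrough added in bs4/__init__.py above, this makes multi-valued attribute handling configurable straight from the BeautifulSoup constructor; a sketch:

from bs4 import BeautifulSoup

markup = '<a class="foo bar">x</a>'

# Default: "class" is a CDATA-list attribute and parses into a list.
print(BeautifulSoup(markup, "html.parser").a["class"])  # ['foo', 'bar']

# multi_valued_attributes=None reaches the TreeBuilder via the
# constructor's **kwargs and disables the splitting entirely.
print(BeautifulSoup(markup, "html.parser",
                    multi_valued_attributes=None).a["class"])  # 'foo bar'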
def reset(self): def reset(self):
pass pass
@ -125,7 +152,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None: if self.empty_element_tags is None:
return True return True
return tag_name in self.empty_element_tags return tag_name in self.empty_element_tags
def feed(self, markup): def feed(self, markup):
raise NotImplementedError() raise NotImplementedError()
@ -160,14 +187,14 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', []) universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get( tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None) tag_name.lower(), None)
for attr in attrs.keys(): for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific): if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string # We have a "class"-type attribute whose string
# value is a whitespace-separated list of # value is a whitespace-separated list of
# values. Split it into a list. # values. Split it into a list.
value = attrs[attr] value = attrs[attr]
if isinstance(value, basestring): if isinstance(value, str):
values = whitespace_re.split(value) values = nonwhitespace_re.findall(value)
else: else:
# html5lib sometimes calls setAttributes twice # html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse # for the same tag when rearranging the parse
@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags. Such as which tags are empty-element tags.
""" """
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([ empty_element_tags = set([
# These are from HTML5. # These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
# These are from HTML4, removed in HTML5. # These are from earlier versions of HTML and are removed in HTML5.
'spacer', 'frame' 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
]) ])
# The HTML standard defines these as block-level elements. Beautiful
# Soup does not treat these elements differently from other elements,
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
# The HTML standard defines these attributes as containing a # The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is, # space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values, # class="foo bar" means that the 'class' attribute has two values,
@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into # encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be # a list of values if possible. Upon output, the list will be
# converted back into a string. # converted back into a string.
cdata_list_attributes = { DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'], "*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'], "a" : ['rel', 'rev'],
"link" : ['rel', 'rev'], "link" : ['rel', 'rev'],
@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"], "output" : ["for"],
} }
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
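Editor's note: the practical effect of DEFAULT_PRESERVE_WHITESPACE_TAGS is that whitespace-only strings are normally collapsed to a single space, except inside these tags. Sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>   </p><pre>   </pre>", "html.parser")
print(repr(soup.p.string))    # ' '   -- collapsed by endData()
print(repr(soup.pre.string))  # '   ' -- preserved inside <pre>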
def set_up_substitutions(self, tag): def set_up_substitutions(self, tag):
# We are only interested in <meta> tags # We are only interested in <meta> tags
if tag.name != 'meta': if tag.name != 'meta':

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'HTML5TreeBuilder', 'HTML5TreeBuilder',
@ -15,7 +15,7 @@ from bs4.builder import (
) )
from bs4.element import ( from bs4.element import (
NamespacedAttribute, NamespacedAttribute,
whitespace_re, nonwhitespace_re,
) )
import html5lib import html5lib
from html5lib.constants import ( from html5lib.constants import (
@ -33,7 +33,7 @@ try:
# Pre-0.99999999 # Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False new_html5lib = False
except ImportError, e: except ImportError as e:
# 0.99999999 and up # 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True new_html5lib = True
@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder) parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict() extra_kwargs = dict()
if not isinstance(markup, unicode): if not isinstance(markup, str):
if new_html5lib: if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding extra_kwargs['override_encoding'] = self.user_specified_encoding
else: else:
@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs) doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer. # Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode): if isinstance(markup, str):
# We need to special-case this because html5lib sets # We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input. # charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None doc.original_encoding = None
else: else:
original_encoding = parser.tokenizer.stream.charEncoding[0] original_encoding = parser.tokenizer.stream.charEncoding[0]
if not isinstance(original_encoding, basestring): if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib # In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility # Encoding object. We want to use a string for compatibility
# with other tree builders. # with other tree builders.
@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
rv.append("|%s<%s>" % (' ' * indent, name)) rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs: if element.attrs:
attributes = [] attributes = []
for name, value in element.attrs.items(): for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute): if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name) name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list): if isinstance(value, list):
@ -199,14 +199,14 @@ class AttrList(object):
def __setitem__(self, name, value): def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element, # If this attribute is a multi-valued attribute for this element,
# turn its value into a list. # turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes list_attr = self.element.cdata_list_attributes
if (name in list_attr['*'] if (name in list_attr['*']
or (self.element.name in list_attr or (self.element.name in list_attr
and name in list_attr[self.element.name])): and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone # A node that is being cloned may have already undergone
# this procedure. # this procedure.
if not isinstance(value, list): if not isinstance(value, list):
value = whitespace_re.split(value) value = nonwhitespace_re.findall(value)
self.element[name] = value self.element[name] = value
def items(self): def items(self):
return list(self.attrs.items()) return list(self.attrs.items())
@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node): def appendChild(self, node):
string_child = child = None string_child = child = None
if isinstance(node, basestring): if isinstance(node, str):
# Some other piece of code decided to pass in a string # Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the # instead of creating a TextElement object to contain the
# string. # string.
@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
child = node.element child = node.element
node.parent = self node.parent = self
if not isinstance(child, basestring) and child.parent is not None: if not isinstance(child, str) and child.parent is not None:
node.element.extract() node.element.extract()
if (string_child and self.element.contents if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString): and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string. # We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like # TODO This has O(n^2) performance, for input like
@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element) old_element.replace_with(new_element)
self.soup._most_recent_element = new_element self.soup._most_recent_element = new_element
else: else:
if isinstance(node, basestring): if isinstance(node, str):
# Create a brand new NavigableString from this string. # Create a brand new NavigableString from this string.
child = self.soup.new_string(node) child = self.soup.new_string(node)
@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values( self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes) self.name, attributes)
for name, value in attributes.items(): for name, value in list(attributes.items()):
self.element[name] = value self.element[name] = value
# The attributes may contain variables that need substitution. # The attributes may contain variables that need substitution.
@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling # Set the first child's previous_element and previous_sibling
# to elements within the new parent # to elements within the new parent
first_child = to_append[0] first_child = to_append[0]
if new_parents_last_descendant: if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant first_child.previous_element = new_parents_last_descendant
else: else:
first_child.previous_element = new_parent_element first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant: if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child new_parents_last_descendant.next_element = first_child
else: else:
new_parent_element.next_element = first_child new_parent_element.next_element = first_child
if new_parents_last_child: if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the # Find the very last element being moved. It is now the
@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element: if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure # TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's # how to get html5lib to go through this path, but it's
# just the other side of the previous line. # just the other side of the previous line.

@ -1,17 +1,18 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad.""" """Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',
] ]
from HTMLParser import HTMLParser from html.parser import HTMLParser
try: try:
from HTMLParser import HTMLParseError from html.parser import HTMLParseError
except ImportError, e: except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be # HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder. # thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception): class HTMLParseError(Exception):
@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and # order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up. # will ignore, assuming they ever show up.
self.already_closed_empty_element = [] self.already_closed_empty_element = []
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although this
requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() as raising an exception.
In any event, this method is called only on very strange markup and our best strategy
is to pretend it didn't happen and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs): def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like # This is only called when the markup looks like
# <tag/>. # <tag/>.
@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else: else:
real_name = int(name) real_name = int(name)
try: data = None
data = unichr(real_name) if real_name < 256:
except (ValueError, OverflowError), e: # HTML numeric entities are supposed to reference Unicode
data = u"\N{REPLACEMENT CHARACTER}" # code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError as e:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data) self.handle_data(data)
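Editor's note: the compensation described in the comments above, in action (a sketch): &#147; names a Windows-1252 code point rather than a Unicode one, and now decodes to a curly quote instead of a C1 control character.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>&#147;quoted&#148;</p>", "html.parser")
print(soup.p.string)  # “quoted” -- U+201C/U+201D via the windows-1252 fallback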
def handle_entityref(self, name): def handle_entityref(self, name):
@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None: if character is not None:
data = character data = character
else: else:
data = "&%s;" % name # If this were XML, it would be ambiguous whether "&foo"
# was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data) self.handle_data(data)
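Editor's note: and the ampersand change just above, sketched: an unknown entity reference with no semicolon now round-trips as literal text.

from bs4 import BeautifulSoup

# "&T" is not a known HTML entity, so it stays literal instead of
# gaining a semicolon ("AT&T;") as the old code would have produced.
print(BeautifulSoup("<p>AT&T</p>", "html.parser").p.string)  # AT&T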
def handle_comment(self, data): def handle_comment(self, data):
@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER NAME = HTMLPARSER
features = [NAME, HTML, STRICT] features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs): def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False parser_kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs) self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None): document_declared_encoding=None, exclude_encodings=None):
@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER). replaced with REPLACEMENT CHARACTER).
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
yield (markup, None, None, False) yield (markup, None, None, False)
return return
@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup parser.soup = self.soup
try: try:
parser.feed(markup) parser.feed(markup)
except HTMLParseError, e: parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning( warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e raise e

@ -1,13 +1,18 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'LXMLTreeBuilderForXML', 'LXMLTreeBuilderForXML',
'LXMLTreeBuilder', 'LXMLTreeBuilder',
] ]
try:
from collections.abc import Callable # Python 3.6
except ImportError as e:
from collections import Callable
from io import BytesIO from io import BytesIO
from StringIO import StringIO from io import StringIO
import collections
from lxml import etree from lxml import etree
from bs4.element import ( from bs4.element import (
Comment, Comment,
@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml' LXML = 'lxml'
def _invert(d):
"Invert a dictionary."
return dict((v,k) for k, v in list(d.items()))
class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser DEFAULT_PARSER_CLASS = etree.XMLParser
@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace # This namespace mapping is specified in the XML Namespace
# standard. # standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
def _register_namespaces(self, mapping):
"""Let the BeautifulSoup object know about namespaces encountered
while parsing the document.
This might be useful later on when creating CSS selectors.
"""
for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
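Editor's note: a sketch of where the registered prefixes end up; with lxml installed, the mapping feeds the soupsieve-backed select() (bs4 4.7+), assuming select() falls back to the parser-registered namespaces when none are passed.

from bs4 import BeautifulSoup

xml = '<root xmlns:ns="http://example.com/ns"><ns:item>x</ns:item></root>'
soup = BeautifulSoup(xml, "xml")   # requires lxml
print(soup._namespaces.get("ns"))  # http://example.com/ns
print(soup.select("ns|item"))      # [<ns:item>x</ns:item>]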
def default_parser(self, encoding): def default_parser(self, encoding):
# This can either return a parser object or a class, which # This can either return a parser object or a class, which
@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser. # Use the default parser.
parser = self.default_parser(encoding) parser = self.default_parser(encoding)
if isinstance(parser, collections.Callable): if isinstance(parser, Callable):
# Instantiate the parser with default arguments # Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding) parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser return parser
def __init__(self, parser=None, empty_element_tags=None): def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a # TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new # callable, since that means there's no way to create new
# parsers for different encodings. # parsers for different encodings.
@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None: if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags) self.empty_element_tags = set(empty_element_tags)
self.soup = None self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py. # name. Copied from lxml's src/lxml/sax.py.
@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else: else:
self.processing_instruction_class = XMLProcessingInstruction self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, unicode): if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on # We were given Unicode. Maybe lxml can parse Unicode on
# this system? # this system?
yield markup, None, document_declared_encoding, False yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode): if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and # No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8. # tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8", yield (markup.encode("utf8"), "utf8",
@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup): def feed(self, markup):
if isinstance(markup, bytes): if isinstance(markup, bytes):
markup = BytesIO(markup) markup = BytesIO(markup)
elif isinstance(markup, unicode): elif isinstance(markup, str):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0: if len(data) != 0:
self.parser.feed(data) self.parser.feed(data)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def close(self): def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}): def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs) attrs = dict(attrs)
nsprefix = None nsprefix = None
# Invert each namespace map as it comes in. # Invert each namespace map as it comes in.
if len(self.nsmaps) > 1: if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but # There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a # non-default namespaces are in play, so we need a
# separate tag stack to know when they end. # separate tag stack to know when they end.
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap) # First, Let the BeautifulSoup object know about it.
self._register_namespaces(nsmap)
# Then, add it to our running list of inverted namespace
# mappings.
self.nsmaps.append(_invert(nsmap))
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later. # tag, so we can recreate it later.
attrs = attrs.copy() attrs = attrs.copy()
for prefix, namespace in nsmap.items(): for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute( attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and # from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects. # turn then into NamespacedAttribute objects.
new_attrs = {} new_attrs = {}
for attr, value in attrs.items(): for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr) namespace, attr = self._getNsTag(attr)
if namespace is None: if namespace is None:
new_attrs[attr] = value new_attrs[attr] = value
@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding) self.parser = self.parser_for(encoding)
self.parser.feed(markup) self.parser.feed(markup)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment return '<html><body>%s</body></html>' % fragment

@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job. XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import codecs import codecs
from htmlentitydefs import codepoint2name from html.entities import codepoint2name
import re import re
import logging import logging
import string import string
@ -46,9 +45,9 @@ except ImportError:
pass pass
xml_encoding_re = re.compile( xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile( html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object): class EntitySubstitution(object):
@ -58,15 +57,24 @@ class EntitySubstitution(object):
lookup = {} lookup = {}
reverse_lookup = {} reverse_lookup = {}
characters_for_re = [] characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint) # &apos is an XHTML and an HTML 5 entity, but not an HTML 4
if codepoint != 34: # entity. We don't want to use it, but we want to recognize it on the way in.
#
# TODO: Ideally we would be able to recognize all HTML 5 named
# entities, but that's a little tricky.
extra = [(39, 'apos')]
for codepoint, name in list(codepoint2name.items()) + extra:
character = chr(codepoint)
if codepoint not in (34, 39):
# There's no point in turning the quotation mark into # There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which # &quot; or the single quote into &apos;, unless it
# is handled elsewhere. # happens within an attribute value, which is handled
# elsewhere.
characters_for_re.append(character) characters_for_re.append(character)
lookup[character] = name lookup[character] = name
# But we do want to turn &quot; into the quotation mark. # But we do want to recognize those entities on the way in and
# convert them to Unicode characters.
reverse_lookup[name] = character reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re) re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition) return lookup, reverse_lookup, re.compile(re_definition)
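Editor's note: the upshot for output, sketched: quotation marks are left alone by substitute_html(), while other named entities still apply.

from bs4.dammit import EntitySubstitution

# Neither '"' (34) nor "'" (39) is escaped here; '&' still becomes &amp;.
print(EntitySubstitution.substitute_html('He said "hi" & \'bye\''))
# He said "hi" &amp; 'bye'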
@ -82,7 +90,7 @@ class EntitySubstitution(object):
} }
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")") ")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])") AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@ -274,7 +282,7 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data): def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies.""" """If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None encoding = None
if isinstance(data, unicode): if isinstance(data, str):
# Unicode data cannot have a byte-order mark. # Unicode data cannot have a byte-order mark.
return data, encoding return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@ -352,9 +360,9 @@ class UnicodeDammit:
markup, override_encodings, is_html, exclude_encodings) markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with. # Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '': if isinstance(markup, str) or markup == '':
self.markup = markup self.markup = markup
self.unicode_markup = unicode(markup) self.unicode_markup = str(markup)
self.original_encoding = None self.original_encoding = None
return return
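A hedged usage sketch of the short-circuit above: str input skips detection entirely, while bytes go through the usual guessing machinery:

from bs4.dammit import UnicodeDammit

dammit = UnicodeDammit('Sacr\xe9 bleu!')
print(dammit.original_encoding)  # None -- already Unicode, nothing to detect

dammit = UnicodeDammit('Sacr\xe9 bleu!'.encode('utf-8'))
print(dammit.unicode_markup)     # 'Sacré bleu!'
print(dammit.original_encoding)  # typically 'utf-8'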
@@ -438,7 +446,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"): def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode. '''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases''' %encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors) return str(data, encoding, errors)
@property @property
def declared_html_encoding(self): def declared_html_encoding(self):
@@ -1,12 +1,11 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import cProfile import cProfile
from StringIO import StringIO from io import StringIO
from HTMLParser import HTMLParser from html.parser import HTMLParser
import bs4 import bs4
from bs4 import BeautifulSoup, __version__ from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry from bs4.builder import builder_registry
@@ -22,8 +21,8 @@ import cProfile
def diagnose(data): def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__ print("Diagnostic running on Beautiful Soup %s" % __version__)
print "Python version %s" % sys.version print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@@ -32,16 +31,16 @@ def diagnose(data):
break break
else: else:
basic_parsers.remove(name) basic_parsers.remove(name)
print ( print((
"I noticed that %s is not installed. Installing it may help." % "I noticed that %s is not installed. Installing it may help." %
name) name))
if 'lxml' in basic_parsers: if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"]) basic_parsers.append("lxml-xml")
try: try:
from lxml import etree from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
except ImportError, e: except ImportError as e:
print ( print (
"lxml is not installed or couldn't be imported.") "lxml is not installed or couldn't be imported.")
@@ -49,37 +48,43 @@ def diagnose(data):
if 'html5lib' in basic_parsers: if 'html5lib' in basic_parsers:
try: try:
import html5lib import html5lib
print "Found html5lib version %s" % html5lib.__version__ print("Found html5lib version %s" % html5lib.__version__)
except ImportError, e: except ImportError as e:
print ( print (
"html5lib is not installed or couldn't be imported.") "html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'): if hasattr(data, 'read'):
data = data.read() data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return return
print else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()
for parser in basic_parsers: for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser print("Trying to parse your markup with %s" % parser)
success = False success = False
try: try:
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, features=parser)
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "Here's what %s did with the markup:" % parser print("Here's what %s did with the markup:" % parser)
print soup.prettify() print(soup.prettify())
print "-" * 80 print("-" * 80)
def lxml_trace(data, html=True, **kwargs): def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing. """Print out the lxml events that occur during parsing.
@@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text)) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Announces HTMLParser parse events, without doing anything else."""
@@ -149,7 +154,7 @@ def rword(length=5):
def rsentence(length=4): def rsentence(length=4):
"Generate a random sentence-like string." "Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length)) return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
def rdoc(num_elements=1000): def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document.""" """Randomly generate an invalid HTML document."""
@@ -171,9 +176,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements) data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data) print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
@@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
b = time.time() b = time.time()
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree from lxml import etree
a = time.time() a = time.time()
etree.HTML(data) etree.HTML(data)
b = time.time() b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a) print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib import html5lib
parser = html5lib.HTMLParser() parser = html5lib.HTMLParser()
a = time.time() a = time.time()
parser.parse(data) parser.parse(data)
b = time.time() b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a) print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"): def profile(num_elements=100000, parser="lxml"):
File diff suppressed because it is too large
@@ -1,7 +1,7 @@
# encoding: utf-8
"""Helper classes for tests.""" """Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import pickle import pickle
@@ -16,29 +16,66 @@ from bs4.element import (
ContentMetaAttributeValue, ContentMetaAttributeValue,
Doctype, Doctype,
SoupStrainer, SoupStrainer,
Tag
) )
from bs4.builder import HTMLParserTreeBuilder from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
class SoupTest(unittest.TestCase): class SoupTest(unittest.TestCase):
@property @property
def default_builder(self): def default_builder(self):
return default_builder() return default_builder
def soup(self, markup, **kwargs): def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup.""" """Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder) builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs) return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup): def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document. """Turn an HTML fragment into a document.
The details depend on the builder. The details depend on the builder.
""" """
return self.default_builder.test_fragment_to_document(markup) return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None): def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder builder = self.default_builder
@@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase):
self.assertEqual(earlier, e.previous_element) self.assertEqual(earlier, e.previous_element)
earlier = e earlier = e
def linkage_validator(self, el, _recursive_call=False):
"""Ensure proper linkage throughout the document."""
descendant = None
# Document element should have no previous element or previous sibling.
# It also shouldn't have a next sibling.
if el.parent is None:
assert el.previous_element is None,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_element, None
)
assert el.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_sibling, None
)
assert el.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_sibling, None
)
idx = 0
child = None
last_child = None
last_idx = len(el.contents) - 1
for child in el.contents:
descendant = None
# Parent should link next element to their first child
# That child should have no previous sibling
if idx == 0:
if el.parent is not None:
assert el.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_element, child
)
assert child.previous_element is el,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
child, child.previous_element, el
)
assert child.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
child, child.previous_sibling, None
)
# If not the first child, previous index should link as sibling to this index
# Previous element should match the last index or the last bubbled up descendant
else:
assert child.previous_sibling is el.contents[idx - 1],\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
child, child.previous_sibling, el.contents[idx - 1]
)
assert el.contents[idx - 1].next_sibling is child,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
)
if last_child is not None:
assert child.previous_element is last_child,\
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
child, child.previous_element, last_child, child.parent.contents
)
assert last_child.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
last_child, last_child.next_element, child
)
if isinstance(child, Tag) and child.contents:
descendant = self.linkage_validator(child, True)
# A bubbled up descendant should have no next siblings
assert descendant.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
descendant, descendant.next_sibling, None
)
# Mark last child as either the bubbled up descendant or the current child
if descendant is not None:
last_child = descendant
else:
last_child = child
# If last child, there are no next siblings
if idx == last_idx:
assert child.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_sibling, None
)
idx += 1
child = descendant if descendant is not None else child
if child is None:
child = el
if not _recursive_call and child is not None:
target = el
while True:
if target is None:
assert child.next_element is None, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, None
)
break
elif target.next_sibling is not None:
assert child.next_element is target.next_sibling, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, target.next_sibling
)
break
target = target.parent
# We are done, so nothing to return
return None
else:
# Return the child to the recursive caller
return child
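A hedged sketch of how a test might drive the validator above (the test_worst_case methods added further down do exactly this): called on the top-level soup, it walks every child, asserts that the next/previous element and sibling pointers agree, and returns None at the top level:

class LinkageSmokeTest(SoupTest):
    def test_linkage(self):
        soup = self.soup('<div><p>one</p><p>two</p></div>')
        # Raises AssertionError with a diagnostic message on broken linkage.
        self.assertIsNone(self.linkage_validator(soup))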
class HTMLTreeBuilderSmokeTest(object): class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence. """A basic test of a treebuilder's competence.
@@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("") soup = self.soup("")
new_tag = soup.new_tag(name) new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element) self.assertEqual(True, new_tag.is_empty_element)
def test_pickle_and_unpickle_identity(self): def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical # Pickling a tree, then unpickling it, yields a tree identical
# to the original. # to the original.
@@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""), soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b"")) markup.replace(b"\n", b""))
def test_namespaced_html(self):
"""When a namespaced XML document is parsed as HTML it should
be treated as HTML with weird tag names.
"""
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
soup = self.soup(markup)
self.assertEqual(2, len(soup.find_all("ns1:foo")))
def test_processing_instruction(self): def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that # We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class # process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no # even when the markup is already Unicode and there is no
# need to process anything. # need to process anything.
markup = u"""<?PITarget PIContent?>""" markup = """<?PITarget PIContent?>"""
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(markup, soup.decode()) self.assertEqual(markup, soup.decode())
@@ -292,6 +452,18 @@ Hello, world!
"<tbody><tr><td>Bar</td></tr></tbody>" "<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>") "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_multivalued_attribute_with_whitespace(self):
# Whitespace separating the values of a multi-valued attribute
# should be ignored.
markup = '<div class=" foo bar "></a>'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.div['class'])
# If you search by the literal name of the class it's like the whitespace
# wasn't there.
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
def test_deeply_nested_multivalued_attribute(self): def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times # html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with # as it rearranges the tree. This has caused problems with
@@ -311,15 +483,41 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self): def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_strings_resembling_character_entity_references(self):
# "&T" and "&p" look like incomplete character entities, but they are
# not.
self.assertSoupEquals(
"<p>&bull; AT&T is in the s&p 500</p>",
"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
def test_apos_entity(self):
self.assertSoupEquals(
"<p>Bob&apos;s Bar</p>",
"<p>Bob's Bar</p>",
)
def test_entities_in_foreign_document_encoding(self):
# &#147; and &#148; are invalid numeric entities referencing
# Windows-1252 characters. &#45; references a character common
# to Windows-1252 and Unicode, and &#9731; references a
# character only found in Unicode.
#
# All of these entities should be converted to Unicode
# characters.
markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
soup = self.soup(markup)
self.assertEqual("“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self): def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -330,11 +528,11 @@ Hello, world!
'<p>I said "good day!"</p>') '<p>I said "good day!"</p>')
def test_out_of_range_entity(self): def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}" expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self): def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder." "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -408,9 +606,9 @@ Hello, world!
# A seemingly innocuous document... but it's in Unicode! And # A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the # it contains characters that can't be represented in the
# encoding found in the declaration! The horror! # encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self): def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers.""" """Parsers should be able to work with SoupStrainers."""
@@ -450,7 +648,7 @@ Hello, world!
# Both XML and HTML entities are converted to Unicode characters # Both XML and HTML entities are converted to Unicode characters
# during parsing. # during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected) self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self): def test_smart_quotes_converted_on_the_way_in(self):
@@ -460,15 +658,15 @@ Hello, world!
soup = self.soup(quote) soup = self.soup(quote)
self.assertEqual( self.assertEqual(
soup.p.string, soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self): def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>") soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self): def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8") expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text) soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected) self.assertEqual(soup.p.encode("utf-8"), expected)
@@ -477,7 +675,7 @@ Hello, world!
# easy-to-understand document. # easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use # That's because we're going to encode it into ISO-Latin-1, and use
# that to test. # that to test.
@@ -586,6 +784,13 @@ Hello, world!
data.a['foo'] = 'bar' data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode()) self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(object): class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self): def test_pickle_and_unpickle_identity(self):
@@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual( self.assertEqual(
soup.encode("utf-8"), markup) soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
soup = self.soup(doc)
self.assertEqual(doc, soup.encode())
def test_formatter_processes_script_tag_for_xml_documents(self): def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """ doc = """
<script type="text/javascript"> <script type="text/javascript">
@@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded) self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self): def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self): def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual( self.assertEqual(
unicode(soup.rss), markup) str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self): def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>") soup = self.soup("<root/>")
@@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self): def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup) self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self): def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self): def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>' markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
def test_find_by_prefixed_name(self): def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?> doc = """<?xml version="1.0" encoding="utf-8"?>
@@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object):
# The two tags have the same namespace prefix. # The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix) self.assertEqual(tag.prefix, duplicate.prefix)
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5.""" """Smoke test for a tree builder that supports HTML5."""
@@ -5,7 +5,7 @@ import warnings
try: try:
from bs4.builder import HTML5TreeBuilder from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True HTML5LIB_PRESENT = True
except ImportError, e: except ImportError as e:
HTML5LIB_PRESENT = False HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer from bs4.element import SoupStrainer
from bs4.testing import ( from bs4.testing import (
@@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
@property @property
def default_builder(self): def default_builder(self):
return HTML5TreeBuilder() return HTML5TreeBuilder
def test_soupstrainer(self): def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers. # The html5lib tree builder does not support SoupStrainers.
@@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_reparented_markup(self): def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p'))) self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self): def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p'))) self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self): def test_reparented_markup_containing_identical_whitespace_nodes(self):
@@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_foster_parenting(self): def test_foster_parenting(self):
markup = b"""<table><td></tbody>A""" markup = b"""<table><td></tbody>A"""
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
def test_extraction(self):
"""
Test that extraction does not destroy the tree.
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
"""
markup = """
<html><head></head>
<style>
</style><script></script><body><p>hello</p></body></html>
"""
soup = self.soup(markup)
[s.extract() for s in soup('script')]
[s.extract() for s in soup('style')]
self.assertEqual(len(soup.find_all("p")), 1)
def test_empty_comment(self):
"""
Test that empty comment does not break structure.
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
"""
markup = """
<html>
<body>
<form>
<!----><input type="text">
</form>
</body>
</html>
"""
soup = self.soup(markup)
inputs = []
for form in soup.find_all('form'):
inputs.extend(form.find_all('input'))
self.assertEqual(len(inputs), 1)
@@ -5,12 +5,11 @@ from pdb import set_trace
import pickle import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property default_builder = HTMLParserTreeBuilder
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self): def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one. # html.parser can't handle namespaced doctypes, so skip this one.
@@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self): def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "") self.assertSoupEquals('</br></br></br>', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")
@@ -7,7 +7,7 @@ try:
import lxml.etree import lxml.etree
LXML_PRESENT = True LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e: except ImportError as e:
LXML_PRESENT = False LXML_PRESENT = False
LXML_VERSION = (0,) LXML_VERSION = (0,)
@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property @property
def default_builder(self): def default_builder(self):
return LXMLTreeBuilder() return LXMLTreeBuilder
def test_out_of_range_entity(self): def test_out_of_range_entity(self):
self.assertSoupEquals( self.assertSoupEquals(
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals( self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>") "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
# a string like u'\x93'
pass
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed. # test if an old version of lxml is installed.
@@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed. # if one is installed.
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />") soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b)) self.assertEqual("<b/>", str(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf( @skipIf(
@@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property @property
def default_builder(self): def default_builder(self):
return LXMLTreeBuilderForXML() return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
# We should not track un-prefixed namespaces as we can only hold one
# and it will be recognized as the default namespace by soupsieve,
# which may be confusing in some situations. When no namespace is provided
# for a selector, the default namespace (if defined) is assumed.
soup = self.soup(
'<?xml version="1.1"?>\n'
'<root>'
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
'</root>'
)
self.assertEqual(
soup._namespaces,
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
)
@@ -24,6 +24,7 @@ from bs4.dammit import (
EncodingDetector, EncodingDetector,
) )
from bs4.testing import ( from bs4.testing import (
default_builder,
SoupTest, SoupTest,
skipIf, skipIf,
) )
@@ -32,7 +33,7 @@ import warnings
try: try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True LXML_PRESENT = True
except ImportError, e: except ImportError as e:
LXML_PRESENT = False LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest): class TestConstructor(SoupTest):
def test_short_unicode_input(self): def test_short_unicode_input(self):
data = u"<h1>éé</h1>" data = "<h1>éé</h1>"
soup = self.soup(data) soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string) self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self): def test_embedded_null(self):
data = u"<h1>foo\0bar</h1>" data = "<h1>foo\0bar</h1>"
soup = self.soup(data) soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string) self.assertEqual("foo\0bar", soup.h1.string)
def test_exclude_encodings(self): def test_exclude_encodings(self):
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding) self.assertEqual("windows-1252", soup.original_encoding)
def test_custom_builder_class(self):
# Verify that you can pass in a custom Builder class and
# it'll be instantiated with the appropriate keyword arguments.
class Mock(object):
def __init__(self, **kwargs):
self.called_with = kwargs
self.is_xml = True
def initialize_soup(self, soup):
pass
def prepare_markup(self, *args, **kwargs):
return ''
kwargs = dict(
var="value",
# This is a deprecated BS3-era keyword argument, which
# will be stripped out.
convertEntities=True,
)
with warnings.catch_warnings(record=True):
soup = BeautifulSoup('', builder=Mock, **kwargs)
assert isinstance(soup.builder, Mock)
self.assertEqual(dict(var="value"), soup.builder.called_with)
# You can also instantiate the TreeBuilder yourself. In this
# case, that specific object is used and any keyword arguments
# to the BeautifulSoup constructor are ignored.
builder = Mock(**kwargs)
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup(
'', builder=builder, ignored_value=True,
)
msg = str(w[0].message)
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
self.assertEqual(builder, soup.builder)
self.assertEqual(kwargs, builder.called_with)
def test_cdata_list_attributes(self):
# Most attribute values are represented as scalars, but the
# HTML standard says that some attributes, like 'class' have
# space-separated lists as values.
markup = '<a id=" an id " class=" a class "></a>'
soup = self.soup(markup)
# Note that the spaces are stripped for 'class' but not for 'id'.
a = soup.a
self.assertEqual(" an id ", a['id'])
self.assertEqual(["a", "class"], a['class'])
# TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
self.assertEqual(" a class ", soup.a['class'])
# Here are two ways of saying that `id` is a multi-valued
# attribute in this context, but 'class' is not.
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
a = soup.a
self.assertEqual(["an", "id"], a['id'])
self.assertEqual(" a class ", a['class'])
class TestWarnings(SoupTest): class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True): def _no_parser_specified(self, s, is_there=True):
@@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list: with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise # note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning # python's warnings system swallows the second warning
soup = self.soup(u"http://www.crummyunicode.com/") soup = self.soup("http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message) self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list)) for w in warning_list))
@@ -141,7 +207,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_unicode_and_space(self): def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list: with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(u"http://www.crummyuncode.com/ is great") soup = self.soup("http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message) self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list)) for w in warning_list))
@@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self): def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities # Unicode characters corresponding to named HTML entities
# are substituted, and no others. # are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar" s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s), self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar") "foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self): def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we # MS smart quotes are a common source of frustration, so we
@@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual( self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"), self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T") "&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self): def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values.""" """There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"' text = 'Bob\'s "bar"'
@@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self): def setUp(self):
super(TestEncodingConversion, self).setUp() super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8") self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like. # Just so you know what it looks like.
self.assertEqual( self.assertEqual(
@@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>" ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii) soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode() unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode)) self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally: finally:
@@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest):
# is not set. # is not set.
soup_from_unicode = self.soup(self.unicode_data) soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None) self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self): def test_utf8_in_unicode_out(self):
@@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set. # attribute is set.
soup_from_utf8 = self.soup(self.utf8_data) soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self): def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8. # The internal data structures can be encoded as UTF-8.
@@ -281,14 +347,14 @@
PYTHON_3_PRE_3_2, PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self): def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase): class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit.""" """Standalone tests of UnicodeDammit."""
def test_unicode_input(self): def test_unicode_input(self):
markup = u"I'm already Unicode! \N{SNOWMAN}" markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup) dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup) self.assertEqual(dammit.unicode_markup, markup)
@@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>" markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup) dammit = UnicodeDammit(markup)
self.assertEqual( self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self): def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>" markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8) dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}') self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self): def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9" hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self): def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -336,19 +402,19 @@
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self): def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self): def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']: for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding]) dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self): def test_exclude_encodings(self):
# This is UTF-8. # This is UTF-8.
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is # But if we exclude UTF-8 from consideration, the guess is
# Windows-1252. # Windows-1252.
@@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase):
detected = EncodingDetector( detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>') b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings) encodings = list(detected.encodings)
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self): def test_detect_html5_style_meta_tag(self):
@@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc) dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters) self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup) self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser") soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters) self.assertTrue(soup.contains_replacement_characters)
@@ -416,17 +482,17 @@
# A document written in UTF-16LE will have its byte order marker stripped. # A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data) dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding) self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self): def test_detwingle(self):
# Here's a UTF8 document. # Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document. # Here's a Windows-1252 document.
windows_1252 = ( windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together. # Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8 doc = utf8 + windows_1252 + utf8
@@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc) fixed = UnicodeDammit.detwingle(doc)
self.assertEqual( self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self): def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending # Each of these characters has a UTF-8 representation ending
@@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte # Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed. # UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in ( for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
): ):
input = tricky_unicode_char.encode("utf8") input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93')) self.assertTrue(input.endswith(b'\x93'))
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods. """Tests for Beautiful Soup's tree traversal methods.
@@ -26,6 +25,7 @@ from bs4.element import (
Comment, Comment,
Declaration, Declaration,
Doctype, Doctype,
Formatter,
NavigableString, NavigableString,
SoupStrainer, SoupStrainer,
Tag, Tag,
@@ -71,13 +71,13 @@ class TestFind(TreeTest):
self.assertEqual(soup.find("b").string, "2") self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self): def test_unicode_text_find(self):
soup = self.soup(u'<h1>Räksmörgås</h1>') soup = self.soup('<h1>Räksmörgås</h1>')
self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås') self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
def test_unicode_attribute_find(self): def test_unicode_attribute_find(self):
soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>') soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
str(soup) str(soup)
self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text) self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
def test_find_everything(self): def test_find_everything(self):
@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
"""You can search the tree for text nodes.""" """You can search the tree for text nodes."""
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
# Exact match. # Exact match.
self.assertEqual(soup.find_all(string="bar"), [u"bar"]) self.assertEqual(soup.find_all(string="bar"), ["bar"])
self.assertEqual(soup.find_all(text="bar"), [u"bar"]) self.assertEqual(soup.find_all(text="bar"), ["bar"])
# Match any of a number of strings. # Match any of a number of strings.
self.assertEqual( self.assertEqual(
soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
# Match a regular expression. # Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')), self.assertEqual(soup.find_all(text=re.compile('.*')),
[u"Foo", u"bar", u'\xbb']) ["Foo", "bar", '\xbb'])
# Match anything. # Match anything.
self.assertEqual(soup.find_all(text=True), self.assertEqual(soup.find_all(text=True),
[u"Foo", u"bar", u'\xbb']) ["Foo", "bar", '\xbb'])
def test_find_all_limit(self): def test_find_all_limit(self):
"""You can limit the number of items returned by find_all.""" """You can limit the number of items returned by find_all."""
@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
["Matching a.", "Matching b."]) ["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self): def test_find_all_by_utf8_attribute_value(self):
peace = u"םולש".encode("utf8") peace = "םולש".encode("utf8")
data = u'<a title="םולש"></a>'.encode("utf8") data = '<a title="םולש"></a>'.encode("utf8")
soup = self.soup(data) soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
self.assertEqual([], soup.find_all(id=1, text="bar")) self.assertEqual([], soup.find_all(id=1, text="bar"))
class TestSmooth(TreeTest):
"""Test Tag.smooth."""
def test_smooth(self):
soup = self.soup("<div>a</div>")
div = soup.div
div.append("b")
div.append("c")
div.append(Comment("Comment 1"))
div.append(Comment("Comment 2"))
div.append("d")
builder = self.default_builder()
span = Tag(soup, builder, 'span')
span.append('1')
span.append('2')
div.append(span)
# At this point the tree has a bunch of adjacent
# NavigableStrings. This is normal, but it has no meaning in
# terms of HTML, so we may want to smooth things out for
# output.
# Since the <span> tag has two children, its .string is None.
self.assertEqual(None, div.span.string)
self.assertEqual(7, len(div.contents))
div.smooth()
self.assertEqual(5, len(div.contents))
# The three strings at the beginning of div.contents have been
# merged into one string.
#
self.assertEqual('abc', div.contents[0])
# The call is recursive -- the <span> tag was also smoothed.
self.assertEqual('12', div.span.string)
# The two comments have _not_ been merged, even though
# comments are strings. Merging comments would change the
# meaning of the HTML.
self.assertEqual('Comment 1', div.contents[1])
self.assertEqual('Comment 2', div.contents[2])
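# A minimal sketch of Tag.smooth() outside the test harness (assumes the
# bs4 version under test, which introduces smooth()): adjacent
# NavigableStrings are merged, while Comments are left alone.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>a</div>", "html.parser")
soup.div.append("b")
soup.div.append("c")
soup.div.smooth()
print(soup.div.contents)  # ['abc'] -- one merged string, not three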
class TestIndex(TreeTest): class TestIndex(TreeTest):
@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
</html>''' </html>'''
# All that whitespace looks good but makes the tests more # All that whitespace looks good but makes the tests more
# difficult. Get rid of it. # difficult. Get rid of it.
markup = re.compile("\n\s*").sub("", markup) markup = re.compile(r"\n\s*").sub("", markup)
self.tree = self.soup(markup) self.tree = self.soup(markup)
@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags.""" """Test the ability to create new tags."""
def test_new_tag(self): def test_new_tag(self):
soup = self.soup("") soup = self.soup("")
new_tag = soup.new_tag("foo", bar="baz") new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
self.assertTrue(isinstance(new_tag, Tag)) self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name) self.assertEqual("foo", new_tag.name)
self.assertEqual(dict(bar="baz"), new_tag.attrs) self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
self.assertEqual(None, new_tag.parent) self.assertEqual(None, new_tag.parent)
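# Sketch of the attrs= parameter to new_tag() exercised above (hedged;
# assumes this bs4 version). It lets you set attributes whose names clash
# with Python keywords or with new_tag()'s own arguments, such as "name".
from bs4 import BeautifulSoup

soup = BeautifulSoup("", "html.parser")
tag = soup.new_tag("input", attrs={"name": "q", "class": "search"})
print(tag)  # <input class="search" name="q"/>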
def test_tag_inherits_self_closing_rules_from_builder(self): def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT: if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml") xml_soup = BeautifulSoup("", "lxml-xml")
@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
soup = self.soup(text) soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a) self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
def test_insert_beautifulsoup_object_inserts_children(self):
"""Inserting one BeautifulSoup object into another actually inserts all
of its children -- you'll never combine BeautifulSoup objects.
"""
soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
text = "<p>p2</p><p>p3</p>"
to_insert = self.soup(text)
soup.insert(1, to_insert)
for i in soup.descendants:
assert not isinstance(i, BeautifulSoup)
p1, p2, p3, p4 = list(soup.children)
self.assertEqual("And now, a word:", p1.string)
self.assertEqual("p2", p2.string)
self.assertEqual("p3", p3.string)
self.assertEqual("And we're back.", p4.string)
def test_replace_with_maintains_next_element_throughout(self): def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('<p><a>one</a><b>three</b></p>') soup = self.soup('<p><a>one</a><b>three</b></p>')
a = soup.a a = soup.a
@ -877,7 +939,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.a.contents[0].next_element, "bar") self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self): def test_insert_tag(self):
builder = self.default_builder builder = self.default_builder()
soup = self.soup( soup = self.soup(
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
magic_tag = Tag(soup, builder, 'magictag') magic_tag = Tag(soup, builder, 'magictag')
@ -912,6 +974,13 @@ class TestTreeModification(SoupTest):
soup.a.append(soup.b) soup.a.append(soup.b)
self.assertEqual(data, soup.decode()) self.assertEqual(data, soup.decode())
def test_extend(self):
data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
soup = self.soup(data)
l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
soup.a.extend(l)
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
def test_move_tag_to_beginning_of_parent(self): def test_move_tag_to_beginning_of_parent(self):
data = "<a><b></b><c></c><d></d></a>" data = "<a><b></b><c></c><d></d></a>"
soup = self.soup(data) soup = self.soup(data)
@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest):
self.assertEqual( self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
# Can't insert an element before itself.
b = soup.b
self.assertRaises(ValueError, b.insert_before, b)
# Can't insert before if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_before, "nope")
# Can insert an identical element
soup = self.soup("<a>")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_before(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_before("BAZ", " ", "QUUX")
soup.a.insert_before("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))
soup.a.insert_before(soup.b, "FOO")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))
def test_insert_after(self): def test_insert_after(self):
soup = self.soup("<a>foo</a><b>bar</b>") soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_after("BAZ") soup.b.insert_after("BAZ")
@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest):
self.assertEqual( self.assertEqual(
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
# Can't insert an element after itself.
b = soup.b
self.assertRaises(ValueError, b.insert_after, b)
# Can't insert after if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_after, "nope")
# Can insert an identical element
soup = self.soup("<a>")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_after(self):
soup = self.soup("<a>foo</a><b>bar</b>")
soup.b.insert_after("BAZ", " ", "QUUX")
soup.a.insert_after("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
soup.b.insert_after(soup.a, "FOO ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))
def test_insert_after_raises_exception_if_after_has_no_meaning(self): def test_insert_after_raises_exception_if_after_has_no_meaning(self):
soup = self.soup("") soup = self.soup("")
tag = soup.new_tag("a") tag = soup.new_tag("a")
@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest):
<script>baz</script> <script>baz</script>
</html>""") </html>""")
[soup.script.extract() for i in soup.find_all("script")] [soup.script.extract() for i in soup.find_all("script")]
self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body)) self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self): def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag tag = soup.bTag
self.assertEqual(soup.b, tag) self.assertEqual(soup.b, tag)
self.assertEqual( self.assertEqual(
'.bTag is deprecated, use .find("b") instead.', '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
str(w[0].message)) str(w[0].message))
def test_has_attr(self): def test_has_attr(self):
@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser') soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding encoding = soup.original_encoding
copy = soup.__copy__() copy = soup.__copy__()
self.assertEqual(u"<p> </p>", unicode(copy)) self.assertEqual("<p> </p>", str(copy))
self.assertEqual(encoding, copy.original_encoding) self.assertEqual(encoding, copy.original_encoding)
def test_unicode_pickle(self): def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled. # A tree containing Unicode characters can be pickled.
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
loaded = pickle.loads(dumped) loaded = pickle.loads(dumped)
self.assertEqual(loaded.decode(), soup.decode()) self.assertEqual(loaded.decode(), soup.decode())
def test_copy_navigablestring_is_not_attached_to_tree(self): def test_copy_navigablestring_is_not_attached_to_tree(self):
html = u"<b>Foo<a></a></b><b>Bar</b>" html = "<b>Foo<a></a></b><b>Bar</b>"
soup = self.soup(html) soup = self.soup(html)
s1 = soup.find(string="Foo") s1 = soup.find(string="Foo")
s2 = copy.copy(s1) s2 = copy.copy(s1)
@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest):
self.assertEqual(None, s2.previous_element) self.assertEqual(None, s2.previous_element)
def test_copy_navigablestring_subclass_has_same_type(self): def test_copy_navigablestring_subclass_has_same_type(self):
html = u"<b><!--Foo--></b>" html = "<b><!--Foo--></b>"
soup = self.soup(html) soup = self.soup(html)
s1 = soup.string s1 = soup.string
s2 = copy.copy(s1) s2 = copy.copy(s1)
@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest):
self.assertTrue(isinstance(s2, Comment)) self.assertTrue(isinstance(s2, Comment))
def test_copy_entire_soup(self): def test_copy_entire_soup(self):
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html) soup = self.soup(html)
soup_copy = copy.copy(soup) soup_copy = copy.copy(soup)
self.assertEqual(soup, soup_copy) self.assertEqual(soup, soup_copy)
def test_copy_tag_copies_contents(self): def test_copy_tag_copies_contents(self):
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html) soup = self.soup(html)
div = soup.div div = soup.div
div_copy = copy.copy(div) div_copy = copy.copy(div)
# The two tags look the same, and evaluate to equal. # The two tags look the same, and evaluate to equal.
self.assertEqual(unicode(div), unicode(div_copy)) self.assertEqual(str(div), str(div_copy))
self.assertEqual(div, div_copy) self.assertEqual(div, div_copy)
# But they're not the same object. # But they're not the same object.
@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
class TestSubstitutions(SoupTest): class TestSubstitutions(SoupTest):
def test_default_formatter_is_minimal(self): def test_default_formatter_is_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="minimal") decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone. # The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for( self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self): def test_formatter_html(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="html") decoded = soup.decode(formatter="html")
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>")) self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_html5(self):
markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html5")
self.assertEqual(
decoded,
self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_minimal(self): def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="minimal") decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone. # The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for( self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_null(self): def test_formatter_null(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter=None) decoded = soup.decode(formatter=None)
# Neither the angle brackets nor the e-with-acute are converted. # Neither the angle brackets nor the e-with-acute are converted.
# This is not valid HTML, but it's what the user wanted. # This is not valid HTML, but it's what the user wanted.
self.assertEqual(decoded, self.assertEqual(decoded,
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self): def test_formatter_custom(self):
markup = u"<b>&lt;foo&gt;</b><b>bar</b>" markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper()) decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of normal entity conversion code, the custom # Instead of normal entity conversion code, the custom
# callable is called on every string. # callable is called on every string.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for(u"<b><FOO></b><b>BAR</b>")) self.document_for("<b><FOO></b><b>BAR</b><br/>"))
def test_formatter_is_run_on_attribute_values(self): def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>' markup = '<a href="http://a.com?a=b&c=é">e</a>'
soup = self.soup(markup) soup = self.soup(markup)
a = soup.a a = soup.a
expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
self.assertEqual(expect_minimal, a.decode()) self.assertEqual(expect_minimal, a.decode())
self.assertEqual(expect_minimal, a.decode(formatter="minimal")) self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
self.assertEqual(expect_html, a.decode(formatter="html")) self.assertEqual(expect_html, a.decode(formatter="html"))
self.assertEqual(markup, a.decode(formatter=None)) self.assertEqual(markup, a.decode(formatter=None))
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self): def test_formatter_skips_script_tag_for_html_documents(self):
@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
self.assertTrue(b"< < hey > >" in encoded) self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self): def test_prettify_leaves_preformatted_text_alone(self):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything # Everything outside the <pre> tag is reformatted, but everything
# inside is left alone. # inside is left alone.
self.assertEqual( self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
soup.div.prettify()) soup.div.prettify())
def test_prettify_accepts_formatter(self): def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper()) pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty) self.assertTrue("FOO" in pretty)
def test_prettify_outputs_unicode_by_default(self): def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>") soup = self.soup("<a></a>")
self.assertEqual(unicode, type(soup.prettify())) self.assertEqual(str, type(soup.prettify()))
def test_prettify_can_encode_data(self): def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>") soup = self.soup("<a></a>")
self.assertEqual(bytes, type(soup.prettify("utf-8"))) self.assertEqual(bytes, type(soup.prettify("utf-8")))
def test_html_entity_substitution_off_by_default(self): def test_html_entity_substitution_off_by_default(self):
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
soup = self.soup(markup) soup = self.soup(markup)
encoded = soup.b.encode("utf-8") encoded = soup.b.encode("utf-8")
self.assertEqual(encoded, markup.encode('utf-8')) self.assertEqual(encoded, markup.encode('utf-8'))
@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings.""" """Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self): def test_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(soup.b.string.encode("utf-8"), self.assertEqual(soup.b.string.encode("utf-8"),
u"\N{SNOWMAN}".encode("utf-8")) "\N{SNOWMAN}".encode("utf-8"))
def test_tag_containing_unicode_string_can_be_encoded(self): def test_tag_containing_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8")) soup.b.encode("utf-8"), html.encode("utf-8"))
def test_encoding_substitutes_unrecognized_characters_by_default(self): def test_encoding_substitutes_unrecognized_characters_by_default(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>") self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
def test_encoding_can_be_made_strict(self): def test_encoding_can_be_made_strict(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertRaises( self.assertRaises(
UnicodeEncodeError, soup.encode, "ascii", errors="strict") UnicodeEncodeError, soup.encode, "ascii", errors="strict")
def test_decode_contents(self): def test_decode_contents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
def test_encode_contents(self): def test_encode_contents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8")) encoding="utf8"))
def test_deprecated_renderContents(self): def test_deprecated_renderContents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
def test_repr(self): def test_repr(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
if PY3K: if PY3K:
self.assertEqual(html, repr(soup)) self.assertEqual(html, repr(soup))
else: else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup)) self.assertEqual(b'<b>\\u2603</b>', repr(soup))
class TestFormatter(SoupTest):
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
self.assertEqual(formatter.called_with, soup.p)
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
class TestNavigableStringSubclasses(SoupTest): class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self): def test_cdata(self):
@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('title') els = self.soup.select('title')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'title') self.assertEqual(els[0].name, 'title')
self.assertEqual(els[0].contents, [u'The title']) self.assertEqual(els[0].contents, ['The title'])
def test_one_tag_many(self): def test_one_tag_many(self):
els = self.soup.select('div') els = self.soup.select('div')
@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0) self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self): def test_invalid_tag(self):
self.assertRaises(ValueError, self.soup.select, 'tag%t') self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self): def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(dashed[0]['id'], 'dash2') self.assertEqual(dashed[0]['id'], 'dash2')
def test_dashed_tag_text(self): def test_dashed_tag_text(self):
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.') self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
def test_select_dashed_matches_find_all(self): def test_select_dashed_matches_find_all(self):
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass") NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises( self.assertRaises(
NotImplementedError, self.soup.select, "a:nth-of-type(a)") SyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self): def test_nth_of_type(self):
# Try to select first paragraph # Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)') els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text') self.assertEqual(els[0].string, 'Some text')
# Try to select third paragraph # Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)') els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another') self.assertEqual(els[0].string, 'Another')
# Try to select (non-existent!) fourth paragraph # Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)') els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0) self.assertEqual(len(els), 0)
# Pass in an invalid value. # Zero will select no tags.
self.assertRaises( els = self.soup.select('div p:nth-of-type(0)')
ValueError, self.soup.select, 'div p:nth-of-type(0)') self.assertEqual(len(els), 0)
def test_nth_of_type_direct_descendant(self): def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)') els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text') self.assertEqual(els[0].string, 'Some text')
def test_id_child_selector_nth_of_type(self): def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2')) self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self): def test_dangling_combinator(self):
self.assertRaises(ValueError, self.soup.select, 'h1 >') self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self): def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self): def test_invalid_multiple_select(self):
self.assertRaises(ValueError, self.soup.select, ',x, y') self.assertRaises(SyntaxError, self.soup.select, ',x, y')
self.assertRaises(ValueError, self.soup.select, 'x,,y') self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self): def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self): def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
def test_select_duplicate_elements(self):
# When markup contains duplicate elements, a multiple select
# will find all of them.
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
soup = BeautifulSoup(markup, 'html.parser')
selected = soup.select(".c1, .c2")
self.assertEqual(3, len(selected))
# Verify that find_all finds the same elements, though because
# of an implementation detail it finds them in a different
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected
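# Sketch of a grouped CSS selector (handled by the soupsieve integration in
# this bs4 version, which is also why invalid selectors now raise
# SyntaxError in the tests above):
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="c1"></div><div class="c2"></div>', "html.parser")
print([tag["class"] for tag in soup.select(".c1, .c2")])  # [['c1'], ['c2']]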

@ -1,3 +0,0 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

@ -1,23 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Execute computations asynchronously using threads or processes."""
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
from concurrent.futures._base import (FIRST_COMPLETED,
FIRST_EXCEPTION,
ALL_COMPLETED,
CancelledError,
TimeoutError,
Future,
Executor,
wait,
as_completed)
from concurrent.futures.thread import ThreadPoolExecutor
try:
from concurrent.futures.process import ProcessPoolExecutor
except ImportError:
# some platforms don't have multiprocessing
pass

@ -1,607 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
import collections
import logging
import threading
import itertools
import time
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
FIRST_COMPLETED = 'FIRST_COMPLETED'
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
ALL_COMPLETED = 'ALL_COMPLETED'
_AS_COMPLETED = '_AS_COMPLETED'
# Possible future states (for internal use by the futures package).
PENDING = 'PENDING'
RUNNING = 'RUNNING'
# The future was cancelled by the user...
CANCELLED = 'CANCELLED'
# ...and _Waiter.add_cancelled() was called by a worker.
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
FINISHED = 'FINISHED'
_FUTURE_STATES = [
PENDING,
RUNNING,
CANCELLED,
CANCELLED_AND_NOTIFIED,
FINISHED
]
_STATE_TO_DESCRIPTION_MAP = {
PENDING: "pending",
RUNNING: "running",
CANCELLED: "cancelled",
CANCELLED_AND_NOTIFIED: "cancelled",
FINISHED: "finished"
}
# Logger for internal use by the futures package.
LOGGER = logging.getLogger("concurrent.futures")
class Error(Exception):
"""Base class for all future-related exceptions."""
pass
class CancelledError(Error):
"""The Future was cancelled."""
pass
class TimeoutError(Error):
"""The operation exceeded the given deadline."""
pass
class _Waiter(object):
"""Provides the event that wait() and as_completed() block on."""
def __init__(self):
self.event = threading.Event()
self.finished_futures = []
def add_result(self, future):
self.finished_futures.append(future)
def add_exception(self, future):
self.finished_futures.append(future)
def add_cancelled(self, future):
self.finished_futures.append(future)
class _AsCompletedWaiter(_Waiter):
"""Used by as_completed()."""
def __init__(self):
super(_AsCompletedWaiter, self).__init__()
self.lock = threading.Lock()
def add_result(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _FirstCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_COMPLETED)."""
def add_result(self, future):
super(_FirstCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
super(_FirstCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
super(_FirstCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _AllCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
def __init__(self, num_pending_calls, stop_on_exception):
self.num_pending_calls = num_pending_calls
self.stop_on_exception = stop_on_exception
self.lock = threading.Lock()
super(_AllCompletedWaiter, self).__init__()
def _decrement_pending_calls(self):
with self.lock:
self.num_pending_calls -= 1
if not self.num_pending_calls:
self.event.set()
def add_result(self, future):
super(_AllCompletedWaiter, self).add_result(future)
self._decrement_pending_calls()
def add_exception(self, future):
super(_AllCompletedWaiter, self).add_exception(future)
if self.stop_on_exception:
self.event.set()
else:
self._decrement_pending_calls()
def add_cancelled(self, future):
super(_AllCompletedWaiter, self).add_cancelled(future)
self._decrement_pending_calls()
class _AcquireFutures(object):
"""A context manager that does an ordered acquire of Future conditions."""
def __init__(self, futures):
self.futures = sorted(futures, key=id)
def __enter__(self):
for future in self.futures:
future._condition.acquire()
def __exit__(self, *args):
for future in self.futures:
future._condition.release()
def _create_and_install_waiters(fs, return_when):
if return_when == _AS_COMPLETED:
waiter = _AsCompletedWaiter()
elif return_when == FIRST_COMPLETED:
waiter = _FirstCompletedWaiter()
else:
pending_count = sum(
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
if return_when == FIRST_EXCEPTION:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
elif return_when == ALL_COMPLETED:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
else:
raise ValueError("Invalid return condition: %r" % return_when)
for f in fs:
f._waiters.append(waiter)
return waiter
def as_completed(fs, timeout=None):
"""An iterator over the given futures that yields each as it completes.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
iterate over.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator that yields the given Futures as they complete (finished or
cancelled). If any given Futures are duplicated, they will be returned
once.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
"""
if timeout is not None:
end_time = timeout + time.time()
fs = set(fs)
with _AcquireFutures(fs):
finished = set(
f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
pending = fs - finished
waiter = _create_and_install_waiters(fs, _AS_COMPLETED)
try:
for future in finished:
yield future
while pending:
if timeout is None:
wait_timeout = None
else:
wait_timeout = end_time - time.time()
if wait_timeout < 0:
raise TimeoutError(
'%d (of %d) futures unfinished' % (
len(pending), len(fs)))
waiter.event.wait(wait_timeout)
with waiter.lock:
finished = waiter.finished_futures
waiter.finished_futures = []
waiter.event.clear()
for future in finished:
yield future
pending.remove(future)
finally:
for f in fs:
with f._condition:
f._waiters.remove(waiter)
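# Hedged usage sketch for as_completed() (works on this Python 2 backport
# and on the standard library version): futures are yielded in completion
# order, not submission order.
from concurrent.futures import ThreadPoolExecutor, as_completed

def square(n):
    return n * n

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(square, n) for n in range(4)]
    for future in as_completed(futures, timeout=10):
        print(future.result())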
DoneAndNotDoneFutures = collections.namedtuple(
'DoneAndNotDoneFutures', 'done not_done')
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
"""Wait for the futures in the given sequence to complete.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
wait upon.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
return_when: Indicates when this function should return. The options
are:
FIRST_COMPLETED - Return when any future finishes or is
cancelled.
FIRST_EXCEPTION - Return when any future finishes by raising an
exception. If no future raises an exception
then it is equivalent to ALL_COMPLETED.
ALL_COMPLETED - Return when all futures finish or are cancelled.
Returns:
A named 2-tuple of sets. The first set, named 'done', contains the
futures that completed (is finished or cancelled) before the wait
completed. The second set, named 'not_done', contains uncompleted
futures.
"""
with _AcquireFutures(fs):
done = set(f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
not_done = set(fs) - done
if (return_when == FIRST_COMPLETED) and done:
return DoneAndNotDoneFutures(done, not_done)
elif (return_when == FIRST_EXCEPTION) and done:
if any(f for f in done
if not f.cancelled() and f.exception() is not None):
return DoneAndNotDoneFutures(done, not_done)
if len(done) == len(fs):
return DoneAndNotDoneFutures(done, not_done)
waiter = _create_and_install_waiters(fs, return_when)
waiter.event.wait(timeout)
for f in fs:
with f._condition:
f._waiters.remove(waiter)
done.update(waiter.finished_futures)
return DoneAndNotDoneFutures(done, set(fs) - done)
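# Hedged usage sketch for wait(): it returns the DoneAndNotDoneFutures
# named pair defined just above.
import time
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED

with ThreadPoolExecutor(max_workers=2) as executor:
    fast = executor.submit(time.sleep, 0.1)
    slow = executor.submit(time.sleep, 1)
    done, not_done = wait([fast, slow], return_when=FIRST_COMPLETED)
    print(len(done), len(not_done))  # typically "1 1"; timing-dependent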
class Future(object):
"""Represents the result of an asynchronous computation."""
def __init__(self):
"""Initializes the future. Should not be called by clients."""
self._condition = threading.Condition()
self._state = PENDING
self._result = None
self._exception = None
self._traceback = None
self._waiters = []
self._done_callbacks = []
def _invoke_callbacks(self):
for callback in self._done_callbacks:
try:
callback(self)
except Exception:
LOGGER.exception('exception calling callback for %r', self)
def __repr__(self):
with self._condition:
if self._state == FINISHED:
if self._exception:
return '<Future at %s state=%s raised %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._exception.__class__.__name__)
else:
return '<Future at %s state=%s returned %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._result.__class__.__name__)
return '<Future at %s state=%s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state])
def cancel(self):
"""Cancel the future if possible.
Returns True if the future was cancelled, False otherwise. A future
cannot be cancelled if it is running or has already completed.
"""
with self._condition:
if self._state in [RUNNING, FINISHED]:
return False
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
return True
self._state = CANCELLED
self._condition.notify_all()
self._invoke_callbacks()
return True
def cancelled(self):
"""Return True if the future has cancelled."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]
def running(self):
"""Return True if the future is currently executing."""
with self._condition:
return self._state == RUNNING
def done(self):
"""Return True of the future was cancelled or finished executing."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]
def __get_result(self):
if self._exception:
raise type(self._exception), self._exception, self._traceback
else:
return self._result
def add_done_callback(self, fn):
"""Attaches a callable that will be called when the future finishes.
Args:
fn: A callable that will be called with this future as its only
argument when the future completes or is cancelled. The callable
will always be called by a thread in the same process in which
it was added. If the future has already completed or been
cancelled then the callable will be called immediately. These
callables are called in the order that they were added.
"""
with self._condition:
if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
self._done_callbacks.append(fn)
return
fn(self)
def result(self, timeout=None):
"""Return the result of the call that the future represents.
Args:
timeout: The number of seconds to wait for the result if the future
isn't done. If None, then there is no limit on the wait time.
Returns:
The result of the call that the future represents.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
Exception: If the call raised then that exception will be raised.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
else:
raise TimeoutError()
def exception_info(self, timeout=None):
"""Return a tuple of (exception, traceback) raised by the call that the
future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
The exception raised by the call that the future represents or None
if the call completed without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
else:
raise TimeoutError()
def exception(self, timeout=None):
"""Return the exception raised by the call that the future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
The exception raised by the call that the future represents or None
if the call completed without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
return self.exception_info(timeout)[0]
# The following methods should only be used by Executors and in tests.
def set_running_or_notify_cancel(self):
"""Mark the future as running or process any cancel notifications.
Should only be used by Executor implementations and unit tests.
If the future has been cancelled (cancel() was called and returned
True) then any threads waiting on the future completing (through calls
to as_completed() or wait()) are notified and False is returned.
If the future was not cancelled then it is put in the running state
(future calls to running() will return True) and True is returned.
This method should be called by Executor implementations before
executing the work associated with this future. If this method returns
False then the work should not be executed.
Returns:
False if the Future was cancelled, True otherwise.
Raises:
RuntimeError: if this method was already called or if set_result()
or set_exception() was called.
"""
with self._condition:
if self._state == CANCELLED:
self._state = CANCELLED_AND_NOTIFIED
for waiter in self._waiters:
waiter.add_cancelled(self)
# self._condition.notify_all() is not necessary because
# self.cancel() triggers a notification.
return False
elif self._state == PENDING:
self._state = RUNNING
return True
else:
LOGGER.critical('Future %s in unexpected state: %s',
id(self),
self._state)
raise RuntimeError('Future in unexpected state')
def set_result(self, result):
"""Sets the return value of work associated with the future.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._result = result
self._state = FINISHED
for waiter in self._waiters:
waiter.add_result(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception_info(self, exception, traceback):
"""Sets the result of the future as being the given exception
and traceback.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._exception = exception
self._traceback = traceback
self._state = FINISHED
for waiter in self._waiters:
waiter.add_exception(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception(self, exception):
"""Sets the result of the future as being the given exception.
Should only be used by Executor implementations and unit tests.
"""
self.set_exception_info(exception, None)
class Executor(object):
"""This is an abstract base class for concrete asynchronous executors."""
def submit(self, fn, *args, **kwargs):
"""Submits a callable to be executed with the given arguments.
Schedules the callable to be executed as fn(*args, **kwargs) and returns
a Future instance representing the execution of the callable.
Returns:
A Future representing the given call.
"""
raise NotImplementedError()
def map(self, fn, *iterables, **kwargs):
"""Returns a iterator equivalent to map(fn, iter).
Args:
fn: A callable that will take as many arguments as there are
passed iterables.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator equivalent to: map(fn, *iterables) but the calls may
be evaluated out-of-order.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
Exception: If fn(*args) raises for any values.
"""
timeout = kwargs.get('timeout')
if timeout is not None:
end_time = timeout + time.time()
fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]
# Yield must be hidden in closure so that the futures are submitted
# before the first iterator value is required.
def result_iterator():
try:
for future in fs:
if timeout is None:
yield future.result()
else:
yield future.result(end_time - time.time())
finally:
for future in fs:
future.cancel()
return result_iterator()
def shutdown(self, wait=True):
"""Clean-up the resources associated with the Executor.
It is safe to call this method several times, but no other
methods can be called after this one.
Args:
wait: If True then shutdown will not return until all running
futures have finished executing and the resources used by the
executor have been reclaimed.
"""
pass
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.shutdown(wait=True)
return False
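# Hedged usage sketch for Executor.map(), defined in the class above:
# results come back in argument order even though the underlying calls may
# finish out of order.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=4) as executor:
    for value in executor.map(pow, [2, 3, 4], [10, 3, 2]):
        print(value)  # 1024, then 27, then 16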

@ -1,359 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ProcessPoolExecutor.
The follow diagram and text describe the data-flow through the system:
|======================= In-process =====================|== Out-of-process ==|
+----------+ +----------+ +--------+ +-----------+ +---------+
| | => | Work Ids | => | | => | Call Q | => | |
| | +----------+ | | +-----------+ | |
| | | ... | | | | ... | | |
| | | 6 | | | | 5, call() | | |
| | | 7 | | | | ... | | |
| Process | | ... | | Local | +-----------+ | Process |
| Pool | +----------+ | Worker | | #1..n |
| Executor | | Thread | | |
| | +----------- + | | +-----------+ | |
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
| | +------------+ | | +-----------+ | |
| | | 6: call() | | | | ... | | |
| | | future | | | | 4, result | | |
| | | ... | | | | 3, except | | |
+----------+ +------------+ +--------+ +-----------+ +---------+
Executor.submit() called:
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
- adds the id of the _WorkItem to the "Work Ids" queue
Local worker thread:
- reads work ids from the "Work Ids" queue and looks up the corresponding
WorkItem from the "Work Items" dict: if the work item has been cancelled then
it is simply removed from the dict, otherwise it is repackaged as a
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
- reads _ResultItems from "Result Q", updates the future stored in the
"Work Items" dict and deletes the dict entry
Process #1..n:
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
_ResultItems in "Request Q"
"""
import atexit
from concurrent.futures import _base
import Queue as queue
import multiprocessing
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads and processes. This is done to allow the
# interpreter to exit when there are still idle processes in a
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
# allowing workers to die with the interpreter has two undesirable properties:
# - The workers would still be running during interpreter shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads/processes finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
# Controls how many more calls than processes will be queued in the call queue.
# A smaller number will mean that processes spend more time idle waiting for
# work while a larger number will make Future.cancel() succeed less frequently
# (Futures in the call queue cannot be cancelled).
EXTRA_QUEUED_CALLS = 1
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
class _ResultItem(object):
def __init__(self, work_id, exception=None, result=None):
self.work_id = work_id
self.exception = exception
self.result = result
class _CallItem(object):
def __init__(self, work_id, fn, args, kwargs):
self.work_id = work_id
self.fn = fn
self.args = args
self.kwargs = kwargs
def _process_worker(call_queue, result_queue):
"""Evaluates calls from call_queue and places the results in result_queue.
This worker is run in a separate process.
Args:
call_queue: A multiprocessing.Queue of _CallItems that will be read and
evaluated by the worker.
result_queue: A multiprocessing.Queue of _ResultItems that will be written
to by the worker.
shutdown: A multiprocessing.Event that will be set as a signal to the
worker that it should exit when call_queue is empty.
"""
while True:
call_item = call_queue.get(block=True)
if call_item is None:
# Wake up queue management thread
result_queue.put(None)
return
try:
r = call_item.fn(*call_item.args, **call_item.kwargs)
except BaseException:
e = sys.exc_info()[1]
result_queue.put(_ResultItem(call_item.work_id,
exception=e))
else:
result_queue.put(_ResultItem(call_item.work_id,
result=r))
def _add_call_item_to_queue(pending_work_items,
work_ids,
call_queue):
"""Fills call_queue with _WorkItems from pending_work_items.
This function never blocks.
Args:
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
are consumed and the corresponding _WorkItems from
pending_work_items are transformed into _CallItems and put in
call_queue.
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems.
"""
while True:
if call_queue.full():
return
try:
work_id = work_ids.get(block=False)
except queue.Empty:
return
else:
work_item = pending_work_items[work_id]
if work_item.future.set_running_or_notify_cancel():
call_queue.put(_CallItem(work_id,
work_item.fn,
work_item.args,
work_item.kwargs),
block=True)
else:
del pending_work_items[work_id]
continue
def _queue_management_worker(executor_reference,
processes,
pending_work_items,
work_ids_queue,
call_queue,
result_queue):
"""Manages the communication between this process and the worker processes.
This function is run in a local thread.
Args:
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
this thread. Used to determine if the ProcessPoolExecutor has been
garbage collected and that this function can exit.
processes: A list of the multiprocessing.Process instances used as
workers.
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems for processing by the process workers.
result_queue: A multiprocessing.Queue of _ResultItems generated by the
process workers.
"""
nb_shutdown_processes = [0]
def shutdown_one_process():
"""Tell a worker to terminate, which will in turn wake us again"""
call_queue.put(None)
nb_shutdown_processes[0] += 1
while True:
_add_call_item_to_queue(pending_work_items,
work_ids_queue,
call_queue)
result_item = result_queue.get(block=True)
if result_item is not None:
work_item = pending_work_items[result_item.work_id]
del pending_work_items[result_item.work_id]
if result_item.exception:
work_item.future.set_exception(result_item.exception)
else:
work_item.future.set_result(result_item.result)
# Delete references to object. See issue16284
del work_item
# Check whether we should start shutting down.
executor = executor_reference()
# No more work items can be added if:
# - The interpreter is shutting down OR
# - The executor that owns this worker has been collected OR
# - The executor that owns this worker has been shutdown.
if _shutdown or executor is None or executor._shutdown_thread:
# Since no new work items can be added, it is safe to shutdown
# this thread if there are no pending work items.
if not pending_work_items:
while nb_shutdown_processes[0] < len(processes):
shutdown_one_process()
# If .join() is not called on the created processes then
# some multiprocessing.Queue methods may deadlock on Mac OS
# X.
for p in processes:
p.join()
call_queue.close()
return
del executor
_system_limits_checked = False
_system_limited = None
def _check_system_limits():
global _system_limits_checked, _system_limited
if _system_limits_checked:
if _system_limited:
raise NotImplementedError(_system_limited)
_system_limits_checked = True
try:
import os
nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
except (AttributeError, ValueError):
# sysconf not available or setting not available
return
if nsems_max == -1:
# indeterminate limit; assume the limit is determined
# by available memory only
return
if nsems_max >= 256:
# minimum number of semaphores available
# according to POSIX
return
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
raise NotImplementedError(_system_limited)
class ProcessPoolExecutor(_base.Executor):
def __init__(self, max_workers=None):
"""Initializes a new ProcessPoolExecutor instance.
Args:
max_workers: The maximum number of processes that can be used to
execute the given calls. If None or not given then as many
worker processes will be created as the machine has processors.
"""
_check_system_limits()
if max_workers is None:
self._max_workers = multiprocessing.cpu_count()
else:
self._max_workers = max_workers
# Make the call queue slightly larger than the number of processes to
# prevent the worker processes from idling. But don't make it too big
# because futures in the call queue cannot be cancelled.
self._call_queue = multiprocessing.Queue(self._max_workers +
EXTRA_QUEUED_CALLS)
self._result_queue = multiprocessing.Queue()
self._work_ids = queue.Queue()
self._queue_management_thread = None
self._processes = set()
# Shutdown is a two-step process.
self._shutdown_thread = False
self._shutdown_lock = threading.Lock()
self._queue_count = 0
self._pending_work_items = {}
def _start_queue_management_thread(self):
# When the executor gets lost, the weakref callback will wake up
# the queue management thread.
def weakref_cb(_, q=self._result_queue):
q.put(None)
if self._queue_management_thread is None:
self._queue_management_thread = threading.Thread(
target=_queue_management_worker,
args=(weakref.ref(self, weakref_cb),
self._processes,
self._pending_work_items,
self._work_ids,
self._call_queue,
self._result_queue))
self._queue_management_thread.daemon = True
self._queue_management_thread.start()
_threads_queues[self._queue_management_thread] = self._result_queue
def _adjust_process_count(self):
for _ in range(len(self._processes), self._max_workers):
p = multiprocessing.Process(
target=_process_worker,
args=(self._call_queue,
self._result_queue))
p.start()
self._processes.add(p)
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown_thread:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._pending_work_items[self._queue_count] = w
self._work_ids.put(self._queue_count)
self._queue_count += 1
# Wake up queue management thread
self._result_queue.put(None)
self._start_queue_management_thread()
self._adjust_process_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown_thread = True
if self._queue_management_thread:
# Wake up queue management thread
self._result_queue.put(None)
if wait:
self._queue_management_thread.join(sys.maxint)
# To reduce the risk of opening too many files, remove references to
# objects that use file descriptors.
self._queue_management_thread = None
self._call_queue = None
self._result_queue = None
self._processes = None
shutdown.__doc__ = _base.Executor.shutdown.__doc__
atexit.register(_python_exit)
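A minimal usage sketch for the backported ProcessPoolExecutor above, assuming the package is importable as concurrent.futures; the square function is purely illustrative:

# Hypothetical CPU-bound callable, for illustration only.
from concurrent.futures import ProcessPoolExecutor

def square(x):
    return x * x

if __name__ == '__main__':
    executor = ProcessPoolExecutor(max_workers=2)
    futures = [executor.submit(square, i) for i in range(4)]
    print([f.result() for f in futures])  # [0, 1, 4, 9]
    executor.shutdown(wait=True)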

@ -1,134 +0,0 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ThreadPoolExecutor."""
import atexit
from concurrent.futures import _base
import Queue as queue
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads. This is done to allow the interpreter
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
# pool (i.e. shutdown() was not called). However, allowing workers to die with
# the interpreter has two undesirable properties:
# - The workers would still be running during interpreter shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
atexit.register(_python_exit)
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
def run(self):
if not self.future.set_running_or_notify_cancel():
return
try:
result = self.fn(*self.args, **self.kwargs)
except BaseException:
e, tb = sys.exc_info()[1:]
self.future.set_exception_info(e, tb)
else:
self.future.set_result(result)
def _worker(executor_reference, work_queue):
try:
while True:
work_item = work_queue.get(block=True)
if work_item is not None:
work_item.run()
# Delete references to object. See issue16284
del work_item
continue
executor = executor_reference()
# Exit if:
# - The interpreter is shutting down OR
# - The executor that owns the worker has been collected OR
# - The executor that owns the worker has been shutdown.
if _shutdown or executor is None or executor._shutdown:
# Notify other workers
work_queue.put(None)
return
del executor
except BaseException:
_base.LOGGER.critical('Exception in worker', exc_info=True)
class ThreadPoolExecutor(_base.Executor):
def __init__(self, max_workers):
"""Initializes a new ThreadPoolExecutor instance.
Args:
max_workers: The maximum number of threads that can be used to
execute the given calls.
"""
self._max_workers = max_workers
self._work_queue = queue.Queue()
self._threads = set()
self._shutdown = False
self._shutdown_lock = threading.Lock()
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._work_queue.put(w)
self._adjust_thread_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def _adjust_thread_count(self):
# When the executor gets lost, the weakref callback will wake up
# the worker threads.
def weakref_cb(_, q=self._work_queue):
q.put(None)
# TODO(bquinlan): Should avoid creating new threads if there are more
# idle threads than items in the work queue.
if len(self._threads) < self._max_workers:
t = threading.Thread(target=_worker,
args=(weakref.ref(self, weakref_cb),
self._work_queue))
t.daemon = True
t.start()
self._threads.add(t)
_threads_queues[t] = self._work_queue
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown = True
self._work_queue.put(None)
if wait:
for t in self._threads:
t.join(sys.maxint)
shutdown.__doc__ = _base.Executor.shutdown.__doc__
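A similar sketch for the ThreadPoolExecutor being removed here, under the same import assumption; slow_io stands in for any I/O-bound callable:

import time
from concurrent.futures import ThreadPoolExecutor

def slow_io(n):
    time.sleep(0.1)  # placeholder for network or disk I/O
    return n

pool = ThreadPoolExecutor(max_workers=4)
futures = [pool.submit(slow_io, i) for i in range(8)]
print([f.result() for f in futures])
pool.shutdown(wait=True)  # without this, the atexit handler above drains
                          # the work queue at interpreter exit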

@ -1,73 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
"""Death by Captcha HTTP and socket API clients.
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
socket ones. Both offer the same functionality, with the socket API
sporting faster responses and using far fewer connections.
To access the socket API, use SocketClient class; for the HTTP API, use
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
connection open and serializes all API requests sent through it, so
it is advised to keep a pool of them if your script is heavily
multithreaded.
Both SocketClient and HttpClient give you the following methods:
get_user()
Returns your DBC account details as a dict with the following keys:
"user": your account numeric ID; if login fails, it will be the only
item with the value of 0;
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
solved CAPTCHA in US cents;
"balance": your DBC account balance in US cents;
"is_banned": flag indicating whether your account is suspended or not.
get_balance()
Returns your DBC account balance in US cents.
get_captcha(cid)
Returns an uploaded CAPTCHA details as a dict with the following keys:
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
be the only item with the value of 0;
"text": the CAPTCHA text, if solved, otherwise None;
"is_correct": flag indicating whether the CAPTCHA was solved correctly
(DBC can detect that in rare cases).
The only argument `cid` is the CAPTCHA numeric ID.
get_text(cid)
Returns an uploaded CAPTCHA text (None if not solved). The only argument
`cid` is the CAPTCHA numeric ID.
report(cid)
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
CAPTCHA numeric ID. Returns True on success, False otherwise.
upload(captcha)
Uploads a CAPTCHA. The only argument `captcha` can be either a file-like
object (any object with a `read` method defined, actually, so StringIO
will do), or a CAPTCHA image file name. On successful upload you'll get
the CAPTCHA details dict (see get_captcha() method).
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
to poll for its status periodically using get_captcha() or get_text()
method until the CAPTCHA is solved and you get the text.
decode(captcha, timeout=DEFAULT_TIMEOUT)
A convenient method that uploads a CAPTCHA and polls for its status
periodically, but no longer than `timeout` (defaults to 60 seconds).
If solved, you'll get the CAPTCHA details dict (see get_captcha()
method for details). See upload() method for details on `captcha`
argument.
Visit http://www.deathbycaptcha.com/user/api for updates.
"""
import base64 import base64
import binascii import binascii
import errno import errno
@ -79,8 +12,7 @@ import socket
import sys import sys
import threading import threading
import time import time
import urllib
import urllib2
try: try:
from json import read as json_decode, write as json_encode from json import read as json_decode, write as json_encode
except ImportError: except ImportError:
@ -89,64 +21,71 @@ except ImportError:
except ImportError: except ImportError:
from simplejson import loads as json_decode, dumps as json_encode from simplejson import loads as json_decode, dumps as json_encode
try:
from urllib2 import build_opener, HTTPRedirectHandler, Request, HTTPError
from urllib import urlencode, urlopen
except ImportError:
from urllib.request import build_opener, HTTPRedirectHandler, Request, urlopen
from urllib.error import HTTPError
from urllib.parse import urlencode
# API version and unique software ID # API version and unique software ID
API_VERSION = 'DBC/Python v4.6' API_VERSION = 'DBC/Python v4.0.11'
SOFTWARE_VENDOR_ID = 0
# Default CAPTCHA timeout and decode() polling interval # Default CAPTCHA timeout and decode() polling interval
DEFAULT_TIMEOUT = 60 DEFAULT_TIMEOUT = 60
DEFAULT_TOKEN_TIMEOUT = 120 POLLS_INTERVAL = 5
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3
# Base HTTP API url # Base HTTP API url
HTTP_BASE_URL = 'http://api.dbcapi.me/api' HTTP_BASE_URL = 'http://api.deathbycaptcha.com/api'
# Preferred HTTP API server's response content type, do not change # Preferred HTTP API server's response content type, do not change
HTTP_RESPONSE_TYPE = 'application/json' HTTP_RESPONSE_TYPE = 'application/json'
# Socket API server's host & ports range # Socket API server's host & ports range
SOCKET_HOST = 'api.dbcapi.me' SOCKET_HOST = 'api.deathbycaptcha.com'
SOCKET_PORTS = range(8123, 8131) SOCKET_PORTS = range(8123, 8131)
def _load_image(captcha):
if hasattr(captcha, 'read'):
img = captcha.read()
elif type(captcha) == bytearray:
img = captcha
else:
img = ''
try:
captcha_file = open(captcha, 'rb')
except Exception:
raise
else:
img = captcha_file.read()
captcha_file.close()
if not len(img):
raise ValueError('CAPTCHA image is empty')
elif imghdr.what(None, img) is None:
raise TypeError('Unknown CAPTCHA image type')
else:
return img
class AccessDeniedException(Exception): class AccessDeniedException(Exception):
pass pass
class Client(object): class Client(object):
"""Death by Captcha API Client"""
"""Death by Captcha API Client."""
def __init__(self, username, password): def __init__(self, username, password):
self.is_verbose = False self.is_verbose = False
self.userpwd = {'username': username, 'password': password} self.userpwd = {'username': username,
'password': password}
def _load_file(self, captcha):
if hasattr(captcha, 'read'):
raw_captcha = captcha.read()
elif isinstance(captcha, bytearray):
raw_captcha = captcha
elif os.path.isfile(captcha):
raw_captcha = ''
try:
f = open(captcha, 'rb')
except Exception as e:
raise e
else:
raw_captcha = f.read()
f.close()
else:
f_stream = urlopen(captcha)
raw_captcha = f_stream.read()
if not len(raw_captcha):
raise ValueError('CAPTCHA image is empty')
elif imghdr.what(None, raw_captcha) is None:
raise TypeError('Unknown CAPTCHA image type')
else:
return raw_captcha
def _log(self, cmd, msg=''): def _log(self, cmd, msg=''):
if self.is_verbose: if self.is_verbose:
print '%d %s %s' % (time.time(), cmd, msg.rstrip()) print('%d %s %s' % (time.time(), cmd, msg.rstrip()))
return self return self
def close(self): def close(self):
@ -156,16 +95,16 @@ class Client(object):
pass pass
def get_user(self): def get_user(self):
"""Fetch user details -- ID, balance, rate and banned status.""" """Fetch the user's details dict -- balance, rate and banned status."""
raise NotImplementedError() raise NotImplemented()
def get_balance(self): def get_balance(self):
"""Fetch user balance (in US cents).""" """Fetch the user's balance (in US cents)."""
return self.get_user().get('balance') return self.get_user().get('balance')
def get_captcha(self, cid): def get_captcha(self, cid):
"""Fetch a CAPTCHA details -- ID, text and correctness flag.""" """Fetch a CAPTCHA details dict -- its ID, text and correctness."""
raise NotImplementedError() raise NotImplemented()
def get_text(self, cid): def get_text(self, cid):
"""Fetch a CAPTCHA text.""" """Fetch a CAPTCHA text."""
@ -173,7 +112,11 @@ class Client(object):
def report(self, cid): def report(self, cid):
"""Report a CAPTCHA as incorrectly solved.""" """Report a CAPTCHA as incorrectly solved."""
raise NotImplementedError() raise NotImplemented()
def remove(self, cid):
"""Remove an unsolved CAPTCHA."""
raise NotImplemented()
def upload(self, captcha): def upload(self, captcha):
"""Upload a CAPTCHA. """Upload a CAPTCHA.
@ -182,56 +125,32 @@ class Client(object):
dict on success. dict on success.
""" """
raise NotImplementedError() raise NotImplemented()
def decode(self, captcha=None, timeout=None, **kwargs): def decode(self, captcha, timeout=DEFAULT_TIMEOUT):
""" """Try to solve a CAPTCHA.
Try to solve a CAPTCHA.
See Client.upload() for arguments details. See Client.upload() for arguments details.
Uploads a CAPTCHA, polls for its status periodically with arbitrary Uploads a CAPTCHA, polls for its status periodically with arbitrary
timeout (in seconds), returns CAPTCHA details if (correctly) solved. timeout (in seconds), returns CAPTCHA details if (correctly) solved.
"""
if not timeout:
if not captcha:
timeout = DEFAULT_TOKEN_TIMEOUT
else:
timeout = DEFAULT_TIMEOUT
"""
deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT) deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
uploaded_captcha = self.upload(captcha, **kwargs) c = self.upload(captcha)
if uploaded_captcha: if c:
intvl_idx = 0 # POLL_INTERVAL index while deadline > time.time() and not c.get('text'):
while deadline > time.time() and not uploaded_captcha.get('text'): time.sleep(POLLS_INTERVAL)
intvl, intvl_idx = self._get_poll_interval(intvl_idx) c = self.get_captcha(c['captcha'])
time.sleep(intvl) if c.get('text') and c.get('is_correct'):
pulled = self.get_captcha(uploaded_captcha['captcha']) return c
if pulled['captcha'] == uploaded_captcha['captcha']:
uploaded_captcha = pulled
if uploaded_captcha.get('text') and \
uploaded_captcha.get('is_correct'):
return uploaded_captcha
def _get_poll_interval(self, idx):
"""Returns poll interval and next index depending on index provided"""
if len(POLLS_INTERVAL) > idx:
intvl = POLLS_INTERVAL[idx]
else:
intvl = DFLT_POLL_INTERVAL
idx += 1
return intvl, idx
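A worked trace of the schedule implemented above: the first nine polls follow POLLS_INTERVAL, after which every poll waits DFLT_POLL_INTERVAL seconds.

# Standalone trace of the back-off schedule (constants copied from above).
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3

idx, waits = 0, []
for _ in range(11):
    intvl = POLLS_INTERVAL[idx] if idx < len(POLLS_INTERVAL) else DFLT_POLL_INTERVAL
    idx += 1
    waits.append(intvl)
print(waits)  # [1, 1, 2, 3, 2, 2, 3, 2, 2, 3, 3] -- 24s over eleven polls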
class HttpClient(Client): class HttpClient(Client):
"""Death by Captcha HTTP API client.""" """Death by Captcha HTTP API client."""
def __init__(self, *args): def __init__(self, *args):
Client.__init__(self, *args) Client.__init__(self, *args)
self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler()) self.opener = build_opener(HTTPRedirectHandler())
def _call(self, cmd, payload=None, headers=None): def _call(self, cmd, payload=None, headers=None):
if headers is None: if headers is None:
@ -239,30 +158,22 @@ class HttpClient(Client):
headers['Accept'] = HTTP_RESPONSE_TYPE headers['Accept'] = HTTP_RESPONSE_TYPE
headers['User-Agent'] = API_VERSION headers['User-Agent'] = API_VERSION
if hasattr(payload, 'items'): if hasattr(payload, 'items'):
payload = urllib.urlencode(payload) payload = urlencode(payload)
self._log('SEND', '%s %d %s' % (cmd, len(payload), payload)) self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
else:
self._log('SEND', '%s' % cmd)
if payload is not None: if payload is not None:
headers['Content-Length'] = len(payload) headers['Content-Length'] = len(payload)
try: try:
response = self.opener.open(urllib2.Request( response = self.opener.open(Request(
HTTP_BASE_URL + '/' + cmd.strip('/'), HTTP_BASE_URL + '/' + cmd.strip('/'),
data=payload, data=payload,
headers=headers headers=headers
)).read() )).read()
except urllib2.HTTPError, err: except HTTPError as e:
if 403 == err.code: if 403 == e.code:
raise AccessDeniedException('Access denied, please check' raise AccessDeniedException(
' your credentials and/or balance') 'Access denied, please check your credentials and/or balance')
elif 400 == err.code or 413 == err.code: elif 400 == e.code or 413 == e.code:
raise ValueError("CAPTCHA was rejected by the service, check" raise ValueError("CAPTCHA was rejected by the service, check if it's a valid image")
" if it's a valid image")
elif 503 == err.code:
raise OverflowError("CAPTCHA was rejected due to service"
" overload, try again later")
else:
raise err
else: else:
self._log('RECV', '%d %s' % (len(response), response)) self._log('RECV', '%d %s' % (len(response), response))
try: try:
@ -281,53 +192,38 @@ class HttpClient(Client):
return not self._call('captcha/%d/report' % cid, return not self._call('captcha/%d/report' % cid,
self.userpwd.copy()).get('is_correct') self.userpwd.copy()).get('is_correct')
def upload(self, captcha=None, **kwargs): def remove(self, cid):
boundary = binascii.hexlify(os.urandom(16)) return not self._call('captcha/%d/remove' % cid,
banner = kwargs.get('banner', '') self.userpwd.copy()).get('captcha')
if banner:
kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
body = '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in self.userpwd.items())
body += '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in kwargs.items())
if captcha:
img = _load_image(captcha)
body += '\r\n'.join((
'',
'--%s' % boundary,
'Content-Disposition: form-data; name="captchafile"; '
'filename="captcha"',
'Content-Type: application/octet-stream',
'Content-Length: %d' % len(img),
'',
img,
'--%s--' % boundary,
''
))
def upload(self, captcha):
boundary = binascii.hexlify(os.urandom(16))
data = self.userpwd.copy()
data['swid'] = SOFTWARE_VENDOR_ID
body = '\r\n'.join(('\r\n'.join(('--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v))))
for k, v in data.items())
captcha = self._load_file(captcha)
body += '\r\n'.join(('',
'--%s' % boundary,
'Content-Disposition: form-data; name="captchafile"; filename="captcha"',
'Content-Type: application/octet-stream',
'Content-Length: %d' % len(captcha),
'',
captcha,
'--%s--' % boundary,
''))
response = self._call('captcha', body, { response = self._call('captcha', body, {
'Content-Type': 'multipart/form-data; boundary="%s"' % boundary 'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
}) or {} }) or {}
if response.get('captcha'): if response.get('captcha'):
return response return response
class SocketClient(Client): class SocketClient(Client):
"""Death by Captcha socket API client.""" """Death by Captcha socket API client."""
TERMINATOR = '\r\n' TERMINATOR = '\r\n'
@ -357,11 +253,12 @@ class SocketClient(Client):
self.socket.settimeout(0) self.socket.settimeout(0)
try: try:
self.socket.connect(host) self.socket.connect(host)
except socket.error, err: except socket.error as e:
if (err.args[0] not in if errno.EINPROGRESS == e[0]:
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): pass
else:
self.close() self.close()
raise err raise e
return self.socket return self.socket
def __del__(self): def __del__(self):
@ -372,30 +269,27 @@ class SocketClient(Client):
fds = [sock] fds = [sock]
buf += self.TERMINATOR buf += self.TERMINATOR
response = '' response = ''
intvl_idx = 0
while True: while True:
intvl, intvl_idx = self._get_poll_interval(intvl_idx) rd, wr, ex = select.select((not buf and fds) or [],
rds, wrs, exs = select.select((not buf and fds) or [], (buf and fds) or [],
(buf and fds) or [], fds,
fds, POLLS_INTERVAL)
intvl) if ex:
if exs:
raise IOError('select() failed') raise IOError('select() failed')
try: try:
if wrs: if wr:
while buf: while buf:
buf = buf[wrs[0].send(buf):] buf = buf[wr[0].send(buf):]
elif rds: elif rd:
while True: while True:
s = rds[0].recv(256) s = rd[0].recv(256)
if not s: if not s:
raise IOError('recv(): connection lost') raise IOError('recv(): connection lost')
else: else:
response += s response += s
except socket.error, err: except socket.error as e:
if (err.args[0] not in if e[0] not in (errno.EAGAIN, errno.EINPROGRESS):
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): raise e
raise err
if response.endswith(self.TERMINATOR): if response.endswith(self.TERMINATOR):
self._log('RECV', response) self._log('RECV', response)
return response.rstrip(self.TERMINATOR) return response.rstrip(self.TERMINATOR)
@ -409,18 +303,16 @@ class SocketClient(Client):
request = json_encode(data) request = json_encode(data)
response = None response = None
for _ in range(2): for i in range(2):
if not self.socket and cmd != 'login':
self._call('login', self.userpwd.copy())
self.socket_lock.acquire() self.socket_lock.acquire()
try: try:
sock = self.connect() sock = self.connect()
response = self._sendrecv(sock, request) response = self._sendrecv(sock, request)
except IOError, err: except IOError as e:
sys.stderr.write(str(err) + "\n") sys.stderr.write(str(e) + "\n")
self.close() self.close()
except socket.error, err: except socket.error as e:
sys.stderr.write(str(err) + "\n") sys.stderr.write(str(e) + "\n")
self.close() self.close()
raise IOError('Connection refused') raise IOError('Connection refused')
else: else:
@ -428,89 +320,84 @@ class SocketClient(Client):
finally: finally:
self.socket_lock.release() self.socket_lock.release()
if response is None:
raise IOError('Connection lost or timed out during API request')
try: try:
response = json_decode(response) if response is None:
except Exception: raise IOError('Connection lost or timed out during API request')
raise RuntimeError('Invalid API response') try:
response = json_decode(response)
if not response.get('error'): except Exception:
return response raise RuntimeError('Invalid API response')
if 'error' in response:
error = response['error'] error = response['error']
if error in ('not-logged-in', 'invalid-credentials'): if 'not-logged-in' == error:
raise AccessDeniedException('Access denied, check your credentials') raise AccessDeniedException('Access denied, check your credentials')
elif 'banned' == error: elif 'banned' == error:
raise AccessDeniedException('Access denied, account is suspended') raise AccessDeniedException('Access denied, account is suspended')
elif 'insufficient-funds' == error: elif 'insufficient-funds' == error:
raise AccessDeniedException( raise AccessDeniedException('CAPTCHA was rejected due to low balance')
'CAPTCHA was rejected due to low balance') elif 'invalid-captcha' == error:
elif 'invalid-captcha' == error: raise ValueError('CAPTCHA is not a valid image')
raise ValueError('CAPTCHA is not a valid image') elif 'service-overload' == error:
elif 'service-overload' == error: raise ValueError(
raise OverflowError( 'CAPTCHA was rejected due to service overload, try again later')
'CAPTCHA was rejected due to service overload, try again later') else:
else: raise RuntimeError('API server error occurred: %s' % error)
except Exception as e:
self.socket_lock.acquire() self.socket_lock.acquire()
self.close() self.close()
self.socket_lock.release() self.socket_lock.release()
raise RuntimeError('API server error occurred: %s' % error) raise e
else:
return response
def get_user(self): def get_user(self):
return self._call('user') or {'user': 0} return self._call('user', self.userpwd.copy()) or {'user': 0}
def get_captcha(self, cid): def get_captcha(self, cid):
return self._call('captcha', {'captcha': cid}) or {'captcha': 0} return self._call('captcha', {'captcha': cid}) or {'captcha': 0}
def upload(self, captcha=None, **kwargs): def upload(self, captcha):
data = {} data = self.userpwd.copy()
if captcha: data['captcha'] = base64.b64encode(self._load_file(captcha))
data['captcha'] = base64.b64encode(_load_image(captcha))
if kwargs:
banner = kwargs.get('banner', '')
if banner:
kwargs['banner'] = base64.b64encode(_load_image(banner))
data.update(kwargs)
response = self._call('upload', data) response = self._call('upload', data)
if response.get('captcha'): if response.get('captcha'):
uploaded_captcha = dict( return dict((k, response.get(k)) for k in ('captcha', 'text', 'is_correct'))
(k, response.get(k))
for k in ('captcha', 'text', 'is_correct')
)
if not uploaded_captcha['text']:
uploaded_captcha['text'] = None
return uploaded_captcha
def report(self, cid): def report(self, cid):
return not self._call('report', {'captcha': cid}).get('is_correct') data = self.userpwd.copy()
data['captcha'] = cid
return not self._call('report', data).get('is_correct')
def remove(self, cid):
data = self.userpwd.copy()
data['captcha'] = cid
return not self._call('remove', data).get('captcha')
if '__main__' == __name__: if '__main__' == __name__:
import sys
# Put your DBC username & password here: # Put your DBC username & password here:
# client = HttpClient(sys.argv[1], sys.argv[2]) #client = HttpClient(sys.argv[1], sys.argv[2])
client = SocketClient(sys.argv[1], sys.argv[2]) client = SocketClient(sys.argv[1], sys.argv[2])
client.is_verbose = True client.is_verbose = True
print 'Your balance is %s US cents' % client.get_balance() print('Your balance is %s US cents' % client.get_balance())
for fn in sys.argv[3:]: for fn in sys.argv[3:]:
try: try:
# Put your CAPTCHA image file name or file-like object, and optional # Put your CAPTCHA image file name or file-like object, and optional
# solving timeout (in seconds) here: # solving timeout (in seconds) here:
captcha = client.decode(fn, DEFAULT_TIMEOUT) captcha = client.decode(fn, DEFAULT_TIMEOUT)
except Exception, e: except Exception as e:
sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, )) sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
captcha = None captcha = None
if captcha: if captcha:
print 'CAPTCHA %d solved: %s' % \ print('CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text']))
(captcha['captcha'], captcha['text'])
# Report as incorrectly solved if needed. Make sure the CAPTCHA was # Report as incorrectly solved if needed. Make sure the CAPTCHA was
# in fact incorrectly solved! # in fact incorrectly solved!
# try: try:
# client.report(captcha['captcha']) client.report(captcha['captcha'])
# except Exception, e: except Exception as e:
# sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, )) sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))

@ -40,7 +40,7 @@ import operator
import itertools import itertools
import collections import collections
__version__ = '4.3.0' __version__ = '4.4.0'
if sys.version >= '3': if sys.version >= '3':
from inspect import getfullargspec from inspect import getfullargspec
@ -65,6 +65,12 @@ except AttributeError:
# let's assume there are no coroutine functions in old Python # let's assume there are no coroutine functions in old Python
def iscoroutinefunction(f): def iscoroutinefunction(f):
return False return False
try:
from inspect import isgeneratorfunction
except ImportError:
# assume no generator function in old Python versions
def isgeneratorfunction(caller):
return False
DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(') DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(')
@ -173,7 +179,8 @@ class FunctionMaker(object):
# Ensure each generated function has a unique filename for profilers # Ensure each generated function has a unique filename for profilers
# (such as cProfile) that depend on the tuple of (<filename>, # (such as cProfile) that depend on the tuple of (<filename>,
# <definition line>, <function name>) being unique. # <definition line>, <function name>) being unique.
filename = '<decorator-gen-%d>' % (next(self._compile_count),) filename = '<%s:decorator-gen-%d>' % (
__file__, next(self._compile_count))
try: try:
code = compile(src, filename, 'single') code = compile(src, filename, 'single')
exec(code, evaldict) exec(code, evaldict)
@ -218,6 +225,8 @@ class FunctionMaker(object):
def decorate(func, caller, extras=()): def decorate(func, caller, extras=()):
""" """
decorate(func, caller) decorates a function using a caller. decorate(func, caller) decorates a function using a caller.
If the caller is a generator function, the resulting function
will be a generator function.
""" """
evaldict = dict(_call_=caller, _func_=func) evaldict = dict(_call_=caller, _func_=func)
es = '' es = ''
@ -225,9 +234,23 @@ def decorate(func, caller, extras=()):
ex = '_e%d_' % i ex = '_e%d_' % i
evaldict[ex] = extra evaldict[ex] = extra
es += ex + ', ' es += ex + ', '
fun = FunctionMaker.create(
func, "return _call_(_func_, %s%%(shortsignature)s)" % es, if '3.5' <= sys.version < '3.6':
evaldict, __wrapped__=func) # with Python 3.5 isgeneratorfunction returns True for all coroutines
# however we know that it is NOT possible to have a generator
# coroutine in python 3.5: PEP525 was not there yet
generatorcaller = isgeneratorfunction(
caller) and not iscoroutinefunction(caller)
else:
generatorcaller = isgeneratorfunction(caller)
if generatorcaller:
fun = FunctionMaker.create(
func, "for res in _call_(_func_, %s%%(shortsignature)s):\n"
" yield res" % es, evaldict, __wrapped__=func)
else:
fun = FunctionMaker.create(
func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
evaldict, __wrapped__=func)
if hasattr(func, '__qualname__'): if hasattr(func, '__qualname__'):
fun.__qualname__ = func.__qualname__ fun.__qualname__ = func.__qualname__
return fun return fun
@ -261,12 +284,12 @@ def decorator(caller, _func=None):
doc = caller.__call__.__doc__ doc = caller.__call__.__doc__
evaldict = dict(_call=caller, _decorate_=decorate) evaldict = dict(_call=caller, _decorate_=decorate)
dec = FunctionMaker.create( dec = FunctionMaker.create(
'%s(%s func)' % (name, defaultargs), '%s(func, %s)' % (name, defaultargs),
'if func is None: return lambda func: _decorate_(func, _call, (%s))\n' 'if func is None: return lambda func: _decorate_(func, _call, (%s))\n'
'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs), 'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs),
evaldict, doc=doc, module=caller.__module__, __wrapped__=caller) evaldict, doc=doc, module=caller.__module__, __wrapped__=caller)
if defaults: if defaults:
dec.__defaults__ = defaults + (None,) dec.__defaults__ = (None,) + defaults
return dec return dec
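A small sketch of the generator-aware behaviour added above, assuming the module is importable as decorator; logged and count are illustrative names:

from decorator import decorator

@decorator
def logged(func, *args, **kw):
    # generator caller: the decorated function stays a generator function
    print('entering %s' % func.__name__)
    for item in func(*args, **kw):
        yield item

@logged
def count(n):
    for i in range(n):
        yield i

print(list(count(3)))  # prints 'entering count', then [0, 1, 2]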

@ -1,4 +1,4 @@
__version__ = '0.6.5' __version__ = '0.7.1'
from .lock import Lock # noqa from .lock import Lock # noqa
from .lock import NeedRegenerationException # noqa from .lock import NeedRegenerationException # noqa

@ -10,8 +10,9 @@ from ..util import compat
import time import time
import datetime import datetime
from numbers import Number from numbers import Number
from functools import wraps from functools import wraps, partial
import threading import threading
from decorator import decorate
_backend_loader = PluginLoader("dogpile.cache") _backend_loader = PluginLoader("dogpile.cache")
register_backend = _backend_loader.register register_backend = _backend_loader.register
@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy):
class CacheRegion(object): class CacheRegion(object):
"""A front end to a particular cache backend. r"""A front end to a particular cache backend.
:param name: Optional, a string name for the region. :param name: Optional, a string name for the region.
This isn't used internally This isn't used internally
@ -484,6 +485,26 @@ class CacheRegion(object):
else: else:
return self._LockWrapper() return self._LockWrapper()
# cached value
_actual_backend = None
@property
def actual_backend(self):
"""Return the ultimate backend underneath any proxies.
The backend might be the result of one or more ``proxy.wrap``
applications. If so, derive the actual underlying backend.
.. versionadded:: 0.6.6
"""
if self._actual_backend is None:
_backend = self.backend
while hasattr(_backend, 'proxied'):
_backend = _backend.proxied
self._actual_backend = _backend
return self._actual_backend
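A minimal sketch of the proxy unwrapping this property performs; the two-proxy chain is illustrative, built with dogpile's ProxyBackend:

from dogpile.cache import make_region
from dogpile.cache.proxy import ProxyBackend

class LoggingProxy(ProxyBackend):
    def get(self, key):
        print('get %r via %s' % (key, type(self).__name__))
        return self.proxied.get(key)  # delegate to the wrapped backend

region = make_region().configure(
    'dogpile.cache.memory',
    wrap=[LoggingProxy, LoggingProxy],
)
# region.backend is the outermost LoggingProxy, while
# region.actual_backend follows .proxied down to the memory backend.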
def invalidate(self, hard=True): def invalidate(self, hard=True):
"""Invalidate this :class:`.CacheRegion`. """Invalidate this :class:`.CacheRegion`.
@ -723,7 +744,8 @@ class CacheRegion(object):
] ]
def get_or_create( def get_or_create(
self, key, creator, expiration_time=None, should_cache_fn=None): self, key, creator, expiration_time=None, should_cache_fn=None,
creator_args=None):
"""Return a cached value based on the given key. """Return a cached value based on the given key.
If the value does not exist or is considered to be expired If the value does not exist or is considered to be expired
@ -759,6 +781,11 @@ class CacheRegion(object):
:param creator: function which creates a new value. :param creator: function which creates a new value.
:param creator_args: optional tuple of (args, kwargs) that will be
passed to the creator function if present.
.. versionadded:: 0.7.0
:param expiration_time: optional expiration time which will override :param expiration_time: optional expiration time which will override
the expiration time already configured on this :class:`.CacheRegion` the expiration time already configured on this :class:`.CacheRegion`
if not None. To set no expiration, use the value -1. if not None. To set no expiration, use the value -1.
@ -799,7 +826,7 @@ class CacheRegion(object):
value = self.backend.get(key) value = self.backend.get(key)
if (value is NO_VALUE or value.metadata['v'] != value_version or if (value is NO_VALUE or value.metadata['v'] != value_version or
self.region_invalidator.is_hard_invalidated( self.region_invalidator.is_hard_invalidated(
value.metadata["ct"])): value.metadata["ct"])):
raise NeedRegenerationException() raise NeedRegenerationException()
ct = value.metadata["ct"] ct = value.metadata["ct"]
if self.region_invalidator.is_soft_invalidated(ct): if self.region_invalidator.is_soft_invalidated(ct):
@ -808,7 +835,10 @@ class CacheRegion(object):
return value.payload, ct return value.payload, ct
def gen_value(): def gen_value():
created_value = creator() if creator_args:
created_value = creator(*creator_args[0], **creator_args[1])
else:
created_value = creator()
value = self._value(created_value) value = self._value(created_value)
if not should_cache_fn or \ if not should_cache_fn or \
@ -831,8 +861,13 @@ class CacheRegion(object):
if self.async_creation_runner: if self.async_creation_runner:
def async_creator(mutex): def async_creator(mutex):
return self.async_creation_runner( if creator_args:
self, orig_key, creator, mutex) @wraps(creator)
def go():
return creator(*creator_args[0], **creator_args[1])
else:
go = creator
return self.async_creation_runner(self, orig_key, go, mutex)
else: else:
async_creator = None async_creator = None
@ -896,7 +931,7 @@ class CacheRegion(object):
if (value is NO_VALUE or value.metadata['v'] != value_version or if (value is NO_VALUE or value.metadata['v'] != value_version or
self.region_invalidator.is_hard_invalidated( self.region_invalidator.is_hard_invalidated(
value.metadata['v'])): value.metadata['ct'])):
# dogpile.core understands a 0 here as # dogpile.core understands a 0 here as
# "the value is not available", e.g. # "the value is not available", e.g.
# _has_value() will return False. # _has_value() will return False.
@ -1228,26 +1263,31 @@ class CacheRegion(object):
if function_key_generator is None: if function_key_generator is None:
function_key_generator = self.function_key_generator function_key_generator = self.function_key_generator
def decorator(fn): def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
key = key_generator(*arg, **kw)
timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
return self.get_or_create(key, user_func, timeout,
should_cache_fn, (arg, kw))
def cache_decorator(user_func):
if to_str is compat.string_type: if to_str is compat.string_type:
# backwards compatible # backwards compatible
key_generator = function_key_generator(namespace, fn) key_generator = function_key_generator(namespace, user_func)
else: else:
key_generator = function_key_generator( key_generator = function_key_generator(
namespace, fn, namespace, user_func,
to_str=to_str) to_str=to_str)
@wraps(fn) def refresh(*arg, **kw):
def decorate(*arg, **kw): """
Like invalidate, but regenerates the value instead
"""
key = key_generator(*arg, **kw) key = key_generator(*arg, **kw)
value = user_func(*arg, **kw)
@wraps(fn) self.set(key, value)
def creator(): return value
return fn(*arg, **kw)
timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
return self.get_or_create(key, creator, timeout,
should_cache_fn)
def invalidate(*arg, **kw): def invalidate(*arg, **kw):
key = key_generator(*arg, **kw) key = key_generator(*arg, **kw)
@ -1261,20 +1301,18 @@ class CacheRegion(object):
key = key_generator(*arg, **kw) key = key_generator(*arg, **kw)
return self.get(key) return self.get(key)
def refresh(*arg, **kw): user_func.set = set_
key = key_generator(*arg, **kw) user_func.invalidate = invalidate
value = fn(*arg, **kw) user_func.get = get
self.set(key, value) user_func.refresh = refresh
return value user_func.original = user_func
decorate.set = set_ # Use `decorate` to preserve the signature of :param:`user_func`.
decorate.invalidate = invalidate
decorate.refresh = refresh
decorate.get = get
decorate.original = fn
return decorate return decorate(user_func, partial(
return decorator get_or_create_for_user_func, key_generator))
return cache_decorator
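A usage sketch of the reworked decorator, which now preserves the wrapped function's signature via decorate() while still exposing the same helper attributes; lookup is an illustrative name:

from dogpile.cache import make_region

region = make_region().configure('dogpile.cache.memory')

@region.cache_on_arguments()
def lookup(user_id):
    return {'id': user_id}

lookup(1)             # computed once, then cached
lookup(1)             # served from the cache
lookup.invalidate(1)  # drop the cached value
lookup.refresh(1)     # regenerate and re-store it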
def cache_multi_on_arguments( def cache_multi_on_arguments(
self, namespace=None, expiration_time=None, self, namespace=None, expiration_time=None,
@ -1402,50 +1440,49 @@ class CacheRegion(object):
if function_multi_key_generator is None: if function_multi_key_generator is None:
function_multi_key_generator = self.function_multi_key_generator function_multi_key_generator = self.function_multi_key_generator
def decorator(fn): def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
key_generator = function_multi_key_generator( cache_keys = arg
namespace, fn, keys = key_generator(*arg, **kw)
to_str=to_str) key_lookup = dict(zip(keys, cache_keys))
@wraps(fn) @wraps(user_func)
def decorate(*arg, **kw): def creator(*keys_to_create):
cache_keys = arg return user_func(*[key_lookup[k] for k in keys_to_create])
keys = key_generator(*arg, **kw)
key_lookup = dict(zip(keys, cache_keys)) timeout = expiration_time() if expiration_time_is_callable \
else expiration_time
@wraps(fn)
def creator(*keys_to_create): if asdict:
return fn(*[key_lookup[k] for k in keys_to_create]) def dict_create(*keys):
d_values = creator(*keys)
timeout = expiration_time() if expiration_time_is_callable \ return [
else expiration_time d_values.get(key_lookup[k], NO_VALUE)
for k in keys]
def wrap_cache_fn(value):
if value is NO_VALUE:
return False
elif not should_cache_fn:
return True
else:
return should_cache_fn(value)
result = self.get_or_create_multi(
keys, dict_create, timeout, wrap_cache_fn)
result = dict(
(k, v) for k, v in zip(cache_keys, result)
if v is not NO_VALUE)
else:
result = self.get_or_create_multi(
keys, creator, timeout,
should_cache_fn)
if asdict: return result
def dict_create(*keys):
d_values = creator(*keys)
return [
d_values.get(key_lookup[k], NO_VALUE)
for k in keys]
def wrap_cache_fn(value):
if value is NO_VALUE:
return False
elif not should_cache_fn:
return True
else:
return should_cache_fn(value)
result = self.get_or_create_multi(
keys, dict_create, timeout, wrap_cache_fn)
result = dict(
(k, v) for k, v in zip(cache_keys, result)
if v is not NO_VALUE)
else:
result = self.get_or_create_multi(
keys, creator, timeout,
should_cache_fn)
return result def cache_decorator(user_func):
key_generator = function_multi_key_generator(
namespace, user_func,
to_str=to_str)
def invalidate(*arg): def invalidate(*arg):
keys = key_generator(*arg) keys = key_generator(*arg)
@ -1466,7 +1503,7 @@ class CacheRegion(object):
def refresh(*arg): def refresh(*arg):
keys = key_generator(*arg) keys = key_generator(*arg)
values = fn(*arg) values = user_func(*arg)
if asdict: if asdict:
self.set_multi( self.set_multi(
dict(zip(keys, [values[a] for a in arg])) dict(zip(keys, [values[a] for a in arg]))
@ -1478,13 +1515,18 @@ class CacheRegion(object):
) )
return values return values
decorate.set = set_ user_func.set = set_
decorate.invalidate = invalidate user_func.invalidate = invalidate
decorate.refresh = refresh user_func.refresh = refresh
decorate.get = get user_func.get = get
# Use `decorate` to preserve the signature of :param:`user_func`.
return decorate(user_func, partial(get_or_create_for_user_func, key_generator))
return cache_decorator
return decorate
return decorator
def make_region(*arg, **kw): def make_region(*arg, **kw):

@ -1,5 +1,4 @@
from hashlib import sha1 from hashlib import sha1
import inspect
from ..util import compat from ..util import compat
from ..util import langhelpers from ..util import langhelpers
@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type):
else: else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn) args = compat.inspect_getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls') has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_key(*args, **kw): def generate_key(*args, **kw):
@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type):
else: else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn) args = compat.inspect_getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls') has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_keys(*args, **kw): def generate_keys(*args, **kw):
@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
else: else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
argspec = inspect.getargspec(fn) argspec = compat.inspect_getargspec(fn)
default_list = list(argspec.defaults or []) default_list = list(argspec.defaults or [])
# Reverse the list, as we want to compare the argspec by negative index, # Reverse the list, as we want to compare the argspec by negative index,
# meaning default_list[0] should be args[-1], which works well with # meaning default_list[0] should be args[-1], which works well with

@ -69,11 +69,10 @@ class Lock(object):
"""Return true if the expiration time is reached, or no """Return true if the expiration time is reached, or no
value is available.""" value is available."""
return not self._has_value(createdtime) or \ return not self._has_value(createdtime) or (
( self.expiretime is not None and
self.expiretime is not None and time.time() - createdtime > self.expiretime
time.time() - createdtime > self.expiretime )
)
def _has_value(self, createdtime): def _has_value(self, createdtime):
"""Return true if the creation function has proceeded """Return true if the creation function has proceeded
@ -91,68 +90,100 @@ class Lock(object):
value = NOT_REGENERATED value = NOT_REGENERATED
createdtime = -1 createdtime = -1
generated = self._enter_create(createdtime) generated = self._enter_create(value, createdtime)
if generated is not NOT_REGENERATED: if generated is not NOT_REGENERATED:
generated, createdtime = generated generated, createdtime = generated
return generated return generated
elif value is NOT_REGENERATED: elif value is NOT_REGENERATED:
# we called upon the creator, and it said that it
# didn't regenerate. this typically means another
# thread is running the creation function, and that the
# cache should still have a value. However,
# we don't have a value at all, which is unusual since we just
# checked for it, so check again (TODO: is this a real codepath?)
try: try:
value, createdtime = value_fn() value, createdtime = value_fn()
return value return value
except NeedRegenerationException: except NeedRegenerationException:
raise Exception("Generation function should " raise Exception(
"have just been called by a concurrent " "Generation function should "
"thread.") "have just been called by a concurrent "
"thread.")
else: else:
return value return value
def _enter_create(self, createdtime): def _enter_create(self, value, createdtime):
if not self._is_expired(createdtime): if not self._is_expired(createdtime):
return NOT_REGENERATED return NOT_REGENERATED
async = False _async = False
if self._has_value(createdtime): if self._has_value(createdtime):
has_value = True
if not self.mutex.acquire(False): if not self.mutex.acquire(False):
log.debug("creation function in progress " log.debug(
"elsewhere, returning") "creation function in progress "
"elsewhere, returning")
return NOT_REGENERATED return NOT_REGENERATED
else: else:
has_value = False
log.debug("no value, waiting for create lock") log.debug("no value, waiting for create lock")
self.mutex.acquire() self.mutex.acquire()
try: try:
log.debug("value creation lock %r acquired" % self.mutex) log.debug("value creation lock %r acquired" % self.mutex)
# see if someone created the value already if not has_value:
try: # we entered without a value, or at least with "creationtime ==
value, createdtime = self.value_and_created_fn() # 0". Run the "getter" function again, to see if another
except NeedRegenerationException: # thread has already generated the value while we waited on the
pass # mutex, or if the caller is otherwise telling us there is a
else: # value already which allows us to use async regeneration. (the
if not self._is_expired(createdtime): # latter is used by the multi-key routine).
log.debug("value already present") try:
return value, createdtime value, createdtime = self.value_and_created_fn()
elif self.async_creator: except NeedRegenerationException:
log.debug("Passing creation lock to async runner") # nope, nobody created the value, we're it.
self.async_creator(self.mutex) # we must create it right now
async = True pass
return value, createdtime else:
has_value = True
log.debug("Calling creation function") # caller is telling us there is a value and that we can
created = self.creator() # use async creation if it is expired.
return created if not self._is_expired(createdtime):
# it's not expired, return it
log.debug("Concurrent thread created the value")
return value, createdtime
# otherwise it's expired, call creator again
if has_value and self.async_creator:
# we have a value we can return, safe to use async_creator
log.debug("Passing creation lock to async runner")
# so...run it!
self.async_creator(self.mutex)
_async = True
# and return the expired value for now
return value, createdtime
# it's expired, and it's our turn to create it synchronously, *or*,
# there's no value at all, and we have to create it synchronously
log.debug(
"Calling creation function for %s value",
"not-yet-present" if not has_value else
"previously expired"
)
return self.creator()
finally: finally:
if not async: if not _async:
self.mutex.release() self.mutex.release()
log.debug("Released creation lock") log.debug("Released creation lock")
def __enter__(self): def __enter__(self):
return self._enter() return self._enter()
def __exit__(self, type, value, traceback): def __exit__(self, type, value, traceback):
pass pass
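A hedged sketch of the calling convention the lock expects: value_and_created_fn raises NeedRegenerationException when nothing is cached, and creator returns a (value, createdtime) pair. The cache dict below stands in for a real backend.

import time
import threading
from dogpile import Lock, NeedRegenerationException

cache = {}

def get_value():
    if 'v' not in cache:
        raise NeedRegenerationException()
    return cache['v']  # (value, createdtime)

def gen_value():
    cache['v'] = ('payload', time.time())
    return cache['v']

with Lock(threading.Lock(), gen_value, get_value, expiretime=10) as value:
    print(value)  # 'payload'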

@ -51,11 +51,33 @@ else:
import thread # noqa import thread # noqa
if py3k:
import collections
ArgSpec = collections.namedtuple(
"ArgSpec",
["args", "varargs", "keywords", "defaults"])
from inspect import getfullargspec as inspect_getfullargspec
def inspect_getargspec(func):
return ArgSpec(
*inspect_getfullargspec(func)[0:4]
)
else:
from inspect import getargspec as inspect_getargspec # noqa
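A quick check of what the shim returns on Python 3, where the four legacy ArgSpec fields are sliced out of getfullargspec; f is illustrative:

def f(a, b=1, *args, **kw):
    pass

spec = inspect_getargspec(f)
# ArgSpec(args=['a', 'b'], varargs='args', keywords='kw', defaults=(1,))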
if py3k or jython: if py3k or jython:
import pickle import pickle
else: else:
import cPickle as pickle # noqa import cPickle as pickle # noqa
if py3k:
def read_config_file(config, fileobj):
return config.read_file(fileobj)
else:
def read_config_file(config, fileobj):
return config.readfp(fileobj)
def timedelta_total_seconds(td): def timedelta_total_seconds(td):
if py27: if py27:

@ -50,7 +50,7 @@ class NameRegistry(object):
self.creator = creator self.creator = creator
def get(self, identifier, *args, **kw): def get(self, identifier, *args, **kw):
"""Get and possibly create the value. r"""Get and possibly create the value.
:param identifier: Hash key for the value. :param identifier: Hash key for the value.
If the creation function is called, this identifier If the creation function is called, this identifier
@ -75,10 +75,12 @@ class NameRegistry(object):
if identifier in self._values: if identifier in self._values:
return self._values[identifier] return self._values[identifier]
else: else:
self._values[identifier] = value = self.creator(identifier, *args, **kw) self._values[identifier] = value = self.creator(
identifier, *args, **kw)
return value return value
except KeyError: except KeyError:
self._values[identifier] = value = self.creator(identifier, *args, **kw) self._values[identifier] = value = self.creator(
identifier, *args, **kw)
return value return value
finally: finally:
self._mutex.release() self._mutex.release()

@ -23,7 +23,7 @@ class ReadWriteMutex(object):
def __init__(self): def __init__(self):
# counts how many asynchronous methods are executing # counts how many asynchronous methods are executing
self.async = 0 self.async_ = 0
# pointer to thread that is the current sync operation # pointer to thread that is the current sync operation
self.current_sync_operation = None self.current_sync_operation = None
@ -31,7 +31,7 @@ class ReadWriteMutex(object):
# condition object to lock on # condition object to lock on
self.condition = threading.Condition(threading.Lock()) self.condition = threading.Condition(threading.Lock())
def acquire_read_lock(self, wait = True): def acquire_read_lock(self, wait=True):
"""Acquire the 'read' lock.""" """Acquire the 'read' lock."""
self.condition.acquire() self.condition.acquire()
try: try:
@ -45,7 +45,7 @@ class ReadWriteMutex(object):
if self.current_sync_operation is not None: if self.current_sync_operation is not None:
return False return False
self.async += 1 self.async_ += 1
log.debug("%s acquired read lock", self) log.debug("%s acquired read lock", self)
finally: finally:
self.condition.release() self.condition.release()
@ -57,23 +57,23 @@ class ReadWriteMutex(object):
"""Release the 'read' lock.""" """Release the 'read' lock."""
self.condition.acquire() self.condition.acquire()
try: try:
self.async -= 1 self.async_ -= 1
# check if we are the last asynchronous reader thread # check if we are the last asynchronous reader thread
# out the door. # out the door.
if self.async == 0: if self.async_ == 0:
# yes. so if a sync operation is waiting, notifyAll to wake # yes. so if a sync operation is waiting, notifyAll to wake
# it up # it up
if self.current_sync_operation is not None: if self.current_sync_operation is not None:
self.condition.notifyAll() self.condition.notifyAll()
elif self.async < 0: elif self.async_ < 0:
raise LockError("Synchronizer error - too many " raise LockError("Synchronizer error - too many "
"release_read_locks called") "release_read_locks called")
log.debug("%s released read lock", self) log.debug("%s released read lock", self)
finally: finally:
self.condition.release() self.condition.release()
def acquire_write_lock(self, wait = True): def acquire_write_lock(self, wait=True):
"""Acquire the 'write' lock.""" """Acquire the 'write' lock."""
self.condition.acquire() self.condition.acquire()
try: try:
@ -96,7 +96,7 @@ class ReadWriteMutex(object):
self.current_sync_operation = threading.currentThread() self.current_sync_operation = threading.currentThread()
# now wait again for asyncs to finish # now wait again for asyncs to finish
if self.async > 0: if self.async_ > 0:
if wait: if wait:
# wait # wait
self.condition.wait() self.condition.wait()

@ -6,8 +6,16 @@
# s/class \(\w\+\):/class \1(object):/ # s/class \(\w\+\):/class \1(object):/
# Use iterator versions of map and range: # Use iterator versions of map and range:
from itertools import imap as map try:
range = xrange from itertools import imap as map
except ImportError:
imap = map
try:
range = xrange
except NameError:
pass
# Except that xrange only supports machine integers, not longs, so... # Except that xrange only supports machine integers, not longs, so...
def long_range(start, end): def long_range(start, end):

@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
""" """
# Bazarr patch to use custom ConfigParser2: try:
from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
#try: except ImportError:
# from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
#except ImportError:
# from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
class simpleconfigparser(configparser): class simpleconfigparser(configparser):

@ -1,4 +1,4 @@
# Copyright (c) 2010-2017 Benjamin Peterson # Copyright (c) 2010-2018 Benjamin Peterson
# #
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
@ -29,7 +29,7 @@ import sys
import types import types
__author__ = "Benjamin Peterson <benjamin@python.org>" __author__ = "Benjamin Peterson <benjamin@python.org>"
__version__ = "1.11.0" __version__ = "1.12.0"
# Useful for very coarse version differentiation. # Useful for very coarse version differentiation.
@ -844,10 +844,71 @@ def add_metaclass(metaclass):
orig_vars.pop(slots_var) orig_vars.pop(slots_var)
orig_vars.pop('__dict__', None) orig_vars.pop('__dict__', None)
orig_vars.pop('__weakref__', None) orig_vars.pop('__weakref__', None)
if hasattr(cls, '__qualname__'):
orig_vars['__qualname__'] = cls.__qualname__
return metaclass(cls.__name__, cls.__bases__, orig_vars) return metaclass(cls.__name__, cls.__bases__, orig_vars)
return wrapper return wrapper
def ensure_binary(s, encoding='utf-8', errors='strict'):
"""Coerce **s** to six.binary_type.
For Python 2:
- `unicode` -> encoded to `str`
- `str` -> `str`
For Python 3:
- `str` -> encoded to `bytes`
- `bytes` -> `bytes`
"""
if isinstance(s, text_type):
return s.encode(encoding, errors)
elif isinstance(s, binary_type):
return s
else:
raise TypeError("not expecting type '%s'" % type(s))
def ensure_str(s, encoding='utf-8', errors='strict'):
"""Coerce *s* to `str`.
For Python 2:
- `unicode` -> encoded to `str`
- `str` -> `str`
For Python 3:
- `str` -> `str`
- `bytes` -> decoded to `str`
"""
if not isinstance(s, (text_type, binary_type)):
raise TypeError("not expecting type '%s'" % type(s))
if PY2 and isinstance(s, text_type):
s = s.encode(encoding, errors)
elif PY3 and isinstance(s, binary_type):
s = s.decode(encoding, errors)
return s
def ensure_text(s, encoding='utf-8', errors='strict'):
"""Coerce *s* to six.text_type.
For Python 2:
- `unicode` -> `unicode`
- `str` -> `unicode`
For Python 3:
- `str` -> `str`
- `bytes` -> decoded to `str`
"""
if isinstance(s, binary_type):
return s.decode(encoding, errors)
elif isinstance(s, text_type):
return s
else:
raise TypeError("not expecting type '%s'" % type(s))
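A quick illustration of the three coercions defined above, as they behave on Python 3 (text_type is str, binary_type is bytes):

from six import ensure_binary, ensure_str, ensure_text

ensure_binary(u'caf\xe9')      # b'caf\xc3\xa9'
ensure_str(b'caf\xc3\xa9')     # 'café' on Python 3, kept as str on Python 2
ensure_text(b'caf\xc3\xa9')    # u'café'
ensure_text(u'caf\xe9')        # returned unchanged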
def python_2_unicode_compatible(klass): def python_2_unicode_compatible(klass):
""" """
A decorator that defines __unicode__ and __str__ methods under Python 2. A decorator that defines __unicode__ and __str__ methods under Python 2.

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__title__ = 'subliminal' __title__ = 'subliminal'
__version__ = '2.1.0.dev' __version__ = '2.0.5'
__short_version__ = '.'.join(__version__.split('.')[:2]) __short_version__ = '.'.join(__version__.split('.')[:2])
__author__ = 'Antoine Bertin' __author__ = 'Antoine Bertin'
__license__ = 'MIT' __license__ = 'MIT'

@ -219,12 +219,13 @@ config_file = 'config.ini'
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.') @click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', @click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
help='OpenSubtitles configuration.') help='OpenSubtitles configuration.')
@click.option('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.')
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir, @click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
show_default=True, expose_value=True, help='Path to the cache directory.') show_default=True, expose_value=True, help='Path to the cache directory.')
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.') @click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
@click.version_option(__version__) @click.version_option(__version__)
@click.pass_context @click.pass_context
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug): def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug):
"""Subtitles, faster than your thoughts.""" """Subtitles, faster than your thoughts."""
# create cache directory # create cache directory
try: try:
@ -252,6 +253,8 @@ def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]} ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
if opensubtitles: if opensubtitles:
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]} ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
if subscenter:
ctx.obj['provider_configs']['subscenter'] = {'username': subscenter[0], 'password': subscenter[1]}
@subliminal.command() @subliminal.command()
@ -1,38 +1,19 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import platform
is_windows_special_path = False
if platform.system() == "Windows":
try:
__file__.decode("ascii")
except UnicodeDecodeError:
is_windows_special_path = True
if not is_windows_special_path:
from concurrent.futures import ThreadPoolExecutor
else:
ThreadPoolExecutor = object
from datetime import datetime from datetime import datetime
import io import io
import itertools import itertools
import logging import logging
import operator import operator
import os import os.path
import socket import socket
from babelfish import Language, LanguageReverseError from babelfish import Language, LanguageReverseError
from guessit import guessit from guessit import guessit
from six.moves.xmlrpc_client import ProtocolError from rarfile import NotRarFile, RarCannotExec, RarFile
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
from zipfile import BadZipfile
from ssl import SSLError
import requests import requests
from .exceptions import ServiceUnavailable
from .extensions import provider_manager, refiner_manager from .extensions import provider_manager, refiner_manager
from .score import compute_score as default_compute_score from .score import compute_score as default_compute_score
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
@ -98,18 +79,6 @@ class ProviderPool(object):
self.initialized_providers[name].terminate() self.initialized_providers[name].terminate()
except (requests.Timeout, socket.timeout): except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out, improperly terminated', name) logger.error('Provider %r timed out, improperly terminated', name)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, improperly terminated', name)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
except: except:
logger.exception('Provider %r terminated unexpectedly', name) logger.exception('Provider %r terminated unexpectedly', name)
@ -149,18 +118,6 @@ class ProviderPool(object):
return self[provider].list_subtitles(video, provider_languages) return self[provider].list_subtitles(video, provider_languages)
except (requests.Timeout, socket.timeout): except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out', provider) logger.error('Provider %r timed out', provider)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable', provider)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r http error %r', provider, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r SSL error %r', provider, e.args[0])
except: except:
logger.exception('Unexpected error in provider %r', provider) logger.exception('Unexpected error in provider %r', provider)
@ -216,28 +173,6 @@ class ProviderPool(object):
logger.error('Provider %r timed out, discarding it', subtitle.provider_name) logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name) self.discarded_providers.add(subtitle.provider_name)
return False return False
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
e.response.status_code)
self.discarded_providers.add(subtitle.provider_name)
return False
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
self.discarded_providers.add(subtitle.provider_name)
return False
except (BadRarFile, BadZipfile):
logger.error('Bad archive for %r', subtitle)
return False
except: except:
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name) logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name) self.discarded_providers.add(subtitle.provider_name)
@ -557,15 +492,9 @@ def scan_videos(path, age=None, archives=True):
continue continue
# skip old files # skip old files
try: if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age:
file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath)) logger.debug('Skipping old file %r in %r', filename, dirpath)
except ValueError:
logger.warning('Could not get age of file %r in %r', filename, dirpath)
continue continue
else:
if age and datetime.utcnow() - file_age > age:
logger.debug('Skipping old file %r in %r', filename, dirpath)
continue
# scan # scan
if filename.endswith(VIDEO_EXTENSIONS): # video if filename.endswith(VIDEO_EXTENSIONS): # video
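The try/except being dropped above guarded against filesystems that report modification times outside datetime's range; a minimal sketch of the age filter itself, run against this script's own file:

import os
from datetime import datetime, timedelta

age = timedelta(weeks=2)
try:
    file_age = datetime.utcfromtimestamp(os.path.getmtime(__file__))
except ValueError:
    file_age = None  # bogus mtime, e.g. outside datetime's supported range
if file_age is not None and datetime.utcnow() - file_age > age:
    print('skipping old file')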
@ -612,8 +541,7 @@ def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
try: try:
refiner_manager[refiner].plugin(video, **kwargs) refiner_manager[refiner].plugin(video, **kwargs)
except: except:
logger.error('Failed to refine video %r', video.name) logger.exception('Failed to refine video')
logger.debug('Refiner exception:', exc_info=True)
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs): def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
@ -19,8 +19,8 @@ class AuthenticationError(ProviderError):
pass pass
class ServiceUnavailable(ProviderError): class TooManyRequests(ProviderError):
"""Exception raised when status is '503 Service Unavailable'.""" """Exception raised by providers when too many requests are made."""
pass pass
@ -29,9 +29,9 @@ class RegistrableExtensionManager(ExtensionManager):
super(RegistrableExtensionManager, self).__init__(namespace, **kwargs) super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)
def list_entry_points(self): def _find_entry_points(self, namespace):
# copy of default extensions # copy of default extensions
eps = list(super(RegistrableExtensionManager, self).list_entry_points()) eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace))
# internal extensions # internal extensions
for iep in self.internal_extensions: for iep in self.internal_extensions:
@ -93,6 +93,7 @@ provider_manager = RegistrableExtensionManager('subliminal.providers', [
'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider', 'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider', 'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
'shooter = subliminal.providers.shooter:ShooterProvider', 'shooter = subliminal.providers.shooter:ShooterProvider',
'subscenter = subliminal.providers.subscenter:SubsCenterProvider',
'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider', 'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider' 'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
]) ])
@ -68,9 +68,6 @@ class Provider(object):
#: Required hash, if any #: Required hash, if any
required_hash = None required_hash = None
#: Subtitle class to use
subtitle_class = None
def __enter__(self): def __enter__(self):
self.initialize() self.initialize()
return self return self
@ -9,7 +9,7 @@ from requests import Session
from . import ParserBeautifulSoup, Provider from . import ParserBeautifulSoup, Provider
from .. import __short_version__ from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests
from ..score import get_equivalent_release_groups from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group from ..utils import sanitize, sanitize_release_group
@ -19,11 +19,8 @@ logger = logging.getLogger(__name__)
language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter') language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
# Series cell matching regex
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)
#: Series header parsing regex #: Series header parsing regex
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$') series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')
class Addic7edSubtitle(Subtitle): class Addic7edSubtitle(Subtitle):
@ -32,7 +29,7 @@ class Addic7edSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version, def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
download_link): download_link):
super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link) super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link)
self.series = series self.series = series
self.season = season self.season = season
self.episode = episode self.episode = episode
@ -48,9 +45,8 @@ class Addic7edSubtitle(Subtitle):
def get_matches(self, video): def get_matches(self, video):
matches = set() matches = set()
# series name # series
if video.series and sanitize(self.series) in ( if video.series and sanitize(self.series) == sanitize(video.series):
sanitize(name) for name in [video.series] + video.alternative_series):
matches.add('series') matches.add('series')
# season # season
if video.season and self.season == video.season: if video.season and self.season == video.season:
@ -58,7 +54,7 @@ class Addic7edSubtitle(Subtitle):
# episode # episode
if video.episode and self.episode == video.episode: if video.episode and self.episode == video.episode:
matches.add('episode') matches.add('episode')
# title of the episode # title
if video.title and sanitize(self.title) == sanitize(video.title): if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title') matches.add('title')
# year # year
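The left-hand variant accepts any of the video's alternative series names, not just the primary one; a sketch of that comparison, assuming subliminal 2.1's subliminal.utils.sanitize:

from subliminal.utils import sanitize

def series_matches(subtitle_series, video_series, alternative_series):
    # compare against the main title plus any known aliases
    candidates = {sanitize(name) for name in [video_series] + alternative_series}
    return sanitize(subtitle_series) in candidates

print(series_matches('The Office', 'The Office', ['The Office (US)']))  # True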
@ -90,23 +86,21 @@ class Addic7edProvider(Provider):
]} ]}
video_types = (Episode,) video_types = (Episode,)
server_url = 'http://www.addic7ed.com/' server_url = 'http://www.addic7ed.com/'
subtitle_class = Addic7edSubtitle
def __init__(self, username=None, password=None): def __init__(self, username=None, password=None):
if any((username, password)) and not all((username, password)): if username is not None and password is None or username is None and password is not None:
raise ConfigurationError('Username and password must be specified') raise ConfigurationError('Username and password must be specified')
self.username = username self.username = username
self.password = password self.password = password
self.logged_in = False self.logged_in = False
self.session = None
def initialize(self): def initialize(self):
self.session = Session() self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login # login
if self.username and self.password: if self.username is not None and self.password is not None:
logger.info('Logging in') logger.info('Logging in')
data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10) r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)
@ -140,16 +134,7 @@ class Addic7edProvider(Provider):
logger.info('Getting show ids') logger.info('Getting show ids')
r = self.session.get(self.server_url + 'shows.php', timeout=10) r = self.session.get(self.server_url + 'shows.php', timeout=10)
r.raise_for_status() r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# LXML parser seems to fail when parsing Addic7ed.com HTML markup.
# Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
# Assuming the site's markup is bad, and stripping it down to only contain what's needed.
show_cells = re.findall(show_cells_re, r.content)
if show_cells:
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
else:
# If RegEx fails, fall back to original r.content and use 'html.parser'
soup = ParserBeautifulSoup(r.content, ['html.parser'])
# populate the show ids # populate the show ids
show_ids = {} show_ids = {}
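A rough sketch of the regex pre-filtering removed here, with subliminal's ParserBeautifulSoup swapped for plain bs4 to keep it self-contained:

import re
from bs4 import BeautifulSoup

SHOW_CELLS_RE = re.compile(b'<td class="version">.*?</td>', re.DOTALL)

def parse_shows_page(content):
    # lxml trips over addic7ed's markup, so keep only the cells that matter
    cells = SHOW_CELLS_RE.findall(content)
    if cells:
        return BeautifulSoup(b''.join(cells), 'lxml')
    # if the regex matches nothing, fall back to the tolerant html.parser
    return BeautifulSoup(content, 'html.parser')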
@ -181,6 +166,8 @@ class Addic7edProvider(Provider):
logger.info('Searching show ids with %r', params) logger.info('Searching show ids with %r', params)
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10) r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
r.raise_for_status() r.raise_for_status()
if r.status_code == 304:
raise TooManyRequests()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# get the suggestion # get the suggestion
@ -231,23 +218,24 @@ class Addic7edProvider(Provider):
# search as last resort # search as last resort
if not show_id: if not show_id:
logger.warning('Series %s not found in show ids', series) logger.warning('Series not found in show ids')
show_id = self._search_show_id(series) show_id = self._search_show_id(series)
return show_id return show_id
def query(self, show_id, series, season, year=None, country=None): def query(self, series, season, year=None, country=None):
# get the show id
show_id = self.get_show_id(series, year, country)
if show_id is None:
logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
return []
# get the page of the season of the show # get the page of the season of the show
logger.info('Getting the page of show id %d, season %d', show_id, season) logger.info('Getting the page of show id %d, season %d', show_id, season)
r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10) r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
r.raise_for_status() r.raise_for_status()
if r.status_code == 304:
if not r.content: raise TooManyRequests()
# Provider returns a status of 304 Not Modified with empty content
# raise_for_status won't raise an exception for that status code
logger.debug('No data returned from provider')
return []
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# loop over subtitle rows # loop over subtitle rows
@ -274,32 +262,16 @@ class Addic7edProvider(Provider):
version = cells[4].text version = cells[4].text
download_link = cells[9].a['href'][1:] download_link = cells[9].a['href'][1:]
subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year, subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year,
version, download_link) version, download_link)
logger.debug('Found subtitle %r', subtitle) logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle) subtitles.append(subtitle)
return subtitles return subtitles
def list_subtitles(self, video, languages): def list_subtitles(self, video, languages):
# lookup show_id return [s for s in self.query(video.series, video.season, video.year)
titles = [video.series] + video.alternative_series if s.language in languages and s.episode == video.episode]
show_id = None
for title in titles:
show_id = self.get_show_id(title, video.year)
if show_id is not None:
break
# query for subtitles with the show_id
if show_id is not None:
subtitles = [s for s in self.query(show_id, title, video.season, video.year)
if s.language in languages and s.episode == video.episode]
if subtitles:
return subtitles
else:
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
return []
def download_subtitle(self, subtitle): def download_subtitle(self, subtitle):
# download the subtitle # download the subtitle
@ -308,12 +280,6 @@ class Addic7edProvider(Provider):
timeout=10) timeout=10)
r.raise_for_status() r.raise_for_status()
if not r.content:
# Provider returns a status of 304 Not Modified with empty content
# raise_for_status won't raise an exception for that status code
logger.debug('Unable to download subtitle. No data returned from provider')
return
# detect download limit exceeded # detect download limit exceeded
if r.headers['Content-Type'] == 'text/html': if r.headers['Content-Type'] == 'text/html':
raise DownloadLimitExceeded raise DownloadLimitExceeded
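The new TooManyRequests path replaces the old empty-content check; a minimal sketch of the pattern, assuming a requests Session and the TooManyRequests class added in exceptions.py above:

from requests import Session
from subliminal.exceptions import TooManyRequests

def fetch(url):
    r = Session().get(url, timeout=10)
    r.raise_for_status()       # 304 is not an error code, so this does not raise
    if r.status_code == 304:   # addic7ed answers 304 with an empty body when throttling
        raise TooManyRequests()
    return r.content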
@ -18,7 +18,7 @@ from zipfile import ZipFile, is_zipfile
from . import ParserBeautifulSoup, Provider from . import ParserBeautifulSoup, Provider
from .. import __short_version__ from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
from ..video import Episode, Movie from ..video import Episode, Movie
@ -44,11 +44,8 @@ rating_re = re.compile(r'nota (?P<rating>\d+)')
#: Timestamp parsing regex #: Timestamp parsing regex
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)') timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
#: Title with year/country regex
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
#: Cache key for releases #: Cache key for releases
releases_key = __name__ + ':releases|{archive_id}|{archive_name}' releases_key = __name__ + ':releases|{archive_id}'
class LegendasTVArchive(object): class LegendasTVArchive(object):
@ -63,8 +60,8 @@ class LegendasTVArchive(object):
:param int rating: rating (0-10). :param int rating: rating (0-10).
:param timestamp: timestamp. :param timestamp: timestamp.
:type timestamp: datetime.datetime :type timestamp: datetime.datetime
"""
"""
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None): def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
#: Identifier #: Identifier
self.id = id self.id = id
@ -99,11 +96,10 @@ class LegendasTVArchive(object):
class LegendasTVSubtitle(Subtitle): class LegendasTVSubtitle(Subtitle):
"""LegendasTV Subtitle.""" """LegendasTV Subtitle."""
provider_name = 'legendastv' provider_name = 'legendastv'
def __init__(self, language, type, title, year, imdb_id, season, archive, name): def __init__(self, language, type, title, year, imdb_id, season, archive, name):
super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link) super(LegendasTVSubtitle, self).__init__(language, archive.link)
self.type = type self.type = type
self.title = title self.title = title
self.year = year self.year = year
@ -122,12 +118,11 @@ class LegendasTVSubtitle(Subtitle):
# episode # episode
if isinstance(video, Episode) and self.type == 'episode': if isinstance(video, Episode) and self.type == 'episode':
# series # series
if video.series and (sanitize(self.title) in ( if video.series and sanitize(self.title) == sanitize(video.series):
sanitize(name) for name in [video.series] + video.alternative_series)):
matches.add('series') matches.add('series')
# year # year (year is based on season air date hence the adjustment)
if video.original_series and self.year is None or video.year and video.year == self.year: if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1:
matches.add('year') matches.add('year')
# imdb_id # imdb_id
@ -137,8 +132,7 @@ class LegendasTVSubtitle(Subtitle):
# movie # movie
elif isinstance(video, Movie) and self.type == 'movie': elif isinstance(video, Movie) and self.type == 'movie':
# title # title
if video.title and (sanitize(self.title) in ( if video.title and sanitize(self.title) == sanitize(video.title):
sanitize(name) for name in [video.title] + video.alternative_titles)):
matches.add('title') matches.add('title')
# year # year
@ -149,6 +143,9 @@ class LegendasTVSubtitle(Subtitle):
if video.imdb_id and self.imdb_id == video.imdb_id: if video.imdb_id and self.imdb_id == video.imdb_id:
matches.add('imdb_id') matches.add('imdb_id')
# archive name
matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type}))
# name # name
matches |= guess_matches(video, guessit(self.name, {'type': self.type})) matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
@ -160,38 +157,29 @@ class LegendasTVProvider(Provider):
:param str username: username. :param str username: username.
:param str password: password. :param str password: password.
"""
"""
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes} languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
server_url = 'http://legendas.tv/' server_url = 'http://legendas.tv/'
subtitle_class = LegendasTVSubtitle
def __init__(self, username=None, password=None): def __init__(self, username=None, password=None):
if username and not password or not username and password:
# Provider needs UNRAR installed. If not available raise ConfigurationError
try:
rarfile.custom_check(rarfile.UNRAR_TOOL)
except rarfile.RarExecError:
raise ConfigurationError('UNRAR tool not available')
if any((username, password)) and not all((username, password)):
raise ConfigurationError('Username and password must be specified') raise ConfigurationError('Username and password must be specified')
self.username = username self.username = username
self.password = password self.password = password
self.logged_in = False self.logged_in = False
self.session = None
def initialize(self): def initialize(self):
self.session = Session() self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login # login
if self.username and self.password: if self.username is not None and self.password is not None:
logger.info('Logging in') logger.info('Logging in')
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password} data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10) r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
raise_for_status(r) r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['html.parser']) soup = ParserBeautifulSoup(r.content, ['html.parser'])
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')): if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
@ -205,174 +193,94 @@ class LegendasTVProvider(Provider):
if self.logged_in: if self.logged_in:
logger.info('Logging out') logger.info('Logging out')
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10) r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
raise_for_status(r) r.raise_for_status()
logger.debug('Logged out') logger.debug('Logged out')
self.logged_in = False self.logged_in = False
self.session.close() self.session.close()
@staticmethod @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def is_valid_title(title, title_id, sanitized_title, season, year): def search_titles(self, title):
"""Check if is a valid title."""
sanitized_result = sanitize(title['title'])
if sanitized_result != sanitized_title:
logger.debug("Mismatched title, discarding title %d (%s)",
title_id, sanitized_result)
return
# episode type
if season:
# discard mismatches on type
if title['type'] != 'episode':
logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on season
if 'season' not in title or title['season'] != season:
logger.debug('Mismatched season %s, discarding title %d (%s)',
title.get('season'), title_id, sanitized_result)
return
# movie type
else:
# discard mismatches on type
if title['type'] != 'movie':
logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on year
if year is not None and 'year' in title and title['year'] != year:
logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
return
return True
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
def search_titles(self, title, season, title_year):
"""Search for titles matching the `title`. """Search for titles matching the `title`.
For episodes, each season has it own title
:param str title: the title to search for. :param str title: the title to search for.
:param int season: season of the title
:param int title_year: year of the title
:return: found titles. :return: found titles.
:rtype: dict :rtype: dict
"""
titles = {}
sanitized_titles = [sanitize(title)]
ignore_characters = {'\'', '.'}
if any(c in title for c in ignore_characters):
sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
for sanitized_title in sanitized_titles:
# make the query
if season:
logger.info('Searching episode title %r for season %r', sanitized_title, season)
else:
logger.info('Searching movie title %r', sanitized_title)
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
raise_for_status(r)
results = json.loads(r.text)
# loop over results
for result in results:
source = result['_source']
# extract id
title_id = int(source['id_filme'])
# extract type
title = {'type': type_map[source['tipo']]}
# extract title, year and country """
name, year, country = title_re.match(source['dsc_nome']).groups() # make the query
title['title'] = name logger.info('Searching title %r', title)
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10)
r.raise_for_status()
results = json.loads(r.text)
# extract imdb_id # loop over results
if source['id_imdb'] != '0': titles = {}
if not source['id_imdb'].startswith('tt'): for result in results:
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7) source = result['_source']
# extract id
title_id = int(source['id_filme'])
# extract type and title
title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']}
# extract year
if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
title['year'] = int(source['dsc_data_lancamento'])
# extract imdb_id
if source['id_imdb'] != '0':
if not source['id_imdb'].startswith('tt'):
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
else:
title['imdb_id'] = source['id_imdb']
# extract season
if title['type'] == 'episode':
if source['temporada'] and source['temporada'].isdigit():
title['season'] = int(source['temporada'])
else:
match = season_re.search(source['dsc_nome_br'])
if match:
title['season'] = int(match.group('season'))
else: else:
title['imdb_id'] = source['id_imdb'] logger.warning('No season detected for title %d', title_id)
# extract season # add title
if title['type'] == 'episode': titles[title_id] = title
if source['temporada'] and source['temporada'].isdigit():
title['season'] = int(source['temporada']) logger.debug('Found %d titles', len(titles))
else:
match = season_re.search(source['dsc_nome_br'])
if match:
title['season'] = int(match.group('season'))
else:
logger.debug('No season detected for title %d (%s)', title_id, name)
# extract year
if year:
title['year'] = int(year)
elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
# year is based on season air date hence the adjustment
title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1
# add title only if it is valid
# Check against title without ignored chars
if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
titles[title_id] = title
logger.debug('Found %d titles', len(titles))
return titles return titles
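The left-hand decorator passes should_cache_fn so that empty results are never cached; a minimal dogpile.cache sketch of that option:

from dogpile.cache import make_region

region = make_region().configure('dogpile.cache.memory')

@region.cache_on_arguments(expiration_time=300, should_cache_fn=lambda value: bool(value))
def search_titles(title):
    # hypothetical lookup; a falsy result (nothing found) is retried on the next call
    return {}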
@region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds()) @region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
def get_archives(self, title_id, language_code, title_type, season, episode): def get_archives(self, title_id, language_code):
"""Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`. """Get the archive list from a given `title_id` and `language_code`.
:param int title_id: title id. :param int title_id: title id.
:param int language_code: language code. :param int language_code: language code.
:param str title_type: episode or movie
:param int season: season
:param int episode: episode
:return: the archives. :return: the archives.
:rtype: list of :class:`LegendasTVArchive` :rtype: list of :class:`LegendasTVArchive`
""" """
logger.info('Getting archives for title %d and language %d', title_id, language_code)
archives = [] archives = []
page = 0 page = 1
while True: while True:
# get the archive page # get the archive page
url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format( url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format(
language=language_code, page=page, title=title_id) title=title_id, language=language_code, page=page)
r = self.session.get(url) r = self.session.get(url)
raise_for_status(r) r.raise_for_status()
# parse the results # parse the results
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
for archive_soup in soup.select('div.list_element > article > div > div.f_left'): for archive_soup in soup.select('div.list_element > article > div'):
# create archive # create archive
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text,
archive_soup.a.text, 'pack' in archive_soup['class'], 'destaque' in archive_soup['class'],
'pack' in archive_soup.parent['class'],
'destaque' in archive_soup.parent['class'],
self.server_url + archive_soup.a['href'][1:]) self.server_url + archive_soup.a['href'][1:])
# clean name of path separators and pack flags
clean_name = archive.name.replace('/', '-')
if archive.pack and clean_name.startswith('(p)'):
clean_name = clean_name[3:]
# guess from name
guess = guessit(clean_name, {'type': title_type})
# episode
if season and episode:
# discard mismatches on episode in non-pack archives
# Guessit may return int for single episode or list for multi-episode
# Check if archive name has multiple episodes releases on it
if not archive.pack and 'episode' in guess:
wanted_episode = set(episode) if isinstance(episode, list) else {episode}
archive_episode = guess['episode'] if isinstance(guess['episode'], list) else {guess['episode']}
if not wanted_episode.intersection(archive_episode):
logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
continue
# extract text containing downloads, rating and timestamp # extract text containing downloads, rating and timestamp
data_text = archive_soup.find('p', class_='data').text data_text = archive_soup.find('p', class_='data').text
@ -392,8 +300,6 @@ class LegendasTVProvider(Provider):
raise ProviderError('Archive timestamp is in the future') raise ProviderError('Archive timestamp is in the future')
# add archive # add archive
logger.info('Found archive for title %d and language %d at page %s: %s',
title_id, language_code, page, archive)
archives.append(archive) archives.append(archive)
# stop on last page # stop on last page
@ -416,7 +322,7 @@ class LegendasTVProvider(Provider):
""" """
logger.info('Downloading archive %s', archive.id) logger.info('Downloading archive %s', archive.id)
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id)) r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
raise_for_status(r) r.raise_for_status()
# open the archive # open the archive
archive_stream = io.BytesIO(r.content) archive_stream = io.BytesIO(r.content)
@ -431,26 +337,60 @@ class LegendasTVProvider(Provider):
def query(self, language, title, season=None, episode=None, year=None): def query(self, language, title, season=None, episode=None, year=None):
# search for titles # search for titles
titles = self.search_titles(title, season, year) titles = self.search_titles(sanitize(title))
# search for titles with the quote or dot character
ignore_characters = {'\'', '.'}
if any(c in title for c in ignore_characters):
titles.update(self.search_titles(sanitize(title, ignore_characters=ignore_characters)))
subtitles = [] subtitles = []
# iterate over titles # iterate over titles
for title_id, t in titles.items(): for title_id, t in titles.items():
# discard mismatches on title
if sanitize(t['title']) != sanitize(title):
continue
# episode
if season and episode:
# discard mismatches on type
if t['type'] != 'episode':
continue
# discard mismatches on season
if 'season' not in t or t['season'] != season:
continue
# movie
else:
# discard mismatches on type
if t['type'] != 'movie':
continue
logger.info('Getting archives for title %d and language %d', title_id, language.legendastv) # discard mismatches on year
archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode) if year is not None and 'year' in t and t['year'] != year:
if not archives: continue
logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
# iterate over title's archives # iterate over title's archives
for a in archives: for a in self.get_archives(title_id, language.legendastv):
# clean name of path separators and pack flags
clean_name = a.name.replace('/', '-')
if a.pack and clean_name.startswith('(p)'):
clean_name = clean_name[3:]
# guess from name
guess = guessit(clean_name, {'type': t['type']})
# episode
if season and episode:
# discard mismatches on episode in non-pack archives
if not a.pack and 'episode' in guess and guess['episode'] != episode:
continue
# compute an expiration time based on the archive timestamp # compute an expiration time based on the archive timestamp
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds() expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
# attempt to get the releases from the cache # attempt to get the releases from the cache
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name) releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time)
releases = region.get(cache_key, expiration_time=expiration_time)
# the releases are not in cache or cache is expired # the releases are not in cache or cache is expired
if releases == NO_VALUE: if releases == NO_VALUE:
@ -477,12 +417,12 @@ class LegendasTVProvider(Provider):
releases.append(name) releases.append(name)
# cache the releases # cache the releases
region.set(cache_key, releases) region.set(releases_key.format(archive_id=a.id), releases)
# iterate over releases # iterate over releases
for r in releases: for r in releases:
subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'), subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
t.get('season'), a, r) t.get('season'), a, r)
logger.debug('Found subtitle %r', subtitle) logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle) subtitles.append(subtitle)
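The archive-age expiration on the left makes a cached entry valid only if it was written after the archive's last update; a sketch of the idea with dogpile's low-level get/set (function and parameter names are illustrative):

from datetime import datetime

import pytz
from dogpile.cache.api import NO_VALUE

def cached_releases(region, key, archive, recompute):
    # entries older than the archive itself are considered stale
    max_age = (datetime.utcnow().replace(tzinfo=pytz.utc) - archive.timestamp).total_seconds()
    releases = region.get(key, expiration_time=max_age)
    if releases == NO_VALUE:
        releases = recompute(archive)
        region.set(key, releases)
    return releases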
@ -491,19 +431,13 @@ class LegendasTVProvider(Provider):
def list_subtitles(self, video, languages): def list_subtitles(self, video, languages):
season = episode = None season = episode = None
if isinstance(video, Episode): if isinstance(video, Episode):
titles = [video.series] + video.alternative_series title = video.series
season = video.season season = video.season
episode = video.episode episode = video.episode
else: else:
titles = [video.title] + video.alternative_titles title = video.title
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
return [] return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)]
def download_subtitle(self, subtitle): def download_subtitle(self, subtitle):
# download archive in case we previously hit the releases cache and didn't download it # download archive in case we previously hit the releases cache and didn't download it
@ -512,11 +446,3 @@ class LegendasTVProvider(Provider):
# extract subtitle's content # extract subtitle's content
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name)) subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
def raise_for_status(r):
# When the site is under maintenance it still returns http status code 200.
if 'Em breve estaremos de volta' in r.text:
raise ServiceUnavailable
else:
r.raise_for_status()
@ -42,7 +42,6 @@ class NapiProjektSubtitle(Subtitle):
def __init__(self, language, hash): def __init__(self, language, hash):
super(NapiProjektSubtitle, self).__init__(language) super(NapiProjektSubtitle, self).__init__(language)
self.hash = hash self.hash = hash
self.content = None
@property @property
def id(self): def id(self):
@ -63,10 +62,6 @@ class NapiProjektProvider(Provider):
languages = {Language.fromalpha2(l) for l in ['pl']} languages = {Language.fromalpha2(l) for l in ['pl']}
required_hash = 'napiprojekt' required_hash = 'napiprojekt'
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php' server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
subtitle_class = NapiProjektSubtitle
def __init__(self):
self.session = None
def initialize(self): def initialize(self):
self.session = Session() self.session = Session()
@ -86,16 +81,16 @@ class NapiProjektProvider(Provider):
'f': hash, 'f': hash,
't': get_subhash(hash)} 't': get_subhash(hash)}
logger.info('Searching subtitle %r', params) logger.info('Searching subtitle %r', params)
r = self.session.get(self.server_url, params=params, timeout=10) response = self.session.get(self.server_url, params=params, timeout=10)
r.raise_for_status() response.raise_for_status()
# handle subtitles not found and errors # handle subtitles not found and errors
if r.content[:4] == b'NPc0': if response.content[:4] == b'NPc0':
logger.debug('No subtitles found') logger.debug('No subtitles found')
return None return None
subtitle = self.subtitle_class(language, hash) subtitle = NapiProjektSubtitle(language, hash)
subtitle.content = r.content subtitle.content = response.content
logger.debug('Found subtitle %r', subtitle) logger.debug('Found subtitle %r', subtitle)
return subtitle return subtitle
@ -11,8 +11,7 @@ from six.moves.xmlrpc_client import ServerProxy
from . import Provider, TimeoutSafeTransport from . import Provider, TimeoutSafeTransport
from .. import __short_version__ from .. import __short_version__
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError, from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError
ServiceUnavailable)
from ..subtitle import Subtitle, fix_line_ending, guess_matches from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize from ..utils import sanitize
from ..video import Episode, Movie from ..video import Episode, Movie
@ -27,8 +26,7 @@ class OpenSubtitlesSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name, def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding): movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding)
page_link=page_link, encoding=encoding)
self.subtitle_id = subtitle_id self.subtitle_id = subtitle_id
self.matched_by = matched_by self.matched_by = matched_by
self.movie_kind = movie_kind self.movie_kind = movie_kind
@ -60,8 +58,7 @@ class OpenSubtitlesSubtitle(Subtitle):
if isinstance(video, Episode) and self.movie_kind == 'episode': if isinstance(video, Episode) and self.movie_kind == 'episode':
# tag match, assume series, year, season and episode matches # tag match, assume series, year, season and episode matches
if self.matched_by == 'tag': if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id: matches |= {'series', 'year', 'season', 'episode'}
matches |= {'series', 'year', 'season', 'episode'}
# series # series
if video.series and sanitize(self.series_name) == sanitize(video.series): if video.series and sanitize(self.series_name) == sanitize(video.series):
matches.add('series') matches.add('series')
@ -90,8 +87,7 @@ class OpenSubtitlesSubtitle(Subtitle):
elif isinstance(video, Movie) and self.movie_kind == 'movie': elif isinstance(video, Movie) and self.movie_kind == 'movie':
# tag match, assume title and year matches # tag match, assume title and year matches
if self.matched_by == 'tag': if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id: matches |= {'title', 'year'}
matches |= {'title', 'year'}
# title # title
if video.title and sanitize(self.movie_name) == sanitize(video.title): if video.title and sanitize(self.movie_name) == sanitize(video.title):
matches.add('title') matches.add('title')
@ -126,11 +122,10 @@ class OpenSubtitlesProvider(Provider):
""" """
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes} languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
subtitle_class = OpenSubtitlesSubtitle
def __init__(self, username=None, password=None): def __init__(self, username=None, password=None):
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10)) self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
if any((username, password)) and not all((username, password)): if username and not password or not username and password:
raise ConfigurationError('Username and password must be specified') raise ConfigurationError('Username and password must be specified')
# None values not allowed for logging in, so replace it by '' # None values not allowed for logging in, so replace it by ''
self.username = username or '' self.username = username or ''
@ -161,10 +156,7 @@ class OpenSubtitlesProvider(Provider):
if hash and size: if hash and size:
criteria.append({'moviehash': hash, 'moviebytesize': str(size)}) criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
if imdb_id: if imdb_id:
if season and episode: criteria.append({'imdbid': imdb_id[2:]})
criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
else:
criteria.append({'imdbid': imdb_id[2:]})
if tag: if tag:
criteria.append({'tag': tag}) criteria.append({'tag': tag})
if query and season and episode: if query and season and episode:
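Illustrative output of the left-hand (season- and episode-aware) imdb criterion, with hypothetical values:

imdb_id, season, episode = 'tt0944947', 2, 5  # hypothetical identifiers
criteria = []
if imdb_id:
    if season and episode:
        criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
    else:
        criteria.append({'imdbid': imdb_id[2:]})
print(criteria)  # [{'imdbid': '0944947', 'season': 2, 'episode': 5}]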
@ -207,9 +199,9 @@ class OpenSubtitlesProvider(Provider):
filename = subtitle_item['SubFileName'] filename = subtitle_item['SubFileName']
encoding = subtitle_item.get('SubEncoding') or None encoding = subtitle_item.get('SubEncoding') or None
subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
hash, movie_name, movie_release_name, movie_year, movie_imdb_id, hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
series_season, series_episode, filename, encoding) series_season, series_episode, filename, encoding)
logger.debug('Found subtitle %r by %s', subtitle, matched_by) logger.debug('Found subtitle %r by %s', subtitle, matched_by)
subtitles.append(subtitle) subtitles.append(subtitle)
@ -268,6 +260,11 @@ class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
pass pass
class ServiceUnavailable(OpenSubtitlesError):
"""Exception raised when status is '503 Service Unavailable'."""
pass
def checked(response): def checked(response):
"""Check a response status before returning it. """Check a response status before returning it.
@ -31,7 +31,7 @@ class PodnapisiSubtitle(Subtitle):
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None, def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
year=None): year=None):
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link) super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link)
self.pid = pid self.pid = pid
self.releases = releases self.releases = releases
self.title = title self.title = title
@ -49,8 +49,7 @@ class PodnapisiSubtitle(Subtitle):
# episode # episode
if isinstance(video, Episode): if isinstance(video, Episode):
# series # series
if video.series and (sanitize(self.title) in ( if video.series and sanitize(self.title) == sanitize(video.series):
sanitize(name) for name in [video.series] + video.alternative_series)):
matches.add('series') matches.add('series')
# year # year
if video.original_series and self.year is None or video.year and video.year == self.year: if video.original_series and self.year is None or video.year and video.year == self.year:
@ -67,8 +66,7 @@ class PodnapisiSubtitle(Subtitle):
# movie # movie
elif isinstance(video, Movie): elif isinstance(video, Movie):
# title # title
if video.title and (sanitize(self.title) in ( if video.title and sanitize(self.title) == sanitize(video.title):
sanitize(name) for name in [video.title] + video.alternative_titles)):
matches.add('title') matches.add('title')
# year # year
if video.year and self.year == video.year: if video.year and self.year == video.year:
@ -84,11 +82,7 @@ class PodnapisiProvider(Provider):
"""Podnapisi Provider.""" """Podnapisi Provider."""
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} | languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes}) {Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
server_url = 'https://www.podnapisi.net/subtitles/' server_url = 'http://podnapisi.net/subtitles/'
subtitle_class = PodnapisiSubtitle
def __init__(self):
self.session = None
def initialize(self): def initialize(self):
self.session = Session() self.session = Session()
@ -114,9 +108,7 @@ class PodnapisiProvider(Provider):
pids = set() pids = set()
while True: while True:
# query the server # query the server
r = self.session.get(self.server_url + 'search/old', params=params, timeout=10) xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content)
r.raise_for_status()
xml = etree.fromstring(r.content)
# exit if no results # exit if no results
if not int(xml.find('pagination/results').text): if not int(xml.find('pagination/results').text):
@ -126,14 +118,10 @@ class PodnapisiProvider(Provider):
# loop over subtitles # loop over subtitles
for subtitle_xml in xml.findall('subtitle'): for subtitle_xml in xml.findall('subtitle'):
# read xml elements # read xml elements
pid = subtitle_xml.find('pid').text
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
if pid in pids:
continue
language = Language.fromietf(subtitle_xml.find('language').text) language = Language.fromietf(subtitle_xml.find('language').text)
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '') hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
page_link = subtitle_xml.find('url').text page_link = subtitle_xml.find('url').text
pid = subtitle_xml.find('pid').text
releases = [] releases = []
if subtitle_xml.find('release').text: if subtitle_xml.find('release').text:
for release in subtitle_xml.find('release').text.split(): for release in subtitle_xml.find('release').text.split():
@ -146,11 +134,15 @@ class PodnapisiProvider(Provider):
year = int(subtitle_xml.find('year').text) year = int(subtitle_xml.find('year').text)
if is_episode: if is_episode:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
season=season, episode=episode, year=year) season=season, episode=episode, year=year)
else: else:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
year=year) year=year)
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
if pid in pids:
continue
logger.debug('Found subtitle %r', subtitle) logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle) subtitles.append(subtitle)
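Both versions skip duplicate pids across result pages (see the linked forum thread); a compact sketch of that dedup step with made-up items:

def unique_by_pid(pages):
    # the search endpoint can repeat a subtitle on consecutive pages
    seen = set()
    for page in pages:
        for item in page:
            if item['pid'] in seen:
                continue
            seen.add(item['pid'])
            yield item

print(list(unique_by_pid([[{'pid': 'a'}], [{'pid': 'a'}, {'pid': 'b'}]])))
# -> [{'pid': 'a'}, {'pid': 'b'}]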
@ -167,21 +159,11 @@ class PodnapisiProvider(Provider):
return subtitles return subtitles
def list_subtitles(self, video, languages): def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode): if isinstance(video, Episode):
titles = [video.series] + video.alternative_series return [s for l in languages for s in self.query(l, video.series, season=video.season,
season = video.season episode=video.episode, year=video.year)]
episode = video.episode elif isinstance(video, Movie):
else: return [s for l in languages for s in self.query(l, video.title, year=video.year)]
titles = [video.title] + video.alternative_titles
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
return []
def download_subtitle(self, subtitle): def download_subtitle(self, subtitle):
# download as a zip # download as a zip
@ -42,10 +42,6 @@ class ShooterProvider(Provider):
"""Shooter Provider.""" """Shooter Provider."""
languages = {Language(l) for l in ['eng', 'zho']} languages = {Language(l) for l in ['eng', 'zho']}
server_url = 'https://www.shooter.cn/api/subapi.php' server_url = 'https://www.shooter.cn/api/subapi.php'
subtitle_class = ShooterSubtitle
def __init__(self):
self.session = None
def initialize(self): def initialize(self):
self.session = Session() self.session = Session()
@ -68,7 +64,7 @@ class ShooterProvider(Provider):
# parse the subtitles # parse the subtitles
results = json.loads(r.text) results = json.loads(r.text)
subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']] subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']]
return subtitles return subtitles
@ -26,7 +26,7 @@ class SubsCenterSubtitle(Subtitle):
provider_name = 'subscenter' provider_name = 'subscenter'
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key, def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
subtitle_version, downloaded, releases): downloaded, releases):
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link) super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
self.series = series self.series = series
self.season = season self.season = season
@ -34,7 +34,6 @@ class SubsCenterSubtitle(Subtitle):
self.title = title self.title = title
self.subtitle_id = subtitle_id self.subtitle_id = subtitle_id
self.subtitle_key = subtitle_key self.subtitle_key = subtitle_key
self.subtitle_version = subtitle_version
self.downloaded = downloaded self.downloaded = downloaded
self.releases = releases self.releases = releases
@ -75,8 +74,7 @@ class SubsCenterSubtitle(Subtitle):
class SubsCenterProvider(Provider):
    """SubsCenter Provider."""
    languages = {Language.fromalpha2(l) for l in ['he']}
-    server_url = 'http://www.subscenter.org/he/'
+    server_url = 'http://www.subscenter.co/he/'
-    subtitle_class = SubsCenterSubtitle

    def __init__(self, username=None, password=None):
        if username is not None and password is None or username is None and password is not None:
@ -191,7 +189,6 @@ class SubsCenterProvider(Provider):
        hearing_impaired = bool(subtitle_item['hearing_impaired'])
        subtitle_id = subtitle_item['id']
        subtitle_key = subtitle_item['key']
-        subtitle_version = subtitle_item['h_version']
        downloaded = subtitle_item['downloaded']
        release = subtitle_item['subtitle_version']
@ -203,9 +200,8 @@ class SubsCenterProvider(Provider):
                continue

            # otherwise create it
-            subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
-                                           title, subtitle_id, subtitle_key, subtitle_version, downloaded,
-                                           [release])
+            subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode,
+                                          title, subtitle_id, subtitle_key, downloaded, [release])
            logger.debug('Found subtitle %r', subtitle)
            subtitles[subtitle_id] = subtitle
@ -225,19 +221,15 @@ class SubsCenterProvider(Provider):
    def download_subtitle(self, subtitle):
        # download
        url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
-        params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
+        params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key}
        r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
        r.raise_for_status()

        # open the zip
-        try:
-            with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
-                # remove some filenames from the namelist
-                namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
-                if len(namelist) > 1:
-                    raise ProviderError('More than one file to unzip')
-                subtitle.content = fix_line_ending(zf.read(namelist[0]))
-        except zipfile.BadZipfile:
-            # if no zip file was retrieved, daily downloads limit has exceeded
-            raise ProviderError('Daily limit exceeded')
+        with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
+            # remove some filenames from the namelist
+            namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
+            if len(namelist) > 1:
+                raise ProviderError('More than one file to unzip')
+            subtitle.content = fix_line_ending(zf.read(namelist[0]))
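The removed try/except existed because, per its own comment, the server answers with a non-zip error page once the daily download quota is hit, which makes zipfile raise BadZipfile. A minimal sketch of that guard, assuming the same payload shape (ValueError stands in for ProviderError to keep it self-contained):

    import io
    import zipfile

    def read_single_subtitle(payload):
        try:
            with zipfile.ZipFile(io.BytesIO(payload)) as zf:
                names = [n for n in zf.namelist() if not n.endswith('.txt')]
                if len(names) > 1:
                    raise ValueError('More than one file to unzip')
                return zf.read(names[0])
        except zipfile.BadZipfile:
            # a non-zip payload signals the daily download limit
            raise ValueError('Daily limit exceeded')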

@ -40,10 +40,6 @@ class TheSubDBProvider(Provider):
    languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
    required_hash = 'thesubdb'
    server_url = 'http://api.thesubdb.com/'
-    subtitle_class = TheSubDBSubtitle

-    def __init__(self):
-        self.session = None
    def initialize(self):
        self.session = Session()
@ -70,7 +66,7 @@ class TheSubDBProvider(Provider):
        for language_code in r.text.split(','):
            language = Language.fromthesubdb(language_code)

-            subtitle = self.subtitle_class(language, hash)
+            subtitle = TheSubDBSubtitle(language, hash)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

@ -47,8 +47,7 @@ class TVsubtitlesSubtitle(Subtitle):
        matches = set()
        # series
-        if video.series and (sanitize(self.series) in (
-                sanitize(name) for name in [video.series] + video.alternative_series)):
+        if video.series and sanitize(self.series) == sanitize(video.series):
            matches.add('series')
        # season
        if video.season and self.season == video.season:
@ -81,10 +80,6 @@ class TVsubtitlesProvider(Provider):
    ]}
    video_types = (Episode,)
    server_url = 'http://www.tvsubtitles.net/'
-    subtitle_class = TVsubtitlesSubtitle

-    def __init__(self):
-        self.session = None
    def initialize(self):
        self.session = Session()
@ -163,7 +158,13 @@ class TVsubtitlesProvider(Provider):
        return episode_ids

-    def query(self, show_id, series, season, episode, year=None):
+    def query(self, series, season, episode, year=None):
+        # search the show id
+        show_id = self.search_show_id(series, year)
+        if show_id is None:
+            logger.error('No show id found for %r (%r)', series, {'year': year})
+            return []

        # get the episode ids
        episode_ids = self.get_episode_ids(show_id, season)
        if episode not in episode_ids:
@ -183,9 +184,9 @@ class TVsubtitlesProvider(Provider):
            subtitle_id = int(row.parent['href'][10:-5])
            page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
            rip = row.find('p', title='rip').text.strip() or None
-            release = row.find('h5').text.strip() or None
+            release = row.find('p', title='release').text.strip() or None

-            subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
-                                           release)
+            subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip,
+                                           release)
            logger.debug('Found subtitle %s', subtitle)
            subtitles.append(subtitle)
@ -193,24 +194,7 @@ class TVsubtitlesProvider(Provider):
        return subtitles

    def list_subtitles(self, video, languages):
-        # lookup show_id
-        titles = [video.series] + video.alternative_series
-        show_id = None
-        for title in titles:
-            show_id = self.search_show_id(title, video.year)
-            if show_id is not None:
-                break
-        # query for subtitles with the show_id
-        if show_id is not None:
-            subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
-                         if s.language in languages and s.episode == video.episode]
-            if subtitles:
-                return subtitles
-        else:
-            logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
-        return []
+        return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages]
    def download_subtitle(self, subtitle):
        # download as a zip

@ -3,7 +3,7 @@ from datetime import datetime, timedelta
from functools import wraps
import logging
import re
import _strptime
import requests

from .. import __short_version__
@ -331,7 +331,6 @@ def refine(video, **kwargs):
    # add series information
    logger.debug('Found series %r', series)
    video.series = matching_result['match']['series']
-    video.alternative_series.extend(series['aliases'])
    video.year = matching_result['match']['year']
    video.original_series = matching_result['match']['original_series']
    video.series_tvdb_id = series['id']

@ -44,7 +44,7 @@ movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
                'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}

#: Equivalent release groups
-equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})
+equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'})

def get_equivalent_release_groups(release_group):
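For context, get_equivalent_release_groups walks the tuple of sets above and returns the set containing the given group, so a subtitle tagged DIMENSION can still score a release-group match against a LOL release. A sketch of that lookup (behaviour as documented by subliminal; the exact body may differ):

    def get_equivalent_release_groups(release_group):
        # Return every group considered equivalent, including the input itself.
        for equivalent_group in equivalent_release_groups:
            if release_group in equivalent_group:
                return equivalent_group
        return {release_group}

    # get_equivalent_release_groups('LOL') -> {'LOL', 'DIMENSION'}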

@ -208,14 +208,8 @@ def guess_matches(video, guess, partial=False):
    if video.season and 'season' in guess and guess['season'] == video.season:
        matches.add('season')
    # episode
-    # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
-    # Most providers only support single-ep, so make sure it contains only 1 episode
-    # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
-    if video.episode and 'episode' in guess:
-        episode_guess = guess['episode']
-        episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
-        if episode == video.episode:
-            matches.add('episode')
+    if video.episode and 'episode' in guess and guess['episode'] == video.episode:
+        matches.add('episode')
    # year
    if video.year and 'year' in guess and guess['year'] == video.year:
        matches.add('year')
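The removed branch normalized guessit's multi-episode output (a list of ints) down to the lowest episode number before comparing, per its own comments. A self-contained illustration of that normalization:

    def normalize_episode(episode_guess):
        # guessit returns e.g. [3, 4] for S01E03-E04; collapse to the lowest.
        if isinstance(episode_guess, list) and episode_guess:
            return min(episode_guess)
        return episode_guess

    assert normalize_episode([3, 4]) == 3
    assert normalize_episode(7) == 7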
@ -258,4 +252,4 @@ def fix_line_ending(content):
    :rtype: bytes

    """
-    return content.replace(b'\r\n', b'\n')
+    return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
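The extra .replace() also normalizes classic Mac OS line endings (lone CR). Order matters: CRLF must be collapsed first so that b'\r\n' never becomes b'\n\n'. A quick check:

    def fix_line_ending(content):
        return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    assert fix_line_ending(b'a\r\nb\rc\n') == b'a\nb\nc\n'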

@ -13,9 +13,9 @@ VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.
                    '.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
                    '.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
                    '.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
-                   '.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
+                   '.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm' '.ogv', '.omf',
                    '.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
-                   '.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
+                   '.vob', '.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
class Video(object):
@ -123,12 +123,11 @@ class Episode(Video):
    :param int year: year of the series.
    :param bool original_series: whether the series is the first with this name.
    :param int tvdb_id: TVDB id of the episode.
-    :param list alternative_series: alternative names of the series
    :param \*\*kwargs: additional parameters for the :class:`Video` constructor.
    """
    def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
-                 series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
+                 series_tvdb_id=None, series_imdb_id=None, **kwargs):
        super(Episode, self).__init__(name, **kwargs)

        #: Series of the episode
@ -158,9 +157,6 @@ class Episode(Video):
        #: IMDb id of the series
        self.series_imdb_id = series_imdb_id
-        #: Alternative names of the series
-        self.alternative_series = alternative_series or []
    @classmethod
    def fromguess(cls, name, guess):
        if guess['type'] != 'episode':
@ -169,13 +165,7 @@ class Episode(Video):
        if 'title' not in guess or 'episode' not in guess:
            raise ValueError('Insufficient data to process the guess')

-        # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
-        # Most providers only support single-ep, so make sure it contains only 1 episode
-        # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
-        episode_guess = guess.get('episode')
-        episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
-        return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
+        return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'),
                   year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
                   release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
                   video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
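The practical difference shows up with multi-episode releases, where guessit returns a list. A sketch, assuming guessit is available (the filename is made up):

    from guessit import guessit

    guess = guessit('Show.Name.S01E03-E04.720p.HDTV.x264-GRP.mkv')
    print(guess['episode'])   # [3, 4] for a multi-episode release
    # The removed lines collapsed this to min([3, 4]) == 3 before building the
    # Episode; after the change the list is passed through unchanged, so an
    # equality check against an int episode number may no longer match.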
@ -196,11 +186,10 @@ class Movie(Video):
    :param str title: title of the movie.
    :param int year: year of the movie.
-    :param list alternative_titles: alternative titles of the movie
    :param \*\*kwargs: additional parameters for the :class:`Video` constructor.
    """
-    def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
+    def __init__(self, name, title, year=None, **kwargs):
        super(Movie, self).__init__(name, **kwargs)

        #: Title of the movie
@ -209,9 +198,6 @@ class Movie(Video):
        #: Year of the movie
        self.year = year
-        #: Alternative titles of the movie
-        self.alternative_titles = alternative_titles or []
    @classmethod
    def fromguess(cls, name, guess):
        if guess['type'] != 'movie':
@ -220,13 +206,9 @@ class Movie(Video):
        if 'title' not in guess:
            raise ValueError('Insufficient data to process the guess')

-        alternative_titles = []
-        if 'alternative_title' in guess:
-            alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))
        return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
                   resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
-                   audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
+                   audio_codec=guess.get('audio_codec'), year=guess.get('year'))
    @classmethod
    def fromname(cls, name):

@ -10,7 +10,7 @@ import time
import operator
import itertools

-from httplib import ResponseNotReady
+from http.client import ResponseNotReady
import rarfile
import requests
@ -21,14 +21,13 @@ from babelfish import LanguageReverseError
from guessit.jsonutils import GuessitEncoder
from subliminal import ProviderError, refiner_manager
-from extensions import provider_registry
+from subliminal_patch.extensions import provider_registry
-from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded
from subliminal.score import compute_score as default_compute_score
from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
    ThreadPoolExecutor, check_video
-from subliminal_patch.exceptions import TooManyRequests, APIThrottled
+from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ServiceUnavailable, DownloadLimitExceeded
from subzero.language import Language
from scandir import scandir, scandir_generic as _scandir_generic
@ -186,7 +185,7 @@ class SZProviderPool(ProviderPool):
        except (requests.Timeout, socket.timeout):
            logger.error('Provider %r timed out', provider)
-        except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
+        except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
            self.throttle_callback(provider, e)
            return
@ -283,7 +282,7 @@ class SZProviderPool(ProviderPool):
logger.debug("RAR Traceback: %s", traceback.format_exc()) logger.debug("RAR Traceback: %s", traceback.format_exc())
return False return False
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e: except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
self.throttle_callback(subtitle.provider_name, e) self.throttle_callback(subtitle.provider_name, e)
self.discarded_providers.add(subtitle.provider_name) self.discarded_providers.add(subtitle.provider_name)
return False return False
@ -648,7 +647,7 @@ def search_external_subtitles(path, languages=None, only_one=False):
            abspath = unicode(os.path.abspath(
                os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder,
                               video_filename])))
-        except Exception, e:
+        except Exception as e:
            logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e)
            continue
        logger.debug("external subs: scanning path %s", abspath)

@ -9,3 +9,13 @@ class TooManyRequests(ProviderError):
class APIThrottled(ProviderError):
    pass
+class ServiceUnavailable(ProviderError):
+    """Exception raised when status is '503 Service Unavailable'."""
+    pass
+
+class DownloadLimitExceeded(ProviderError):
+    """Exception raised by providers when download limit is exceeded."""
+    pass
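A hypothetical sketch of how a caller might distinguish the two new exceptions (the real throttle windows live in the pool's throttle_callback, not shown here):

    def describe(exc):
        if isinstance(exc, DownloadLimitExceeded):
            return "back off until the provider's download quota resets"
        if isinstance(exc, ServiceUnavailable):
            return "provider answered HTTP 503; retry later"
        raise exc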

@ -8,7 +8,7 @@ import os
import socket
import logging
import requests
-import xmlrpclib
+import xmlrpc.client
import dns.resolver
import ipaddress
import re
@ -16,7 +16,7 @@ import re
from requests import exceptions
from urllib3.util import connection
from retry.api import retry_call
-from exceptions import APIThrottled
+from .exceptions import APIThrottled
from dogpile.cache.api import NO_VALUE
from subliminal.cache import region
from subliminal_patch.pitcher import pitchers
@ -32,10 +32,8 @@ try:
except ImportError:
    from urllib.parse import urlparse

-from subzero.lib.io import get_viable_encoding

logger = logging.getLogger(__name__)
-pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where()))
+pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where()))
try:
    default_ssl_context = ssl.create_default_context(cafile=pem_file)
except AttributeError:
@ -99,7 +97,7 @@ class CFSession(CloudScraper):
                # Solve Challenge
                resp = self.sendChallengeResponse(resp, **kwargs)
-            except ValueError, e:
+            except ValueError as e:
                if e.message == "Captcha":
                    parsed_url = urlparse(url)
                    domain = parsed_url.netloc
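Note that `e.message` survives this hunk even though exception objects have no .message attribute on Python 3 (it was deprecated in 2.6 and removed in 3.0), so this line would raise AttributeError there. A portable spelling compares str(e) or e.args[0]:

    try:
        raise ValueError("Captcha")
    except ValueError as e:
        if str(e) == "Captcha":    # portable; e.message is Python 2 only
            print("captcha challenge detected")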
@ -231,7 +229,7 @@ class RetryingCFSession(RetryingSession, CFSession):
    pass

-class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
+class SubZeroRequestsTransport(xmlrpc.client.SafeTransport):
    """
    Drop in Transport for xmlrpclib that uses Requests instead of httplib
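For reference, a minimal requests-backed transport looks roughly like this (a sketch, not SubZeroRequestsTransport's actual body, which also handles proxies and SSL validation):

    import requests
    import xmlrpc.client

    class RequestsTransport(xmlrpc.client.SafeTransport):
        def __init__(self, timeout=10):
            super(RequestsTransport, self).__init__()
            self.timeout = timeout

        def request(self, host, handler, request_body, verbose=False):
            # POST the XML-RPC payload with requests instead of http.client.
            resp = requests.post('https://%s%s' % (host, handler), data=request_body,
                                 headers={'Content-Type': 'text/xml'}, timeout=self.timeout)
            resp.raise_for_status()
            parser, unmarshaller = self.getparser()
            parser.feed(resp.content)
            parser.close()
            return unmarshaller.close()

    # usage: xmlrpc.client.ServerProxy(url, transport=RequestsTransport())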

@ -8,7 +8,7 @@ from subliminal.cache import region
from dogpile.cache.api import NO_VALUE
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
    Proxy
-from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT
+from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TIMEOUT
logger = logging.getLogger(__name__)
@ -185,7 +185,7 @@ class DBCProxyLessPitcher(Pitcher):
    password = None

    def __init__(self, website_name, website_url, website_key,
-                 timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs):
+                 timeout=DEFAULT_TIMEOUT, tries=3, *args, **kwargs):
        super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries)
        self.username, self.password = self.client_key.split(":", 1)

@ -5,7 +5,7 @@ import datetime
from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
    get_series_episode, region, tvdb_client
-from util import fix_session_bases
+from .util import fix_session_bases

TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()

@ -272,9 +272,9 @@ class Subtitle(Subtitle_):
        def prepare_text(text, style):
            body = []
            for fragment, sty in parse_tags(text, style, sub.styles):
-                fragment = fragment.replace(ur"\h", u" ")
-                fragment = fragment.replace(ur"\n", u"\n")
-                fragment = fragment.replace(ur"\N", u"\n")
+                fragment = fragment.replace(r"\h", u" ")
+                fragment = fragment.replace(r"\n", u"\n")
+                fragment = fragment.replace(r"\N", u"\n")
                if format == "srt":
                    if sty.italic:
                        fragment = u"<i>%s</i>" % fragment

@ -1,2 +1,8 @@
-import dict, geezip, httpfake, io, json, rar, which
+from .dict import *
+from .geezip import *
+from .httpfake import *
+from .io import *
+from .json import *
+from .rar import *
+from .which import *

@ -28,7 +28,7 @@ class GeezipFile(gzip.GzipFile):
                fileobj.write(self.compress.flush(Z_FINISH))
                gzip.write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
-                gzip.write32u(fileobj, self.size & 0xffffffffL)
+                gzip.write32u(fileobj, self.size & 0xffffffff)
                fileobj.flush()
        finally:
            myfileobj = self.myfileobj
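The dropped L suffix is another Python 3 requirement: long literals are gone because every int is arbitrary precision. The mask keeps the modulo-2**32 wrap that gzip's ISIZE trailer field requires, e.g.:

    size = 5000000000                       # > 4 GiB
    assert size & 0xffffffff == 705032704   # ISIZE stores size mod 2**32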

@ -1,5 +1,5 @@
# coding=utf-8
-from registry import registry
-from mods import hearing_impaired, ocr_fixes, fps, offset, common, color
-from main import SubtitleModifications, SubMod
+from .registry import registry
+from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color
+from .main import SubtitleModifications, SubMod

@ -1,3 +1,3 @@
# coding=utf-8
-from data import data
+from .data import data


@ -6,14 +6,14 @@ import pysubs2
import logging
import time

-from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
-from registry import registry
+from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
+from .registry import registry
from subzero.language import Language

logger = logging.getLogger(__name__)

-lowercase_re = re.compile(ur'(?sux)[a-zà-ž]')
+lowercase_re = re.compile(r'(?sux)[a-zà-ž]')
class SubtitleModifications(object):
@ -143,7 +143,7 @@ class SubtitleModifications(object):
                continue

            # clear empty args
-            final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))
+            final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.iteritems()))
            _data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
            if _data == mods_merged_log[identifier]["final_identifier"]:
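`lambda (k, v): ...` relied on tuple parameter unpacking, which PEP 3113 removed from Python 3, so the pair is now indexed instead. (Note that args.iteritems() on the same line is still the Python 2 spelling; args.items() or six.iteritems(args) would be needed for this line to actually run on Python 3.) For example:

    args = {'a': 1, 'b': 0}
    final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.items()))
    assert final_mod_args == {'a': 1}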
@ -180,7 +180,7 @@ class SubtitleModifications(object):
        entries_used = 0
        for entry in self.f:
            entry_used = False
-            for sub in entry.text.strip().split("\N"):
+            for sub in entry.text.strip().split(r"\N"):
                # skip HI bracket entries, those might actually be lowercase
                sub = sub.strip()
                for processor in registry.mods["remove_HI"].processors[:4]:
@ -272,7 +272,7 @@ class SubtitleModifications(object):
                continue

            skip_entry = False
-            for line in t.split(ur"\N"):
+            for line in t.split(r"\N"):
                # don't bother the mods with surrounding tags
                old_line = line
                line = line.strip()
@ -377,7 +377,7 @@ class SubtitleModifications(object):
logger.debug(u"%d: %r -> ''", index, entry.text) logger.debug(u"%d: %r -> ''", index, entry.text)
continue continue
new_text = ur"\N".join(lines) new_text = r"\N".join(lines)
# cheap man's approach to avoid open tags # cheap man's approach to avoid open tags
add_start_tags = [] add_start_tags = []
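\N here is the ASS/SSA hard-line-break marker, not a Python escape. In a Python 3 non-raw string literal, "\N" starts a named-character escape (e.g. "\N{BULLET}") and is a SyntaxError without braces, so the raw prefix is mandatory:

    hard_break = r"\N"    # backslash + 'N', as the subtitle format expects
    assert hard_break.join(["one", "two"]) == "one\\Ntwo"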

@ -95,7 +95,7 @@ class SubtitleTextModification(SubtitleModification):
    pass

-TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*"
+TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"
EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")

empty_line_post_processors = [

@ -22,10 +22,10 @@ class CommonFixes(SubtitleTextModification):
    processors = [
        # normalize hyphens
-        NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
+        NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),

        # -- = em dash
-        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1", name="CM_multidash"),
+        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1", name="CM_multidash"),

        # line = _/-/\s
        NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
@ -37,23 +37,23 @@ class CommonFixes(SubtitleTextModification):
        NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),

        # fix music symbols
-        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
+        NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
                     lambda x: u"" if x.group(1) else u"",
                     name="CM_music_symbols"),

        # '' = "
-        NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
+        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),

        # double quotes instead of single quotes inside words
-        NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"),
+        NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),

        # normalize quotes
-        NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
+        NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
                     lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
                     name="CM_normalize_quotes"),

        # normalize single quotes
-        NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
+        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),

        # remove leading ...
        NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
@ -89,8 +89,8 @@ class CommonFixes(SubtitleTextModification):
        # space before ending doublequote?

        # replace uppercase I with lowercase L in words
-        NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
-                     lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
+        NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
+                     lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
                     name="CM_uppercase_i_in_word"),

        # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
@ -101,11 +101,11 @@ class CommonFixes(SubtitleTextModification):
name="CM_spaces_in_numbers"), name="CM_spaces_in_numbers"),
# uppercase after dot # uppercase after dot
NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'), NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"), lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
# remove double interpunction # remove double interpunction
NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'), NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""), lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"), name="CM_double_interpunct"),
@ -149,14 +149,14 @@ class ReverseRTL(SubtitleModification):
    processors = [
        # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
-        #NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
+        #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
        #             name="CM_RTL_reverse")
-        NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
+        NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
                     name="CM_RTL_reverse")
    ]

-split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)")
+split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")
class FixUppercase(SubtitleModification):

@ -26,71 +26,71 @@ class HearingImpaired(SubtitleTextModification):
    processors = [
        # full bracket entry, single or multiline; starting with brackets and ending with brackets
-        FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
+        FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
                                  "", name="HI_brackets_full"),

        # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
        # possibly with a dash in front; ignore anything ending with a quote
-        NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
-                                ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
+        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
+                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
                     name="HI_before_colon_caps"),

        # any text before colon (at least 3 chars); at start or after a sentence,
        # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
        # a space is inside the text; ignore anything ending with a quote
-        NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
-                                ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
+        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
+                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
                     lambda match:
                     match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
                     else "" if not match.group(1).startswith(" ") else " ",
                     name="HI_before_colon_noncaps"),

        # brackets (only remove if at least 3 chars in brackets)
-        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
+        NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
                                {"t": TAG}), "", name="HI_brackets"),

-        #NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
+        #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
        #             "", name="HI_bracket_open_start"),

-        #NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
+        #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
        #             name="HI_bracket_open_end"),

        # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
        # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),

        # starting text before colon (at least 3 chars)
-        #NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
+        #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
        #             name="HI_before_colon"),

        # text in brackets at start, after optional dash, before colon or at end of line
        # fixme: may be too aggressive
-        #NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
+        #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
        #             name="HI_brackets_special"),

        # all caps line (at least 4 consecutive uppercase chars)
-        NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
+        NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
                     supported=lambda p: not p.only_uppercase),

        # remove MAN:
-        NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
+        NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),

        # dash in front
        # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),

        # all caps at start before new sentence
-        NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
+        NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                     name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
    ]

    post_processors = empty_line_post_processors
    last_processors = [
        # remove music symbols
-        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
+        NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
                     "", name="HI_music_symbols_only"),

        # remove music entries
-        NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
+        NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
                     "", name="HI_music"),
    ]
