Initial commit.

pull/292/head
Louis Vézina 6 years ago
parent b9a987b57d
commit d61bdfcd4f

@ -3,6 +3,7 @@ from get_argv import config_dir
import sqlite3
import os
from subliminal import provider_manager
from subliminal_patch import provider_manager
import collections
def load_providers():
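For context: provider_manager is the plugin registry subliminal builds on stevedore, and subliminal_patch re-exports a drop-in replacement that registers extra providers. A minimal sketch of enumerating it, assuming the stevedore-style .names() accessor (list_provider_names is a hypothetical helper, not part of this commit):

from subliminal_patch import provider_manager

def list_provider_names():
    # stevedore-style registry: .names() yields the entry-point name
    # of every registered provider plugin.
    return sorted(provider_manager.names())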

@ -10,7 +10,9 @@ import time
from datetime import datetime, timedelta
from babelfish import Language
from subliminal import region, scan_video, Video, download_best_subtitles, compute_score, save_subtitles, AsyncProviderPool, score, list_subtitles, download_subtitles
from subliminal_patch import region, scan_video, Video, download_best_subtitles, compute_score, save_subtitles, AsyncProviderPool, score, list_subtitles, download_subtitles
from subliminal.subtitle import get_subtitle_path
from subliminal_patch.subtitle import get_subtitle_path
from get_languages import language_from_alpha3, alpha2_from_alpha3, alpha3_from_alpha2
from bs4 import UnicodeDammit
from get_settings import get_general_settings, pp_replace, path_replace, path_replace_movie, path_replace_reverse, path_replace_reverse_movie

@ -6,6 +6,7 @@ import enzyme
import babelfish
import logging
from subliminal import core
from subliminal_patch import core
import sqlite3
import ast
import langdetect

@ -21,15 +21,14 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.3"
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
import sys
import traceback
import warnings
@ -83,46 +82,14 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A specific TreeBuilder to use instead of looking one
up based on `features`. You shouldn't need to use this.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4 and there's no need to actually pass keyword
arguments into the constructor.
"""
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
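The replacement warning text above drops the inline code sample but keeps the advice. Either way, the fix it recommends is to name a parser explicitly; a minimal sketch with the stdlib parser:

from bs4 import BeautifulSoup

# Naming a parser pins behaviour across platforms and virtualenvs and
# silences NO_PARSER_SPECIFIED_WARNING entirely.
soup = BeautifulSoup("<p>Hello</p>", features="html.parser")
print(soup.p.string)  # Hello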
@ -204,35 +171,14 @@ class BeautifulSoup(Tag):
else:
markup_type = "HTML"
# This code adapted from warnings.py so that we get the same line
# of code as our warnings.warn() call gets, even if the answer is wrong
# (as it may be in a multithreading situation).
caller = None
try:
caller = sys._getframe(1)
except ValueError:
pass
if caller:
globals = caller.f_globals
line_number = caller.f_lineno
else:
globals = sys.__dict__
line_number = 1
filename = globals.get('__file__')
if filename:
fnl = filename.lower()
if fnl.endswith((".pyc", ".pyo")):
filename = filename[:-1]
if filename:
# If there is no filename at all, the user is most likely in a REPL,
# and the warning is not necessary.
values = dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
self.builder = builder
self.is_xml = builder.is_xml
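Both sides of this hunk try to report the line of user code that triggered the warning: the removed 4.6.3 code walks one frame up with sys._getframe, while the 4.6.0 code kept here takes the outermost entry of the whole stack. The two lookups in isolation (hypothetical helper names):

import sys
import traceback

def caller_via_frame():
    # 4.6.3 approach: the immediate caller's frame; as the comment above
    # notes, this can be wrong under multithreading.
    caller = sys._getframe(1)
    return caller.f_globals.get('__file__'), caller.f_lineno

def caller_via_stack():
    # 4.6.0 approach: the outermost stack entry, i.e. the script's
    # top-level call site rather than the direct caller.
    caller = traceback.extract_stack()[0]
    return caller[0], caller[1]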
@ -356,10 +302,9 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
kwattrs.update(attrs)
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""

@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
@ -125,7 +125,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
@ -235,17 +235,11 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
# These are from earlier versions of HTML and are removed in HTML5.
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
# These are from HTML4, removed in HTML5.
'spacer', 'frame'
])
# The HTML standard defines these as block-level elements. Beautiful
# Soup does not treat these elements differently from other elements,
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,

@ -1,4 +1,3 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
@ -65,18 +64,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although this
requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() as raising an exception.
In any event, this method is called only on very strange markup and our best strategy
is to pretend it didn't happen and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@ -141,26 +129,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
data = None
if real_name < 256:
# HTML numeric entities are supposed to reference Unicode
# code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError, e:
pass
if not data:
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
pass
data = data or u"\N{REPLACEMENT CHARACTER}"
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
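The removed block compensated for documents whose numeric entities reference Windows-1252 code points rather than Unicode ones; the 4.6.0 code kept here goes straight to the code point. The removed fallback as a standalone sketch (Python 3 syntax; hexadecimal handling omitted):

def decode_charref(real_name, original_encoding=None):
    # Try the document's own encoding, then Windows-1252, before falling
    # back to the raw code point: &#147; is a Windows-1252 left double
    # quotation mark, not U+0093.
    if real_name < 256:
        for encoding in (original_encoding, 'windows-1252'):
            if not encoding:
                continue
            try:
                return bytearray([real_name]).decode(encoding)
            except UnicodeDecodeError:
                pass
    try:
        return chr(real_name)
    except (ValueError, OverflowError):
        return u"\N{REPLACEMENT CHARACTER}"

print(decode_charref(147))  # '\u201c' rather than '\x93'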
@ -168,12 +141,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
# If this were XML, it would be ambiguous whether "&foo"
# was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
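The difference here is one character, but it changes round-tripping: the 4.6.0 line re-adds a semicolon the input never contained. Reduced to its essentials:

name = "T"   # as in "AT&T" -- looks like an entity reference but is not
data_4_6_3 = "&%s" % name    # '&T'  -- the input preserved verbatim
data_4_6_0 = "&%s;" % name   # '&T;' -- a semicolon invented on output

The test_strings_resembling_character_entity_references test removed further down in this diff covers exactly this case.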
@ -245,7 +213,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))

@ -5,13 +5,9 @@ __all__ = [
'LXMLTreeBuilder',
]
try:
from collections.abc import Callable # Python 3.6
except ImportError , e:
from collections import Callable
from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import (
Comment,
@ -62,7 +58,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, Callable):
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
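The try/except on the removed side is the standard forward-compatibility shim: Callable lives in collections.abc from Python 3.3 onward, and the old collections alias was eventually removed in Python 3.10. The pattern in isolation:

try:
    from collections.abc import Callable  # Python 3.3+
except ImportError:
    from collections import Callable      # Python 2 fallback

def is_parser_factory(obj):
    # Mirrors the check above: decide whether default_parser returned a
    # class that still needs instantiating or an already-built parser.
    return isinstance(obj, Callable)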
@ -151,11 +147,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())

@ -46,9 +46,9 @@ except ImportError:
pass
xml_encoding_re = re.compile(
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@ -82,7 +82,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
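Both spellings compile to identical patterns; the 4.6.3 side merely escapes the backslashes so newer Pythons do not warn about invalid escape sequences in string literals. What the first regex actually does, as a sketch:

import re

xml_encoding_re = re.compile(
    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)

declaration = b'<?xml version="1.0" encoding="ISO-8859-1"?><root/>'
m = xml_encoding_re.match(declaration)
print(m.group(1))  # b'ISO-8859-1'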

@ -37,7 +37,7 @@ def diagnose(data):
name)
if 'lxml' in basic_parsers:
basic_parsers.append("lxml-xml")
basic_parsers.append(["lxml", "xml"])
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@ -56,27 +56,21 @@ def diagnose(data):
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
else:
try:
if os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print
print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, features=parser)
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser

@ -2,10 +2,7 @@
# found in the LICENSE file.
__license__ = "MIT"
try:
from collections.abc import Callable # Python 3.6
except ImportError , e:
from collections import Callable
import collections
import re
import shlex
import sys
@ -15,7 +12,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
whitespace_re = re.compile(r"\s+")
whitespace_re = re.compile("\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@ -72,7 +69,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
@ -126,41 +123,6 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
class Formatter(object):
"""Contains information about how to format a parse tree."""
# By default, represent void elements as <tag/> rather than <tag>
void_element_close_prefix = '/'
def substitute_entities(self, *args, **kwargs):
"""Transform certain characters into named entities."""
raise NotImplementedError()
class HTMLFormatter(Formatter):
"""The default HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
class MinimalHTMLFormatter(Formatter):
"""A minimal HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
class HTML5Formatter(HTMLFormatter):
"""An HTML formatter that omits the slash in a void tag."""
void_element_close_prefix = None
class XMLFormatter(Formatter):
"""Substitute only the essential XML entities."""
def substitute(self, *args, **kwargs):
return EntitySubstitution.substitute_xml(*args, **kwargs)
class HTMLXMLFormatter(Formatter):
"""Format XML using HTML rules."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@ -169,49 +131,40 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
# are converted to those entities on output.
# "html5" - The same as "html", but empty void tags are represented as
# <tag> rather than <tag/>
# "minimal" - Bare ampersands and angle brackets are converted to
# are converted to those entities on output.
# "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A callable function - it will be called on every string that needs to undergo entity substitution.
# A Formatter instance - Formatter.substitute(string) will be called on every string that
# A function - This function will be called on every string that
# needs to undergo entity substitution.
#
# In an HTML document, the default "html", "html5", and "minimal"
# functions will leave the contents of <script> and <style> tags
# alone. For an XML document, all tags will be given the same
# treatment.
# In an HTML document, the default "html" and "minimal" functions
# will leave the contents of <script> and <style> tags alone. For
# an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
"html" : HTMLFormatter(),
"html5" : HTML5Formatter(),
"minimal" : MinimalHTMLFormatter(),
"html" : HTMLAwareEntitySubstitution.substitute_html,
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
"html" : HTMLXMLFormatter(),
"minimal" : XMLFormatter(),
"html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml,
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
if isinstance(formatter, basestring):
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
if callable(formatter):
# Backwards compatibility -- you used to pass in a formatting method.
output = formatter(s)
else:
output = formatter.substitute(s)
output = formatter(s)
return output
@property
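Whichever representation is in play (Formatter objects on the removed side, bare substitution functions on the kept side), the user-facing formatter names behave the same; a hedged sketch with the stdlib parser:

from bs4 import BeautifulSoup

soup = BeautifulSoup(u"<p>caf\xe9 &amp; cr\xe8me</p>", "html.parser")
print(soup.p.decode(formatter="minimal"))  # <p>café &amp; crème</p>
print(soup.p.decode(formatter="html"))     # <p>caf&eacute; &amp; cr&egrave;me</p>
print(soup.p.decode(formatter=None))       # <p>café & crème</p>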
@ -241,9 +194,11 @@ class PageElement(object):
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(name, XMLFormatter())
return self.XML_FORMATTERS.get(
name, EntitySubstitution.substitute_xml)
else:
return self.HTML_FORMATTERS.get(name, HTMLFormatter())
return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
@ -361,14 +316,6 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
from bs4 import BeautifulSoup
if isinstance(new_child, BeautifulSoup):
# We don't want to end up with a situation where one BeautifulSoup
# object contains another. Insert the children one at a time.
for subchild in list(new_child.contents):
self.insert(position, subchild)
position += 1
return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@ -589,21 +536,14 @@ class PageElement(object):
elif isinstance(name, basestring):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
# This is a name with a prefix. If this is a namespace-aware document,
# we need to match the local name against tag.name. If not,
# we need to match the fully-qualified name against tag.name.
prefix, local_name = name.split(':', 1)
# This is a name with a prefix.
prefix, name = name.split(':', 1)
else:
prefix = None
local_name = name
result = (element for element in generator
if isinstance(element, Tag)
and (
element.name == name
) or (
element.name == local_name
and (prefix is None or element.prefix == prefix)
)
and element.name == name
and (prefix is None or element.prefix == prefix)
)
return ResultSet(strainer, result)
results = ResultSet(strainer)
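The removed branch matters for parsers that are not namespace-aware: there, "ns1:foo" is stored as the literal tag name with no prefix, so matching only local name plus prefix finds nothing. A sketch of the affected case (html.parser assumed):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<ns1:foo>content</ns1:foo>", "html.parser")
# Under the removed 4.6.3 logic this finds one tag (full-name match);
# the simpler prefix split kept here finds none -- which is why the
# test_namespaced_html test disappears later in this diff.
print(len(soup.find_all("ns1:foo")))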
@ -922,7 +862,7 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@ -1106,10 +1046,8 @@ class Tag(PageElement):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
)
)
'.%sTag is deprecated, use .find("%s") instead.' % (
tag_name, tag_name))
return self.find(tag_name)
# We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag == "contents":
@ -1191,10 +1129,11 @@ class Tag(PageElement):
encoding.
"""
# First off, turn a string formatter into a Formatter object. This
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not callable(formatter):
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@ -1223,9 +1162,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
close = ''
if isinstance(formatter, Formatter):
close = formatter.void_element_close_prefix or close
close = '/'
else:
closeTag = '</%s%s>' % (prefix, self.name)
@ -1296,9 +1233,9 @@ class Tag(PageElement):
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
# First off, turn a string formatter into a Formatter object. This
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not callable(formatter):
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
@ -1411,29 +1348,15 @@ class Tag(PageElement):
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
selectors = [x.strip() for x in selector.split(",")]
# If a selector is mentioned multiple times we don't want
# to use it more than once.
used_selectors = set()
# We also don't want to select the same element more than once,
# if it's matched by multiple selectors.
selected_object_ids = set()
for partial_selector in selectors:
for partial_selector in selector.split(','):
partial_selector = partial_selector.strip()
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
if partial_selector in used_selectors:
continue
used_selectors.add(partial_selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
# This lets us distinguish between distinct tags that
# represent the same markup.
object_id = id(candidate)
if object_id not in selected_object_ids:
if candidate not in context:
context.append(candidate)
selected_object_ids.add(object_id)
if limit and len(context) >= limit:
break
return context
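What the removed bookkeeping buys is deduplication by object identity rather than by tag equality: Tag equality compares rendered markup, so the `candidate not in context` check kept here collapses distinct tags that happen to look identical. A sketch:

from bs4 import BeautifulSoup

markup = '<div class="c1"></div><div class="c2"></div><div class="c1"></div>'
soup = BeautifulSoup(markup, "html.parser")
selected = soup.select(".c1, .c2")
# id()-based dedup (removed side): 3 -- both identical c1 divs survive.
# equality-based dedup (kept side): 2 -- the second c1 div is dropped,
# which is why test_select_duplicate_elements disappears further down.
print(len(selected))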
@ -1495,7 +1418,7 @@ class Tag(PageElement):
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
@ -1729,7 +1652,7 @@ class SoupStrainer(object):
markup = markup_name
markup_attrs = markup
call_function_with_tag_data = (
isinstance(self.name, Callable)
isinstance(self.name, collections.Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@ -1809,7 +1732,7 @@ class SoupStrainer(object):
# True matches any non-None value.
return markup is not None
if isinstance(match_against, Callable):
if isinstance(match_against, collections.Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all

@ -1,4 +1,3 @@
# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
@ -151,14 +150,6 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_namespaced_html(self):
"""When a namespaced XML document is parsed as HTML it should
be treated as HTML with weird tag names.
"""
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
soup = self.soup(markup)
self.assertEqual(2, len(soup.find_all("ns1:foo")))
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
@ -320,26 +311,6 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_strings_resembling_character_entity_references(self):
# "&T" and "&p" look like incomplete character entities, but they are
# not.
self.assertSoupEquals(
u"<p>&bull; AT&T is in the s&p 500</p>",
u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
def test_entities_in_foreign_document_encoding(self):
# &#147; and &#148; are invalid numeric entities referencing
# Windows-1252 characters. &#45; references a character common
# to Windows-1252 and Unicode, and &#9731; references a
# character only found in Unicode.
#
# All of these entities should be converted to Unicode
# characters.
markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
soup = self.soup(markup)
self.assertEquals(u"“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
@ -363,7 +334,7 @@ Hello, world!
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@ -653,17 +624,6 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
soup = self.soup(doc)
self.assertEqual(doc, soup.encode())
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">

@ -5,7 +5,6 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@ -33,17 +32,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")

@ -46,12 +46,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
# a string like u'\x93'
pass
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.

@ -605,7 +605,7 @@ class SiblingTest(TreeTest):
</html>'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
markup = re.compile(r"\n\s*").sub("", markup)
markup = re.compile("\n\s*").sub("", markup)
self.tree = self.soup(markup)
@ -703,12 +703,12 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
new_tag = soup.new_tag("foo", bar="baz")
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
self.assertEqual(dict(bar="baz"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
@ -821,26 +821,6 @@ class TestTreeModification(SoupTest):
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
def test_insert_beautifulsoup_object_inserts_children(self):
"""Inserting one BeautifulSoup object into another actually inserts all
of its children -- you'll never combine BeautifulSoup objects.
"""
soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
text = "<p>p2</p><p>p3</p>"
to_insert = self.soup(text)
soup.insert(1, to_insert)
for i in soup.descendants:
assert not isinstance(i, BeautifulSoup)
p1, p2, p3, p4 = list(soup.children)
self.assertEquals("And now, a word:", p1.string)
self.assertEquals("p2", p2.string)
self.assertEquals("p3", p3.string)
self.assertEquals("And we're back.", p4.string)
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('<p><a>one</a><b>three</b></p>')
a = soup.a
@ -1206,7 +1186,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
'.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
'.bTag is deprecated, use .find("b") instead.',
str(w[0].message))
def test_has_attr(self):
@ -1439,21 +1419,13 @@ class TestSubstitutions(SoupTest):
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self):
markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html")
self.assertEqual(
decoded,
self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_html5(self):
markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html5")
self.assertEqual(
decoded,
self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
@ -1474,14 +1446,14 @@ class TestSubstitutions(SoupTest):
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self):
markup = u"<b>&lt;foo&gt;</b><b>bar</b><br/>"
markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of normal entity conversion code, the custom
# callable is called on every string.
self.assertEqual(
decoded,
self.document_for(u"<b><FOO></b><b>BAR</b><br>"))
self.document_for(u"<b><FOO></b><b>BAR</b>"))
def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
@ -1526,7 +1498,7 @@ class TestSubstitutions(SoupTest):
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter_function(self):
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
@ -2074,17 +2046,5 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
def test_select_duplicate_elements(self):
# When markup contains duplicate elements, a multiple select
# will find all of them.
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
soup = BeautifulSoup(markup, 'html.parser')
selected = soup.select(".c1, .c2")
self.assertEquals(3, len(selected))
# Verify that find_all finds the same elements, though because
# of an implementation detail it finds them in a different
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected

@ -1,6 +1,21 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import platform
is_windows_special_path = False
if platform.system() == "Windows":
try:
__file__.decode("ascii")
except UnicodeDecodeError:
is_windows_special_path = True
if not is_windows_special_path:
from concurrent.futures import ThreadPoolExecutor
else:
ThreadPoolExecutor = object
from datetime import datetime
import io
import itertools
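The guard added above is a Python 2/Windows workaround: __file__ is a byte string on Python 2, and when the install path contains non-ASCII bytes the concurrent.futures import fails, so a placeholder is substituted to keep the module importable. The same pattern in isolation (resolve_thread_pool_executor is a hypothetical helper; the isinstance check keeps it a no-op on Python 3):

import platform

def resolve_thread_pool_executor():
    # Python 2 on Windows stores __file__ as bytes; a non-ASCII install
    # path breaks the concurrent.futures import machinery there.
    if platform.system() == "Windows" and isinstance(__file__, bytes):
        try:
            __file__.decode("ascii")
        except UnicodeDecodeError:
            return object  # placeholder: async pools must not be used
    from concurrent.futures import ThreadPoolExecutor
    return ThreadPoolExecutor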
@ -388,7 +403,7 @@ def search_external_subtitles(path, directory=None):
subtitles = {}
for p in os.listdir(directory or dirpath):
# keep only valid subtitle filenames
if not p.startswith(fileroot) or not p.lower().endswith(SUBTITLE_EXTENSIONS):
if not p.startswith(fileroot) or not p.endswith(SUBTITLE_EXTENSIONS):
continue
# extract the potential language code
@ -420,7 +435,7 @@ def scan_video(path):
raise ValueError('Path does not exist')
# check video extension
if not path.lower().endswith(VIDEO_EXTENSIONS):
if not path.endswith(VIDEO_EXTENSIONS):
raise ValueError('%r is not a valid video extension' % os.path.splitext(path)[1])
dirpath, filename = os.path.split(path)
@ -468,7 +483,7 @@ def scan_archive(path):
rar = RarFile(path)
# filter on video extensions
rar_filenames = [f for f in rar.namelist() if f.lower().endswith(VIDEO_EXTENSIONS)]
rar_filenames = [f for f in rar.namelist() if f.endswith(VIDEO_EXTENSIONS)]
# no video found
if not rar_filenames:
@ -521,26 +536,17 @@ def scan_videos(path, age=None, archives=True):
if dirname.startswith('.'):
logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath)
dirnames.remove(dirname)
# Skip Sample folder
if dirname.lower() == 'sample':
logger.debug('Skipping sample dirname %r in %r', dirname, dirpath)
dirnames.remove(dirname)
# scan for videos
for filename in filenames:
# filter on videos and archives
if not (filename.lower().endswith(VIDEO_EXTENSIONS) or
archives and filename.lower().endswith(ARCHIVE_EXTENSIONS)):
if not (filename.endswith(VIDEO_EXTENSIONS) or archives and filename.endswith(ARCHIVE_EXTENSIONS)):
continue
# skip hidden files
if filename.startswith('.'):
logger.debug('Skipping hidden filename %r in %r', filename, dirpath)
continue
# skip 'sample' media files
if os.path.splitext(filename)[0].lower() == 'sample':
logger.debug('Skipping sample filename %r in %r', filename, dirpath)
continue
# reconstruct the file path
filepath = os.path.join(dirpath, filename)
@ -562,13 +568,13 @@ def scan_videos(path, age=None, archives=True):
continue
# scan
if filename.lower().endswith(VIDEO_EXTENSIONS): # video
if filename.endswith(VIDEO_EXTENSIONS): # video
try:
video = scan_video(filepath)
except ValueError: # pragma: no cover
logger.exception('Error scanning video')
continue
elif archives and filename.lower().endswith(ARCHIVE_EXTENSIONS): # archive
elif archives and filename.endswith(ARCHIVE_EXTENSIONS): # archive
try:
video = scan_archive(filepath)
except (NotRarFile, RarCannotExec, ValueError): # pragma: no cover
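Why the .lower() on the removed side matters: tuple-based str.endswith is case-sensitive, so uppercase extensions slip through without it. Reduced to a two-line check (illustrative extension subset):

VIDEO_EXTENSIONS = ('.mkv', '.mp4', '.avi')

print('Movie.MKV'.endswith(VIDEO_EXTENSIONS))          # False
print('Movie.MKV'.lower().endswith(VIDEO_EXTENSIONS))  # True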

@ -179,7 +179,7 @@ class Addic7edProvider(Provider):
# make the search
logger.info('Searching show ids with %r', params)
r = self.session.get(self.server_url + 'srch.php', params=params, timeout=10)
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

@ -170,7 +170,7 @@ class LegendasTVProvider(Provider):
# Provider needs UNRAR installed. If not available raise ConfigurationError
try:
rarfile.custom_check([rarfile.UNRAR_TOOL], True)
rarfile.custom_check(rarfile.UNRAR_TOOL)
except rarfile.RarExecError:
raise ConfigurationError('UNRAR tool not available')

@ -3,7 +3,7 @@ from datetime import datetime, timedelta
from functools import wraps
import logging
import re
import _strptime
import requests
from .. import __short_version__

@ -6,8 +6,6 @@ import os
import chardet
import pysrt
import types
from .score import get_equivalent_release_groups
from .video import Episode, Movie
from .utils import sanitize, sanitize_release_group
@ -232,39 +230,16 @@ def guess_matches(video, guess, partial=False):
if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title):
matches.add('title')
# release_group
if 'release_group' in guess:
release_groups = guess["release_group"]
if not isinstance(release_groups, types.ListType):
release_groups = [release_groups]
if video.release_group:
for release_group in release_groups:
if (sanitize_release_group(release_group) in
get_equivalent_release_groups(sanitize_release_group(video.release_group))):
matches.add('release_group')
break
if (video.release_group and 'release_group' in guess and
sanitize_release_group(guess['release_group']) in
get_equivalent_release_groups(sanitize_release_group(video.release_group))):
matches.add('release_group')
# resolution
if video.resolution and 'screen_size' in guess and guess['screen_size'] == video.resolution:
matches.add('resolution')
# format
if 'format' in guess:
formats = guess["format"]
if not isinstance(formats, types.ListType):
formats = [formats]
if video.format:
video_format = video.format
if video_format in ("HDTV", "SDTV", "TV"):
video_format = "TV"
logger.debug("Treating HDTV/SDTV the same")
for frmt in formats:
if frmt in ("HDTV", "SDTV"):
frmt = "TV"
if frmt.lower() == video_format.lower():
matches.add('format')
break
if video.format and 'format' in guess and guess['format'].lower() == video.format.lower():
matches.add('format')
# video_codec
if video.video_codec and 'video_codec' in guess and guess['video_codec'] == video.video_codec:
matches.add('video_codec')
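The removed half of this hunk did two things the simpler kept version does not: normalise guessit values that may arrive as either a scalar or a list, and fold HDTV/SDTV/TV into one bucket before comparing. That normalisation as a hedged sketch (format_matches is a hypothetical helper):

def format_matches(video_format, guessed):
    # guessit can return 'HDTV' or ['HDTV', 'WEB'] for the same key.
    formats = guessed if isinstance(guessed, list) else [guessed]
    normalize = lambda f: "TV" if f in ("HDTV", "SDTV", "TV") else f
    wanted = normalize(video_format)
    return any(normalize(f).lower() == wanted.lower() for f in formats)

print(format_matches("HDTV", "SDTV"))             # True -- both fold to TV
print(format_matches("BluRay", ["HDTV", "WEB"]))  # False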
