")
- def test_multivalued_attribute_with_whitespace(self):
- # Whitespace separating the values of a multi-valued attribute
- # should be ignored.
-
- markup = '
'
- soup = self.soup(markup)
- self.assertEqual(['foo', 'bar'], soup.div['class'])
-
- # If you search by the literal name of the class it's like the whitespace
- # wasn't there.
- self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
-
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
@@ -483,41 +311,15 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('
', '
')
- def test_strings_resembling_character_entity_references(self):
- # "&T" and "&p" look like incomplete character entities, but they are
- # not.
- self.assertSoupEquals(
- "
• AT&T is in the s&p 500
",
- "
\u2022 AT&T is in the s&p 500
"
- )
-
- def test_apos_entity(self):
- self.assertSoupEquals(
- "
Bob's Bar
",
- "
Bob's Bar
",
- )
-
- def test_entities_in_foreign_document_encoding(self):
- # and are invalid numeric entities referencing
- # Windows-1252 characters. - references a character common
- # to Windows-1252 and Unicode, and ☃ references a
- # character only found in Unicode.
- #
- # All of these entities should be converted to Unicode
- # characters.
- markup = "
Hello -☃
"
- soup = self.soup(markup)
- self.assertEqual("“Hello†-☃", soup.p.string)
-
def test_entities_in_attributes_converted_to_unicode(self):
- expect = '
'
+ expect = u'
'
self.assertSoupEquals('
', expect)
self.assertSoupEquals('
', expect)
self.assertSoupEquals('
', expect)
self.assertSoupEquals('
', expect)
def test_entities_in_text_converted_to_unicode(self):
- expect = '
pi\N{LATIN SMALL LETTER N WITH TILDE}ata
'
+ expect = u'
pi\N{LATIN SMALL LETTER N WITH TILDE}ata
'
self.assertSoupEquals("
piñata
", expect)
self.assertSoupEquals("
piñata
", expect)
self.assertSoupEquals("
piñata
", expect)
@@ -528,11 +330,11 @@ Hello, world!
'
I said "good day!"
')
def test_out_of_range_entity(self):
- expect = "\N{REPLACEMENT CHARACTER}"
+ expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
-
+
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("
\nfoo
")
@@ -606,9 +408,9 @@ Hello, world!
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
- markup = '
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
+ markup = u'
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
soup = self.soup(markup)
- self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
+ self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@@ -648,7 +450,7 @@ Hello, world!
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "
<<sacré bleu!>>
"
- expected = "
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
"
+ expected = u"
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@@ -658,15 +460,15 @@ Hello, world!
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
- "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+ u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("
")
- self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
+ self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "
<<sacré bleu!>>
"
- expected = "
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8")
+ expected = u"
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@@ -675,7 +477,7 @@ Hello, world!
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
- unicode_html = '
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
'
+ unicode_html = u'
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@@ -784,13 +586,6 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('
text', data.a.decode())
- def test_worst_case(self):
- """Test the worst case (currently) for linking issues."""
-
- soup = self.soup(BAD_DOCUMENT)
- self.linkage_validator(soup)
-
-
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
@@ -829,17 +624,6 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
- def test_nested_namespaces(self):
- doc = b"""
-
-
-
-
-
-"""
- soup = self.soup(doc)
- self.assertEqual(doc, soup.encode())
-
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
hello
-"""
- soup = self.soup(markup)
- [s.extract() for s in soup('script')]
- [s.extract() for s in soup('style')]
-
- self.assertEqual(len(soup.find_all("p")), 1)
-
- def test_empty_comment(self):
- """
- Test that empty comment does not break structure.
-
- https://bugs.launchpad.net/beautifulsoup/+bug/1806598
- """
-
- markup = """
-
-
-
-
-
-"""
- soup = self.soup(markup)
- inputs = []
- for form in soup.find_all('form'):
- inputs.extend(form.find_all('input'))
- self.assertEqual(len(inputs), 1)
+ self.assertEqual(u"A
", soup.body.decode())
diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py
index 790489aa1..d5cf0253f 100644
--- a/libs/bs4/tests/test_htmlparser.py
+++ b/libs/bs4/tests/test_htmlparser.py
@@ -5,11 +5,12 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
-from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
- default_builder = HTMLParserTreeBuilder
+ @property
+ def default_builder(self):
+ return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
@@ -31,17 +32,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('
', "
")
self.assertSoupEquals('', "")
-
- def test_empty_element(self):
- # This verifies that any buffered data present when the parser
- # finishes working is handled.
- self.assertSoupEquals("foo bar", "foo &# bar")
-
-
-class TestHTMLParserSubclass(SoupTest):
- def test_error(self):
- """Verify that our HTMLParser subclass implements error() in a way
- that doesn't cause a crash.
- """
- parser = BeautifulSoupHTMLParser()
- parser.error("don't crash")
diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py
index 29da71149..a05870b91 100644
--- a/libs/bs4/tests/test_lxml.py
+++ b/libs/bs4/tests/test_lxml.py
@@ -7,7 +7,7 @@ try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError as e:
+except ImportError, e:
LXML_PRESENT = False
LXML_VERSION = (0,)
@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilder
+ return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
@@ -46,12 +46,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"
foobar
", "
foobar
")
- def test_entities_in_foreign_document_encoding(self):
- # We can't implement this case correctly because by the time we
- # hear about markup like "", it's been (incorrectly) converted into
- # a string like u'\x93'
- pass
-
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@@ -68,7 +62,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("
")
- self.assertEqual("
", str(soup.b))
+ self.assertEqual(u"
", unicode(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf(
@@ -79,22 +73,4 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilderForXML
-
- def test_namespace_indexing(self):
- # We should not track un-prefixed namespaces as we can only hold one
- # and it will be recognized as the default namespace by soupsieve,
- # which may be confusing in some situations. When no namespace is provided
- # for a selector, the default namespace (if defined) is assumed.
-
- soup = self.soup(
- '\n'
- '
'
- 'content'
- 'content'
- ''
- )
- self.assertEqual(
- soup._namespaces,
- {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
- )
+ return LXMLTreeBuilderForXML()
diff --git a/libs/bs4/tests/test_soup.py b/libs/bs4/tests/test_soup.py
index 1eda9484b..f3e69edf3 100644
--- a/libs/bs4/tests/test_soup.py
+++ b/libs/bs4/tests/test_soup.py
@@ -24,7 +24,6 @@ from bs4.dammit import (
EncodingDetector,
)
from bs4.testing import (
- default_builder,
SoupTest,
skipIf,
)
@@ -33,7 +32,7 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
-except ImportError as e:
+except ImportError, e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -41,86 +40,21 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
- data = "
éé
"
+ data = u"
éé
"
soup = self.soup(data)
- self.assertEqual("éé", soup.h1.string)
+ self.assertEqual(u"éé", soup.h1.string)
def test_embedded_null(self):
- data = "
foo\0bar
"
+ data = u"
foo\0bar
"
soup = self.soup(data)
- self.assertEqual("foo\0bar", soup.h1.string)
+ self.assertEqual(u"foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
- utf8_data = "Räksmörgås".encode("utf-8")
+ utf8_data = u"Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
- def test_custom_builder_class(self):
- # Verify that you can pass in a custom Builder class and
- # it'll be instantiated with the appropriate keyword arguments.
- class Mock(object):
- def __init__(self, **kwargs):
- self.called_with = kwargs
- self.is_xml = True
- def initialize_soup(self, soup):
- pass
- def prepare_markup(self, *args, **kwargs):
- return ''
-
- kwargs = dict(
- var="value",
- # This is a deprecated BS3-era keyword argument, which
- # will be stripped out.
- convertEntities=True,
- )
- with warnings.catch_warnings(record=True):
- soup = BeautifulSoup('', builder=Mock, **kwargs)
- assert isinstance(soup.builder, Mock)
- self.assertEqual(dict(var="value"), soup.builder.called_with)
-
- # You can also instantiate the TreeBuilder yourself. In this
- # case, that specific object is used and any keyword arguments
- # to the BeautifulSoup constructor are ignored.
- builder = Mock(**kwargs)
- with warnings.catch_warnings(record=True) as w:
- soup = BeautifulSoup(
- '', builder=builder, ignored_value=True,
- )
- msg = str(w[0].message)
- assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
- self.assertEqual(builder, soup.builder)
- self.assertEqual(kwargs, builder.called_with)
-
- def test_cdata_list_attributes(self):
- # Most attribute values are represented as scalars, but the
- # HTML standard says that some attributes, like 'class' have
- # space-separated lists as values.
- markup = '
'
- soup = self.soup(markup)
-
- # Note that the spaces are stripped for 'class' but not for 'id'.
- a = soup.a
- self.assertEqual(" an id ", a['id'])
- self.assertEqual(["a", "class"], a['class'])
-
- # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
- # you customize or disable this. As always, you can customize the TreeBuilder
- # by passing in a keyword argument to the BeautifulSoup constructor.
- soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
- self.assertEqual(" a class ", soup.a['class'])
-
- # Here are two ways of saying that `id` is a multi-valued
- # attribute in this context, but 'class' is not.
- for switcheroo in ({'*': 'id'}, {'a': 'id'}):
- with warnings.catch_warnings(record=True) as w:
- # This will create a warning about not explicitly
- # specifying a parser, but we'll ignore it.
- soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
- a = soup.a
- self.assertEqual(["an", "id"], a['id'])
- self.assertEqual(" a class ", a['class'])
-
-
+
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
@@ -195,7 +129,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
- soup = self.soup("http://www.crummyunicode.com/")
+ soup = self.soup(u"http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
@@ -207,7 +141,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
- soup = self.soup("http://www.crummyuncode.com/ is great")
+ soup = self.soup(u"http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
@@ -229,9 +163,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites
# are substituted, and no others.
- s = "foo\u2200\N{SNOWMAN}\u00f5bar"
+ s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
- "foo∀\N{SNOWMAN}õbar")
+ u"foo∀\N{SNOWMAN}õbar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
@@ -283,7 +217,7 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(
self.sub.substitute_xml_containing_entities("ÁT&T"),
"ÁT&T")
-
+
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
@@ -296,7 +230,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self):
super(TestEncodingConversion, self).setUp()
- self.unicode_data = '
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
+ self.unicode_data = u'
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
@@ -316,7 +250,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"
a"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
- self.assertTrue(isinstance(unicode_output, str))
+ self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
@@ -328,7 +262,7 @@ class TestEncodingConversion(SoupTest):
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
- self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
@@ -336,7 +270,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
- self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
@@ -347,14 +281,14 @@ class TestEncodingConversion(SoupTest):
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
- markup = '
'
+ markup = u'
'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
- markup = "I'm already Unicode! \N{SNOWMAN}"
+ markup = u"I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
@@ -362,7 +296,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"
\x91\x92\x93\x94"
dammit = UnicodeDammit(markup)
self.assertEqual(
- dammit.unicode_markup, "
\u2018\u2019\u201c\u201d")
+ dammit.unicode_markup, u"
\u2018\u2019\u201c\u201d")
def test_smart_quotes_to_xml_entities(self):
markup = b"
\x91\x92\x93\x94"
@@ -386,14 +320,14 @@ class TestUnicodeDammit(unittest.TestCase):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
- self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
+ self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
- self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
+ self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -402,19 +336,19 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
- utf8_data = "Räksmörgås".encode("utf-8")
+ utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
- utf8_data = "Räksmörgås".encode("utf-8")
+ utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
- utf8_data = "Räksmörgås".encode("utf-8")
+ utf8_data = u"Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
@@ -430,7 +364,7 @@ class TestUnicodeDammit(unittest.TestCase):
detected = EncodingDetector(
b'')
encodings = list(detected.encodings)
- assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
+ assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
@@ -470,7 +404,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
- self.assertTrue("\ufffd" in dammit.unicode_markup)
+ self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
@@ -482,17 +416,17 @@ class TestUnicodeDammit(unittest.TestCase):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
- self.assertEqual("
áé", dammit.unicode_markup)
+ self.assertEqual(u"
áé", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
- utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
+ utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
- "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
- "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+ u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+ u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
@@ -507,7 +441,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
- "☃☃☃“Hi, I like Windows!â€â˜ƒâ˜ƒâ˜ƒ", fixed.decode("utf8"))
+ u"☃☃☃“Hi, I like Windows!â€â˜ƒâ˜ƒâ˜ƒ", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
@@ -515,9 +449,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
- "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
- "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
- "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+ u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+ u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+ u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py
index 3b4beeb8f..c0e7c4080 100644
--- a/libs/bs4/tests/test_tree.py
+++ b/libs/bs4/tests/test_tree.py
@@ -1,3 +1,4 @@
+
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@@ -25,7 +26,6 @@ from bs4.element import (
Comment,
Declaration,
Doctype,
- Formatter,
NavigableString,
SoupStrainer,
Tag,
@@ -71,13 +71,13 @@ class TestFind(TreeTest):
self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self):
- soup = self.soup('
Räksmörgås
')
- self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
+ soup = self.soup(u'
Räksmörgås
')
+ self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
def test_unicode_attribute_find(self):
- soup = self.soup('
here it is
')
+ soup = self.soup(u'
here it is
')
str(soup)
- self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
+ self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
def test_find_everything(self):
@@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
"""You can search the tree for text nodes."""
soup = self.soup("Foo
bar\xbb")
# Exact match.
- self.assertEqual(soup.find_all(string="bar"), ["bar"])
- self.assertEqual(soup.find_all(text="bar"), ["bar"])
+ self.assertEqual(soup.find_all(string="bar"), [u"bar"])
+ self.assertEqual(soup.find_all(text="bar"), [u"bar"])
# Match any of a number of strings.
self.assertEqual(
- soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
+ soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
# Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')),
- ["Foo", "bar", '\xbb'])
+ [u"Foo", u"bar", u'\xbb'])
# Match anything.
self.assertEqual(soup.find_all(text=True),
- ["Foo", "bar", '\xbb'])
+ [u"Foo", u"bar", u'\xbb'])
def test_find_all_limit(self):
"""You can limit the number of items returned by find_all."""
@@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self):
- peace = "×ולש".encode("utf8")
- data = '
'.encode("utf8")
+ peace = u"×ולש".encode("utf8")
+ data = u'
'.encode("utf8")
soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -417,48 +417,6 @@ class TestFindAllByAttribute(TreeTest):
self.assertEqual([], soup.find_all(id=1, text="bar"))
-class TestSmooth(TreeTest):
- """Test Tag.smooth."""
-
- def test_smooth(self):
- soup = self.soup("
a
")
- div = soup.div
- div.append("b")
- div.append("c")
- div.append(Comment("Comment 1"))
- div.append(Comment("Comment 2"))
- div.append("d")
- builder = self.default_builder()
- span = Tag(soup, builder, 'span')
- span.append('1')
- span.append('2')
- div.append(span)
-
- # At this point the tree has a bunch of adjacent
- # NavigableStrings. This is normal, but it has no meaning in
- # terms of HTML, so we may want to smooth things out for
- # output.
-
- # Since the
tag has two children, its .string is None.
- self.assertEqual(None, div.span.string)
-
- self.assertEqual(7, len(div.contents))
- div.smooth()
- self.assertEqual(5, len(div.contents))
-
- # The three strings at the beginning of div.contents have been
- # merged into on string.
- #
- self.assertEqual('abc', div.contents[0])
-
- # The call is recursive -- the tag was also smoothed.
- self.assertEqual('12', div.span.string)
-
- # The two comments have _not_ been merged, even though
- # comments are strings. Merging comments would change the
- # meaning of the HTML.
- self.assertEqual('Comment 1', div.contents[1])
- self.assertEqual('Comment 2', div.contents[2])
class TestIndex(TreeTest):
@@ -647,7 +605,7 @@ class SiblingTest(TreeTest):
'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
- markup = re.compile(r"\n\s*").sub("", markup)
+ markup = re.compile("\n\s*").sub("", markup)
self.tree = self.soup(markup)
@@ -745,12 +703,12 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
- new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
+ new_tag = soup.new_tag("foo", bar="baz")
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
- self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
+ self.assertEqual(dict(bar="baz"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
-
+
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
@@ -863,26 +821,6 @@ class TestTreeModification(SoupTest):
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
- def test_insert_beautifulsoup_object_inserts_children(self):
- """Inserting one BeautifulSoup object into another actually inserts all
- of its children -- you'll never combine BeautifulSoup objects.
- """
- soup = self.soup("And now, a word:
And we're back.
")
-
- text = "p2
p3
"
- to_insert = self.soup(text)
- soup.insert(1, to_insert)
-
- for i in soup.descendants:
- assert not isinstance(i, BeautifulSoup)
-
- p1, p2, p3, p4 = list(soup.children)
- self.assertEqual("And now, a word:", p1.string)
- self.assertEqual("p2", p2.string)
- self.assertEqual("p3", p3.string)
- self.assertEqual("And we're back.", p4.string)
-
-
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('onethree
')
a = soup.a
@@ -939,7 +877,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self):
- builder = self.default_builder()
+ builder = self.default_builder
soup = self.soup(
"Findlady!", builder=builder)
magic_tag = Tag(soup, builder, 'magictag')
@@ -974,13 +912,6 @@ class TestTreeModification(SoupTest):
soup.a.append(soup.b)
self.assertEqual(data, soup.decode())
- def test_extend(self):
- data = ""
- soup = self.soup(data)
- l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
- soup.a.extend(l)
- self.assertEqual("", soup.decode())
-
def test_move_tag_to_beginning_of_parent(self):
data = ""
soup = self.soup(data)
@@ -1007,29 +938,6 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUXbarfooBAZ"))
- # Can't insert an element before itself.
- b = soup.b
- self.assertRaises(ValueError, b.insert_before, b)
-
- # Can't insert before if an element has no parent.
- b.extract()
- self.assertRaises(ValueError, b.insert_before, "nope")
-
- # Can insert an identical element
- soup = self.soup("")
- soup.a.insert_before(soup.new_tag("a"))
-
- def test_insert_multiple_before(self):
- soup = self.soup("foobar")
- soup.b.insert_before("BAZ", " ", "QUUX")
- soup.a.insert_before("QUUX", " ", "BAZ")
- self.assertEqual(
- soup.decode(), self.document_for("QUUX BAZfooBAZ QUUXbar"))
-
- soup.a.insert_before(soup.b, "FOO")
- self.assertEqual(
- soup.decode(), self.document_for("QUUX BAZbarFOOfooBAZ QUUX"))
-
def test_insert_after(self):
soup = self.soup("foobar")
soup.b.insert_after("BAZ")
@@ -1040,28 +948,6 @@ class TestTreeModification(SoupTest):
self.assertEqual(
soup.decode(), self.document_for("QUUXbarfooBAZ"))
- # Can't insert an element after itself.
- b = soup.b
- self.assertRaises(ValueError, b.insert_after, b)
-
- # Can't insert after if an element has no parent.
- b.extract()
- self.assertRaises(ValueError, b.insert_after, "nope")
-
- # Can insert an identical element
- soup = self.soup("")
- soup.a.insert_before(soup.new_tag("a"))
-
- def test_insert_multiple_after(self):
- soup = self.soup("foobar")
- soup.b.insert_after("BAZ", " ", "QUUX")
- soup.a.insert_after("QUUX", " ", "BAZ")
- self.assertEqual(
- soup.decode(), self.document_for("fooQUUX BAZbarBAZ QUUX"))
- soup.b.insert_after(soup.a, "FOO ")
- self.assertEqual(
- soup.decode(), self.document_for("QUUX BAZbarfooFOO BAZ QUUX"))
-
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
soup = self.soup("")
tag = soup.new_tag("a")
@@ -1225,7 +1111,7 @@ class TestTreeModification(SoupTest):