"""Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("foo")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('
', "
")
self.assertSoupEquals('', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo bar", "foo &# bar")
def test_tracking_line_numbers(self):
# The html.parser TreeBuilder keeps track of line number and
# position of each element.
markup = "\n
\n\n\ntext
"
soup = self.soup(markup)
self.assertEqual(2, soup.p.sourceline)
self.assertEqual(3, soup.p.sourcepos)
self.assertEqual("sourceline", soup.p.find('sourceline').name)
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_on_duplicate_attribute(self):
# The html.parser tree builder has a variety of ways of
# handling a tag that contains the same attribute multiple times.
markup = ''
# If you don't provide any particular value for
# on_duplicate_attribute, later values replace earlier values.
soup = self.soup(markup)
self.assertEqual("url3", soup.a['href'])
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
# You can also get this behavior explicitly.
def assert_attribute(on_duplicate_attribute, expected):
soup = self.soup(
markup, on_duplicate_attribute=on_duplicate_attribute
)
self.assertEqual(expected, soup.a['href'])
# Verify that non-duplicate attributes are treated normally.
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
assert_attribute(None, "url3")
assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
# You can ignore subsequent values in favor of the first.
assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
# And you can pass in a callable that does whatever you want.
def accumulate(attrs, key, value):
if not isinstance(attrs[key], list):
attrs[key] = [attrs[key]]
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
def test_html5_attributes(self):
# The html.parser TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
for input_element, output_unicode, output_element in (
("⇄", '\u21c4', b'⇄'),
('⊧', '\u22a7', b'⊧'),
('𝔑', '\U0001d511', b'𝔑'),
('≧̸', '\u2267\u0338', b'≧̸'),
('¬', '\xac', b'¬'),
('⫬', '\u2aec', b'⫬'),
('"', '"', b'"'),
('∴', '\u2234', b'∴'),
('∴', '\u2234', b'∴'),
('∴', '\u2234', b'∴'),
("fj", 'fj', b'fj'),
("⊔", '\u2294', b'⊔'),
("⊔︀", '\u2294\ufe00', b'⊔︀'),
("'", "'", b"'"),
("|", "|", b"|"),
):
markup = '%s
' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"%s
" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"%s
" % output_element
self.assertEqual(with_element, expect)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)