# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import pickle
import copy
import functools
import unittest
import warnings
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
PYTHON_SPECIFIC_ENCODINGS,
SoupStrainer,
Script,
Stylesheet,
Tag
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string
A tag
A tag that supposedly has contents.
AT&T
This numeric entity is missing the final semicolon:
"""
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
# Verify that the documents come out the same.
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
# Also run some checks on the BeautifulSoup object itself:
# Verify that every tag that was opened was eventually closed.
# There are no tags in the open tag counter.
assert all(v==0 for v in list(obj.open_tag_counter.values()))
# The only tag in the tag stack is the one for the root
# document.
self.assertEqual(
[obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
)
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
"""
earlier = None
for e in element.descendants:
if earlier:
self.assertEqual(e, earlier.next_element)
self.assertEqual(earlier, e.previous_element)
earlier = e
def linkage_validator(self, el, _recursive_call=False):
"""Ensure proper linkage throughout the document."""
descendant = None
# Document element should have no previous element or previous sibling.
# It also shouldn't have a next sibling.
if el.parent is None:
assert el.previous_element is None,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_element, None
)
assert el.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_sibling, None
)
assert el.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_sibling, None
)
idx = 0
child = None
last_child = None
last_idx = len(el.contents) - 1
for child in el.contents:
descendant = None
# Parent should link next element to their first child
# That child should have no previous sibling
if idx == 0:
if el.parent is not None:
assert el.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_element, child
)
assert child.previous_element is el,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
child, child.previous_element, el
)
assert child.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
child, child.previous_sibling, None
)
# If not the first child, previous index should link as sibling to this index
# Previous element should match the last index or the last bubbled up descendant
else:
assert child.previous_sibling is el.contents[idx - 1],\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
child, child.previous_sibling, el.contents[idx - 1]
)
assert el.contents[idx - 1].next_sibling is child,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
)
if last_child is not None:
assert child.previous_element is last_child,\
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
child, child.previous_element, last_child, child.parent.contents
)
assert last_child.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
last_child, last_child.next_element, child
)
if isinstance(child, Tag) and child.contents:
descendant = self.linkage_validator(child, True)
# A bubbled up descendant should have no next siblings
assert descendant.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
descendant, descendant.next_sibling, None
)
# Mark last child as either the bubbled up descendant or the current child
if descendant is not None:
last_child = descendant
else:
last_child = child
# If last child, there are non next siblings
if idx == last_idx:
assert child.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_sibling, None
)
idx += 1
child = descendant if descendant is not None else child
if child is None:
child = el
if not _recursive_call and child is not None:
target = el
while True:
if target is None:
assert child.next_element is None, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, None
)
break
elif target.next_sibling is not None:
assert child.next_element is target.next_sibling, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, target.next_sibling
)
break
target = target.parent
# We are done, so nothing to return
return None
else:
# Return the child to the recursive caller
return child
class TreeBuilderSmokeTest(object):
# Tests that are common to HTML and XML tree builders.
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\nSome CSS"
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
soup = self.soup(
""
)
assert isinstance(soup.style.string, Stylesheet)
# The contents of the style tag resemble an HTML comment, but
# it's not treated as a comment.
self.assertEqual("", soup.style.string)
assert isinstance(soup.style.string, Stylesheet)
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("foo")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
doctype_str
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
doctype = '' % (doctype_string, doctype_fragment)
markup = doctype + '\n
foo
'
soup = self.soup(markup)
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_mixed_case_doctype(self):
# A lowercase or mixed-case doctype becomes a Doctype.
for doctype_fragment in ("doctype", "DocType"):
doctype_str, soup = self._document_with_doctype(
"html", doctype_fragment
)
# Make sure a Doctype object was created and that the DOCTYPE
# is uppercase.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, "html")
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
b""
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""
Hello.
Goodbye.
"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_namespaced_html(self):
"""When a namespaced XML document is parsed as HTML it should
be treated as HTML with weird tag names.
"""
markup = b"""content"""
soup = self.soup(markup)
self.assertEqual(2, len(soup.find_all("ns1:foo")))
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
markup = """"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
markup = b""""""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A
tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("
")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("
", "
")
self.assertSoupEquals("", "")
self.assertSoupEquals(" ", " ")
def test_br_is_always_empty_element_tag(self):
"""A tag is designated as an empty-element tag.
Some parsers treat as one tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup(" ")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), " ")
def test_nested_formatting_elements(self):
self.assertSoupEquals("")
def test_double_head(self):
html = '''
Ordinary HEAD element test
Hello, world!
'''
soup = self.soup(html)
self.assertEqual("text/javascript", soup.find('script')['type'])
def test_comment(self):
# Comments are represented as Comment objects.
markup = "
foobaz
"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in