from __future__ import absolute_import, division, unicode_literals
from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer
def runSanitizerTest(_, expected, input):
    # Shared check used by the generated sanitizer cases.
    #
    # _        -- unused; the generator passes a descriptive label here.
    # expected -- markup describing the wanted result. It is parsed and
    #             re-serialized (without sanitizing) so that it is compared
    #             in serialized form rather than as typed by the test author.
    # input    -- markup handed to sanitize_html().
    #
    # Raises AssertionError when the two serializations differ.
    reference = serialize(
        parseFragment(expected),
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    actual = sanitize_html(input)
    assert reference == actual
def sanitize_html(stream):
    """Parse ``stream`` as a fragment and return it serialized with ``sanitize=True``.

    The remaining serializer options are fixed here so that every test in
    this module compares output produced with the same settings: optional
    tags kept, trailing solidus without a preceding space, attribute values
    always quoted with double quotes, attributes in alphabetical order.
    """
    serializer_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    return serialize(parseFragment(stream), **serializer_options)
def test_should_handle_astral_plane_characters():
# Asserts that sanitize_html() applied to the input literal equals ``expected``.
# NOTE(review): both string literals below run over several physical lines
# inside ordinary (non-triple) quotes, which Python rejects. The text they
# originally held appears to have been lost; restore it from version control
# before relying on this test.
# NOTE(review): the function name and the \U0001d4b5 / \U0001d538 escapes
# suggest the fixture is about characters outside the BMP -- confirm.
sanitized = sanitize_html("
𝒵 𝔸
")
expected = '\U0001d4b5 \U0001d538
'
assert expected == sanitized
def test_should_allow_relative_uris():
# Asserts that sanitize_html() applied to the input literal equals ``expected``.
# NOTE(review): as written, both literals contain nothing but a line break
# inside ordinary quotes, which Python rejects. The function name suggests
# the fixture held markup with a relative URI -- confirm and restore from
# version control.
sanitized = sanitize_html('
')
expected = '
'
assert expected == sanitized
def test_invalid_data_uri():
    # Sanitized output must equal the expected literal.
    # NOTE(review): both literals are empty as written, so this only checks
    # that empty input yields empty output; the function name suggests an
    # invalid data: URI fixture was intended -- confirm against version control.
    result = sanitize_html('')
    assert '' == result
def test_invalid_ipv6_url():
    # Sanitized output must equal the expected literal.
    # NOTE(review): both literals are empty as written, so this only checks
    # that empty input yields empty output; the function name suggests a
    # malformed IPv6 URL fixture was intended -- confirm against version control.
    result = sanitize_html('')
    assert "" == result
def test_data_uri_disallowed_type():
    # Sanitized output must equal the expected literal.
    # NOTE(review): both literals are empty as written, so this only checks
    # that empty input yields empty output; the function name suggests a
    # data: URI with a disallowed content type was intended -- confirm
    # against version control.
    result = sanitize_html('')
    assert "" == result
def test_sanitizer():
# Generator of sanitizer cases. Every yielded tuple has the shape
# (runSanitizerTest, label, expected, input), i.e. the callable followed by
# its three positional arguments in runSanitizerTest's parameter order.
# Presumably consumed by a runner that understands yield-style tests -- confirm.
#
# NOTE(review): several string literals in this function look damaged:
# some run over multiple physical lines inside ordinary quotes (a syntax
# error), and some are %-formatted with more arguments than they have
# placeholders (a TypeError at run time). Restore them from version control;
# the comments below describe only the surrounding control flow.
#
# Part 1: one case per allowed element in the HTML namespace.
for ns, tag_name in sanitizer.allowed_elements:
if ns != constants.namespaces["html"]:
continue
# Table- and select-related elements are skipped for now (see TODO).
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'select']:
continue # TODO
# 'image', 'br' and the other void elements each get their own expected
# string; every remaining element uses the generic form in the else branch.
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"foo <bad>bar</bad> baz",
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name))
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"
foo <bad>bar</bad> baz
",
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name))
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name,
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name))
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo <bad>bar</bad> baz%s>" % (tag_name, tag_name),
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name))
# Part 2: one case per allowed attribute that has no namespace.
for ns, attribute_name in sanitizer.allowed_attributes:
if ns is not None:
continue
# Attribute names containing upper-case letters are skipped for now (see TODO).
if attribute_name != attribute_name.lower():
continue # TODO
# 'style' is excluded from this generic loop.
if attribute_name == 'style':
continue
attribute_value = 'foo'
# URI-valued attributes get a full URL built from the first allowed protocol
# instead of the plain 'foo' value.
if attribute_name in sanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"foo <bad>bar</bad> baz
" % (attribute_name, attribute_value),
"foo bar baz
" % (attribute_name, attribute_value))
# Part 3: one case per allowed protocol, with the protocol name used as-is.
# NOTE(review): the label below says "uppercase" although this loop does not
# upper-case the protocol (only the following loop does) -- looks like a
# copy/paste slip in the label; confirm.
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
# 'data' gets a base64 image/png payload instead of a host/path remainder.
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"foo" % (protocol, rest_of_uri),
"""foo""" % (protocol, rest_of_uri))
# Part 4: same as part 3, but with the protocol name upper-cased.
for protocol in sanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"foo" % (protocol, rest_of_uri),
"""foo""" % (protocol, rest_of_uri))
def test_lowercase_color_codes_in_style():
    # Sanitized output must equal the expected literal.
    # NOTE(review): both literals are empty as written, so this only checks
    # that empty input yields empty output; the function name suggests markup
    # with a lower-case colour code in a style attribute was intended --
    # confirm against version control.
    result = sanitize_html("")
    assert '' == result
def test_uppercase_color_codes_in_style():
    # Sanitized output must equal the expected literal.
    # NOTE(review): both literals are empty as written, so this only checks
    # that empty input yields empty output; the function name suggests markup
    # with an upper-case colour code in a style attribute was intended --
    # confirm against version control.
    result = sanitize_html("")
    assert '' == result