"""Tests for the HTML sanitizer applied through ``html5lib.serialize``.

NOTE(review): in the text under review the whole module was collapsed onto a
few physical lines and the HTML markup inside the string literals was missing
(leaving ``%`` format strings with fewer placeholders than arguments).  The
markup below was reconstructed so that every format string matches its
arguments -- confirm the literals against the project's canonical copy.
"""
from __future__ import absolute_import, division, unicode_literals

from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer

# Serializer settings shared by the expected and the sanitized side of every
# comparison, so the two strings can only differ because of sanitization.
# Built with dict(...) so the keys are native ``str`` keyword names even with
# ``unicode_literals`` in effect.
_SERIALIZER_OPTIONS = dict(
    omit_optional_tags=False,
    use_trailing_solidus=True,
    space_before_trailing_solidus=False,
    quote_attr_values="always",
    quote_char='"',
    alphabetical_attributes=True,
)

# HTML elements that test_sanitizer skips (marked TODO in the original list).
_SKIPPED_TAGS = (
    'caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody',
    'td', 'tfoot', 'th', 'thead', 'tr', 'select',
)


def _serialize_fragment(markup, **extra_options):
    """Parse *markup* as a fragment and serialize it with the shared options."""
    options = dict(_SERIALIZER_OPTIONS, **extra_options)
    return serialize(parseFragment(markup), **options)


def runSanitizerTest(_, expected, input):
    """Assert that sanitizing *input* gives the normalized form of *expected*.

    The first positional argument is the generated test name; it is not used
    by the check itself.  *expected* is round-tripped through the parser and
    serializer (without sanitizing) so both sides share one canonical form.
    """
    expected = _serialize_fragment(expected)
    assert expected == sanitize_html(input)


def sanitize_html(stream):
    """Return *stream* parsed as a fragment and serialized with ``sanitize=True``."""
    return _serialize_fragment(stream, sanitize=True)


def test_should_handle_astral_plane_characters():
    sanitized = sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
    expected = '<p>\U0001d4b5 \U0001d538</p>'
    assert expected == sanitized


def test_should_allow_relative_uris():
    sanitized = sanitize_html('<p><a href="/example.com"></a></p>')
    expected = '<p><a href="/example.com"></a></p>'
    assert expected == sanitized


def test_invalid_data_uri():
    sanitized = sanitize_html('<audio controls="" src="data:foobar"></audio>')
    expected = '<audio controls></audio>'
    assert expected == sanitized


def test_invalid_ipv6_url():
    sanitized = sanitize_html('<a href="h://]">')
    expected = "<a></a>"
    assert expected == sanitized


def test_data_uri_disallowed_type():
    sanitized = sanitize_html('<audio controls="" src="data:text/html,<html>"></audio>')
    expected = "<audio controls></audio>"
    assert expected == sanitized


def test_sanitizer():
    """Generate ``(runSanitizerTest, name, expected, input)`` cases.

    One case is yielded per allowed HTML-namespace element, per allowed
    un-namespaced lowercase attribute, and per allowed protocol (once as
    listed, once upper-cased).

    NOTE(review): generator-style ("yield") tests are collected by nose and
    by pytest < 4 only -- confirm which runner this suite targets.
    """
    for ns, tag_name in sanitizer.allowed_elements:
        if ns != constants.namespaces["html"]:
            continue
        if tag_name in _SKIPPED_TAGS:
            continue  # TODO
        test_name = "test_should_allow_%s_tag" % tag_name
        # Every variant feeds the same input: the allowed tag wrapping a
        # disallowed <bad> element.
        markup = "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)
        if tag_name == 'image':
            # Expected side is written with <img> rather than <image>.
            expected = "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        elif tag_name == 'br':
            expected = "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>"
        elif tag_name in constants.voidElements:
            expected = "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name
        else:
            expected = ("<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>"
                        % (tag_name, tag_name))
        yield (runSanitizerTest, test_name, expected, markup)

    for ns, attribute_name in sanitizer.allowed_attributes:
        if ns is not None:
            continue
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        if attribute_name in sanitizer.attr_val_is_uri:
            # URI-valued attributes need a value with an allowed scheme.
            attribute_value = ('%s://sub.domain.tld/path/object.ext'
                               % sanitizer.allowed_protocols[0])
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
               % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>"
               % (attribute_name, attribute_value))

    for protocol in sanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        # The original reused the "uppercase" name here although the scheme is
        # passed through as listed; the name is only a label for the runner.
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))

    for protocol in sanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))


def test_lowercase_color_codes_in_style():
    sanitized = sanitize_html("<p style=\"border: 1px solid #a2a2a2;\"></p>")
    expected = '<p style=\"border: 1px solid #a2a2a2;\"></p>'
    assert expected == sanitized


def test_uppercase_color_codes_in_style():
    sanitized = sanitize_html("<p style=\"border: 1px solid #A2A2A2;\"></p>")
    expected = '<p style=\"border: 1px solid #A2A2A2;\"></p>'
    assert expected == sanitized